Compare commits
91 Commits
v0.6.0
...
v0.6.1.pos
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
acda0b35d0 | ||
|
|
ba77527955 | ||
|
|
6821020109 | ||
|
|
8427550488 | ||
|
|
3f79bc3d1a | ||
|
|
40c396533d | ||
|
|
5ec9c0fb3c | ||
|
|
8f44a92d85 | ||
|
|
360ddbd37e | ||
|
|
a480939e8e | ||
|
|
d31174a4e1 | ||
|
|
b61bd98f90 | ||
|
|
c16369455f | ||
|
|
019877253b | ||
|
|
551ce01078 | ||
|
|
a6c0f3658d | ||
|
|
f2e263b801 | ||
|
|
1f0c75afa9 | ||
|
|
8a23e93302 | ||
|
|
c6202daeed | ||
|
|
e56bf27741 | ||
|
|
520ca380ae | ||
|
|
7de49aa86c | ||
|
|
42ffba11ad | ||
|
|
295c4730a8 | ||
|
|
1bf2dd9df0 | ||
|
|
5a60699c45 | ||
|
|
b6c75e1cf2 | ||
|
|
b71c956deb | ||
|
|
f842a7aff1 | ||
|
|
a65cb16067 | ||
|
|
3fd2b0d21c | ||
|
|
d394787e52 | ||
|
|
775f00f81e | ||
|
|
8baa454937 | ||
|
|
73202dbe77 | ||
|
|
7015417fd4 | ||
|
|
aea02f30de | ||
|
|
0b952af458 | ||
|
|
3b7fea770f | ||
|
|
cea95dfb94 | ||
|
|
6a512a00df | ||
|
|
efcf946a15 | ||
|
|
1230263e16 | ||
|
|
e497b8aeff | ||
|
|
94144e726c | ||
|
|
1d5e397aa4 | ||
|
|
22f3a4bc6c | ||
|
|
b1f3e18958 | ||
|
|
04e7c4e771 | ||
|
|
5faedf1b62 | ||
|
|
02751a7a42 | ||
|
|
f421f3cefb | ||
|
|
8c054b7a62 | ||
|
|
6234385f4a | ||
|
|
da1a844e61 | ||
|
|
a1d874224d | ||
|
|
6cd5e5b07e | ||
|
|
c7cb5c3335 | ||
|
|
f9b4a2d415 | ||
|
|
58fcc8545a | ||
|
|
08287ef675 | ||
|
|
4ef41b8476 | ||
|
|
cfe712bf1a | ||
|
|
b962ee1470 | ||
|
|
36bf8150cc | ||
|
|
e807125936 | ||
|
|
9f68e00d27 | ||
|
|
ce2702a923 | ||
|
|
795b662cff | ||
|
|
2f707fcb35 | ||
|
|
41e95c5247 | ||
|
|
12dd715807 | ||
|
|
29f49cd6e3 | ||
|
|
23f322297f | ||
|
|
9db52eab3d | ||
|
|
1447c97e75 | ||
|
|
de80783b69 | ||
|
|
e5cab71531 | ||
|
|
baa5467547 | ||
|
|
db3bf7c991 | ||
|
|
2febcf2777 | ||
|
|
2ee45281a5 | ||
|
|
9da25a88aa | ||
|
|
8685ba1a1e | ||
|
|
288a938872 | ||
|
|
e39ebf5cf5 | ||
|
|
ba262c4e5a | ||
|
|
4624d98dbd | ||
|
|
1afc931987 | ||
|
|
e01c2beb7d |
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
|
|||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
commands=$@
|
commands=$@
|
||||||
|
echo "Commands:$commands"
|
||||||
|
#ignore certain kernels tests
|
||||||
|
if [[ $commands == *" kernels "* ]]; then
|
||||||
|
commands="${commands} \
|
||||||
|
--ignore=kernels/test_attention.py \
|
||||||
|
--ignore=kernels/test_attention_selector.py \
|
||||||
|
--ignore=kernels/test_blocksparse_attention.py \
|
||||||
|
--ignore=kernels/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/test_cutlass.py \
|
||||||
|
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||||
|
--ignore=kernels/test_flash_attn.py \
|
||||||
|
--ignore=kernels/test_flashinfer.py \
|
||||||
|
--ignore=kernels/test_int8_quant.py \
|
||||||
|
--ignore=kernels/test_machete_gemm.py \
|
||||||
|
--ignore=kernels/test_mamba_ssm.py \
|
||||||
|
--ignore=kernels/test_marlin_gemm.py \
|
||||||
|
--ignore=kernels/test_moe.py \
|
||||||
|
--ignore=kernels/test_prefix_prefill.py \
|
||||||
|
--ignore=kernels/test_rand.py \
|
||||||
|
--ignore=kernels/test_sampler.py"
|
||||||
|
fi
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
PARALLEL_JOB_COUNT=8
|
||||||
# check if the command contains the shard flag; if so, we run all shards in parallel because the host has 8 GPUs.
|
# check if the command contains the shard flag; if so, we run all shards in parallel because the host has 8 GPUs.
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||||
#replace shard arguments
|
#replace shard arguments
|
||||||
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
|
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||||
|
echo "Shard ${GPU} commands:$commands"
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/kfd --device /dev/dri \
|
--device /dev/kfd --device /dev/dri \
|
||||||
--network host \
|
--network host \
|
||||||
|
|||||||
33
.buildkite/run-cpu-test-ppc64le.sh
Executable file
33
.buildkite/run-cpu-test-ppc64le.sh
Executable file
@@ -0,0 +1,33 @@
|
|||||||
|
# This script builds the CPU docker image and runs the offline inference inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f cpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
|
source /etc/environment
|
||||||
|
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||||
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
|
||||||
|
|
||||||
|
# Run basic model test
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
pip install pytest matplotlib einops transformers_stream_generator
|
||||||
|
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
|
# online inference
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||||
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model facebook/opt-125m \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--tokenizer facebook/opt-125m"
|
||||||
@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
|
|||||||
--ignore=tests/models/test_jamba.py \
|
--ignore=tests/models/test_jamba.py \
|
||||||
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
|
# Run compressed-tensor test
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
pytest -s -v \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
||||||
|
|
||||||
# online inference
|
# online inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
export VLLM_CPU_KVCACHE_SPACE=10
|
export VLLM_CPU_KVCACHE_SPACE=10
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ steps:
|
|||||||
- tests/worker
|
- tests/worker
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s async_engine # Async Engine
|
- pytest -v -s async_engine # Async Engine
|
||||||
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s test_utils.py # Utils
|
- pytest -v -s test_utils.py # Utils
|
||||||
@@ -91,6 +92,7 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/openai
|
- pytest -v -s entrypoints/openai
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
@@ -158,6 +160,7 @@ steps:
|
|||||||
- python3 offline_inference_with_prefix.py
|
- python3 offline_inference_with_prefix.py
|
||||||
- python3 llm_engine_example.py
|
- python3 llm_engine_example.py
|
||||||
- python3 offline_inference_vision_language.py
|
- python3 offline_inference_vision_language.py
|
||||||
|
- python3 offline_inference_vision_language_multi_image.py
|
||||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference_encoder_decoder.py
|
- python3 offline_inference_encoder_decoder.py
|
||||||
|
|
||||||
@@ -216,7 +219,8 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# See https://github.com/vllm-project/vllm/issues/5152
|
# See https://github.com/vllm-project/vllm/issues/5152
|
||||||
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||||
- pytest -v -s spec_decode
|
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 30min each
|
- label: LoRA Test %N # 30min each
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -227,6 +231,7 @@ steps:
|
|||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: Kernels Test %N # 30min each
|
- label: Kernels Test %N # 30min each
|
||||||
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
@@ -368,6 +373,7 @@ steps:
|
|||||||
- label: LoRA Long Context (Distributed) # 11min
|
- label: LoRA Long Context (Distributed) # 11min
|
||||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora/test_long_context
|
- tests/lora/test_long_context
|
||||||
@@ -384,7 +390,18 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/weight_loading
|
- tests/weight_loading
|
||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
|
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 2
|
||||||
|
gpu: a100
|
||||||
|
optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/weight_loading
|
||||||
|
commands:
|
||||||
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|||||||
9
.github/ISSUE_TEMPLATE/400-bug report.yml
vendored
9
.github/ISSUE_TEMPLATE/400-bug report.yml
vendored
@@ -30,6 +30,15 @@ body:
|
|||||||
</details>
|
</details>
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
|
- type: textarea
|
||||||
|
attributes:
|
||||||
|
label: Model Input Dumps
|
||||||
|
description: |
|
||||||
|
If you are experiencing crashes due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support the .pkl file format) and upload it here. This will help us reproduce the issue and facilitate the debugging process.
|
||||||
|
placeholder: |
|
||||||
|
Upload the dumped input file.
|
||||||
|
validations:
|
||||||
|
required: false
|
||||||
- type: textarea
|
- type: textarea
|
||||||
attributes:
|
attributes:
|
||||||
label: 🐛 Describe the bug
|
label: 🐛 Describe the bug
|
||||||
|
|||||||
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
|
|||||||
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
|
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
<h3>Adding or changing kernels</h3>
|
||||||
|
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
|
||||||
|
<ul>
|
||||||
|
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
|
||||||
|
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
|
||||||
|
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
|
||||||
|
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
|
||||||
|
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
<h3>Notes for Large Changes</h3>
|
<h3>Notes for Large Changes</h3>
|
||||||
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
|
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
|
||||||
|
|
||||||
|
|||||||
@@ -181,7 +181,6 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
@@ -196,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# CUTLASS 3.5.1
|
GIT_TAG v3.5.1
|
||||||
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
|
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
|
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
|
||||||
|
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
|
||||||
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(cutlass)
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
@@ -232,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"-gencode arch=compute_90a,code=sm_90a")
|
"-gencode arch=compute_90a,code=sm_90a")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
|
|
||||||
@@ -290,6 +294,12 @@ define_gpu_extension_target(
|
|||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
|
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
|
||||||
|
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
|
||||||
|
# driver API. This causes problems when linking with earlier versions of CUDA.
|
||||||
|
# Setting this variable sidesteps the issue by calling the driver directly.
|
||||||
|
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||||
|
|
||||||
#
|
#
|
||||||
# _moe_C extension
|
# _moe_C extension
|
||||||
#
|
#
|
||||||
|
|||||||
128
CODE_OF_CONDUCT.md
Normal file
128
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
|
||||||
|
# vLLM Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socioeconomic status,
|
||||||
|
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||||
|
identity and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the overall
|
||||||
|
community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or advances of
|
||||||
|
any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email address,
|
||||||
|
without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official email address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline/IRL event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement in the #code-of-conduct
|
||||||
|
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series of
|
||||||
|
actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or permanent
|
||||||
|
ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within the
|
||||||
|
community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
|
||||||
|
version 2.1, available at
|
||||||
|
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by
|
||||||
|
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the
|
||||||
|
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
|
||||||
|
|
||||||
11
Dockerfile
11
Dockerfile
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
|
|||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.12
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
# Install Python and other dependencies
|
# Install Python and other dependencies
|
||||||
@@ -37,7 +37,6 @@ WORKDIR /workspace
|
|||||||
|
|
||||||
# install build and runtime dependencies
|
# install build and runtime dependencies
|
||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-adag.txt requirements-adag.txt
|
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install -r requirements-cuda.txt
|
python3 -m pip install -r requirements-cuda.txt
|
||||||
@@ -66,7 +65,6 @@ COPY setup.py setup.py
|
|||||||
COPY cmake cmake
|
COPY cmake cmake
|
||||||
COPY CMakeLists.txt CMakeLists.txt
|
COPY CMakeLists.txt CMakeLists.txt
|
||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-adag.txt requirements-adag.txt
|
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
COPY pyproject.toml pyproject.toml
|
COPY pyproject.toml pyproject.toml
|
||||||
COPY vllm vllm
|
COPY vllm vllm
|
||||||
@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.12
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
|
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||||
@@ -181,6 +180,10 @@ FROM vllm-base AS test
|
|||||||
ADD . /vllm-workspace/
|
ADD . /vllm-workspace/
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
|
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
|
||||||
|
# This installation must complete before the test dependencies are collected and installed.
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install "setuptools>=74.1.1"
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install -r requirements-dev.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
|||||||
@@ -2,9 +2,14 @@
|
|||||||
|
|
||||||
FROM ubuntu:22.04 AS cpu-test-1
|
FROM ubuntu:22.04 AS cpu-test-1
|
||||||
|
|
||||||
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
|
|
||||||
|
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/var/cache/apt \
|
RUN --mount=type=cache,target=/var/cache/apt \
|
||||||
apt-get update -y \
|
apt-get update -y \
|
||||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
pip install --upgrade pip && \
|
pip install --upgrade pip && \
|
||||||
pip install -r requirements-build.txt
|
pip install -r requirements-build.txt
|
||||||
|
|
||||||
|
# install oneDNN
|
||||||
|
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
|
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
|
||||||
|
-DONEDNN_BUILD_DOC=OFF \
|
||||||
|
-DONEDNN_BUILD_EXAMPLES=OFF \
|
||||||
|
-DONEDNN_BUILD_TESTS=OFF \
|
||||||
|
-DONEDNN_BUILD_GRAPH=OFF \
|
||||||
|
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
|
||||||
|
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
|
||||||
|
cmake --build ./oneDNN/build --target install --config Release
|
||||||
|
|
||||||
FROM cpu-test-1 AS build
|
FROM cpu-test-1 AS build
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
@@ -40,7 +58,6 @@ COPY ./ ./
|
|||||||
ARG VLLM_CPU_DISABLE_AVX512
|
ARG VLLM_CPU_DISABLE_AVX512
|
||||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||||
|
|||||||
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
|
|||||||
RUN echo "Base image is $BASE_IMAGE"
|
RUN echo "Base image is $BASE_IMAGE"
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update \
|
||||||
|
&& apt-get install python3 python3-pip -y \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
### Mount Point ###
|
### Mount Point ###
|
||||||
# When launching the container, mount the code directory to /app
|
# When launching the container, mount the code directory to /app
|
||||||
|
|||||||
@@ -4,7 +4,8 @@
|
|||||||
FROM ubuntu:22.04 AS dev
|
FROM ubuntu:22.04 AS dev
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
RUN apt-get update -y && \
|
||||||
apt-get install -y python3-pip git
|
apt-get install -y python3-pip git && \
|
||||||
|
apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
# copy requirements
|
# copy requirements
|
||||||
|
|||||||
@@ -2,21 +2,26 @@ FROM mambaorg/micromamba
|
|||||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
||||||
|
|
||||||
|
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
# Some packages in requirements-cpu are installed here
|
# Some packages in requirements-cpu are installed here
|
||||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||||
# Currently these may not be available for venv or pip directly
|
# Currently these may not be available for venv or pip directly
|
||||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
|
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
# These packages will be in rocketce eventually
|
# These packages will be in rocketce eventually
|
||||||
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /workspace/
|
||||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
|
|||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
# Install some basic utilities
|
||||||
|
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
# Install the TPU and Pallas dependencies.
|
# Install the TPU and Pallas dependencies.
|
||||||
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||||
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
|
|||||||
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
|
|||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
|
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
include LICENSE
|
include LICENSE
|
||||||
include requirements-adag.txt
|
|
||||||
include requirements-common.txt
|
include requirements-common.txt
|
||||||
include requirements-cuda.txt
|
include requirements-cuda.txt
|
||||||
include requirements-rocm.txt
|
include requirements-rocm.txt
|
||||||
|
|||||||
16
README.md
16
README.md
@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
|
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
|
||||||
|
|
||||||
We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
|
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
|
||||||
Join us to hear the vLLM's recent update about performance.
|
Join us to learn more about recent advancements of vLLM on MI300X.
|
||||||
Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
|
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||||
@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
year={2023}
|
year={2023}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Contact Us
|
||||||
|
|
||||||
|
* For technical questions and feature requests, please use Github issues or discussions.
|
||||||
|
* For discussing with fellow users, please use Discord.
|
||||||
|
* For security disclosures, please use Github's security advisory feature.
|
||||||
|
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||||
@@ -24,6 +24,7 @@ class RequestFuncInput:
|
|||||||
model: str
|
model: str
|
||||||
best_of: int = 1
|
best_of: int = 1
|
||||||
use_beam_search: bool = False
|
use_beam_search: bool = False
|
||||||
|
logprobs: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
|
|||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"best_of": request_func_input.best_of,
|
"best_of": request_func_input.best_of,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
}
|
}
|
||||||
headers = {
|
headers = {
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import torch
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
|
||||||
from vllm.inputs import PromptInputs
|
from vllm.inputs import PromptInputs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@@ -205,13 +205,11 @@ if __name__ == '__main__':
|
|||||||
default=None,
|
default=None,
|
||||||
help=('path to save the pytorch profiler output. Can be visualized '
|
help=('path to save the pytorch profiler output. Can be visualized '
|
||||||
'with ui.perfetto.dev or Tensorboard.'))
|
'with ui.perfetto.dev or Tensorboard.'))
|
||||||
parser.add_argument(
|
parser.add_argument("--device",
|
||||||
"--device",
|
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
choices=DEVICE_OPTIONS,
|
||||||
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
help='device type for vLLM execution')
|
||||||
'CPU.')
|
|
||||||
parser.add_argument('--block-size',
|
parser.add_argument('--block-size',
|
||||||
type=int,
|
type=int,
|
||||||
default=16,
|
default=16,
|
||||||
|
|||||||
@@ -195,8 +195,16 @@ def sample_sonnet_requests(
|
|||||||
|
|
||||||
|
|
||||||
def sample_random_requests(
|
def sample_random_requests(
|
||||||
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
|
prefix_len: int,
|
||||||
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
|
input_len: int,
|
||||||
|
output_len: int,
|
||||||
|
num_prompts: int,
|
||||||
|
range_ratio: float,
|
||||||
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
) -> List[Tuple[str, int, int]]:
|
||||||
|
prefix_token_ids = np.random.randint(0,
|
||||||
|
tokenizer.vocab_size,
|
||||||
|
size=prefix_len).tolist()
|
||||||
|
|
||||||
input_lens = np.random.randint(
|
input_lens = np.random.randint(
|
||||||
int(input_len * range_ratio),
|
int(input_len * range_ratio),
|
||||||
@@ -211,10 +219,12 @@ def sample_random_requests(
|
|||||||
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
||||||
input_requests = []
|
input_requests = []
|
||||||
for i in range(num_prompts):
|
for i in range(num_prompts):
|
||||||
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
|
prompt = tokenizer.decode(prefix_token_ids +
|
||||||
|
[(offsets[i] + i + j) % tokenizer.vocab_size
|
||||||
for j in range(input_lens[i])])
|
for j in range(input_lens[i])])
|
||||||
|
|
||||||
input_requests.append(
|
input_requests.append(
|
||||||
(prompt, int(input_lens[i]), int(output_lens[i])))
|
(prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
|
||||||
|
|
||||||
return input_requests
|
return input_requests
|
||||||
|
|
||||||
@@ -318,6 +328,7 @@ async def benchmark(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_requests: List[Tuple[str, int, int]],
|
input_requests: List[Tuple[str, int, int]],
|
||||||
|
logprobs: Optional[int],
|
||||||
best_of: int,
|
best_of: int,
|
||||||
use_beam_search: bool,
|
use_beam_search: bool,
|
||||||
request_rate: float,
|
request_rate: float,
|
||||||
@@ -339,6 +350,7 @@ async def benchmark(
|
|||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -358,6 +370,7 @@ async def benchmark(
|
|||||||
api_url=base_url + "/start_profile",
|
api_url=base_url + "/start_profile",
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -379,6 +392,7 @@ async def benchmark(
|
|||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
output_len=output_len,
|
output_len=output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -396,6 +410,7 @@ async def benchmark(
|
|||||||
api_url=base_url + "/stop_profile",
|
api_url=base_url + "/stop_profile",
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
elif args.dataset_name == "random":
|
elif args.dataset_name == "random":
|
||||||
input_requests = sample_random_requests(
|
input_requests = sample_random_requests(
|
||||||
|
prefix_len=args.random_prefix_len,
|
||||||
input_len=args.random_input_len,
|
input_len=args.random_input_len,
|
||||||
output_len=args.random_output_len,
|
output_len=args.random_output_len,
|
||||||
num_prompts=args.num_prompts,
|
num_prompts=args.num_prompts,
|
||||||
@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
|
|||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
input_requests=input_requests,
|
input_requests=input_requests,
|
||||||
|
logprobs=args.logprobs,
|
||||||
best_of=args.best_of,
|
best_of=args.best_of,
|
||||||
use_beam_search=args.use_beam_search,
|
use_beam_search=args.use_beam_search,
|
||||||
request_rate=args.request_rate,
|
request_rate=args.request_rate,
|
||||||
@@ -721,6 +738,16 @@ if __name__ == "__main__":
|
|||||||
help=
|
help=
|
||||||
"Number of output tokens per request, used only for sonnet dataset.",
|
"Number of output tokens per request, used only for sonnet dataset.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logprobs",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help=("Number of logprobs-per-token to compute & return as part of "
|
||||||
|
"the request. If unspecified, then either (1) if beam search "
|
||||||
|
"is disabled, no logprobs are computed & a single dummy "
|
||||||
|
"logprob is returned for each token; or (2) if beam search "
|
||||||
|
"is enabled 1 logprob per token is computed"),
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--sonnet-prefix-len",
|
"--sonnet-prefix-len",
|
||||||
type=int,
|
type=int,
|
||||||
@@ -749,6 +776,14 @@ if __name__ == "__main__":
|
|||||||
help="Range of sampled ratio of input/output length, "
|
help="Range of sampled ratio of input/output length, "
|
||||||
"used only for random sampling.",
|
"used only for random sampling.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--random-prefix-len",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Number of fixed prefix tokens before random "
|
||||||
|
" context. The length range of context in a random "
|
||||||
|
" request is [random-prefix-len, "
|
||||||
|
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--request-rate",
|
"--request-rate",
|
||||||
type=float,
|
type=float,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from tqdm import tqdm
|
|||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
PreTrainedTokenizerBase)
|
PreTrainedTokenizerBase)
|
||||||
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
|
||||||
from vllm.entrypoints.openai.api_server import (
|
from vllm.entrypoints.openai.api_server import (
|
||||||
build_async_engine_client_from_engine_args)
|
build_async_engine_client_from_engine_args)
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
@@ -451,13 +451,11 @@ if __name__ == "__main__":
|
|||||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||||
'instead supported for common inference criteria.')
|
'instead supported for common inference criteria.')
|
||||||
parser.add_argument(
|
parser.add_argument("--device",
|
||||||
"--device",
|
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
choices=DEVICE_OPTIONS,
|
||||||
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
help='device type for vLLM execution')
|
||||||
'CPU.')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-scheduler-steps",
|
"--num-scheduler-steps",
|
||||||
type=int,
|
type=int,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Define environment variables for special configurations
|
# Define environment variables for special configurations
|
||||||
@@ -83,12 +84,7 @@ endif()
|
|||||||
|
|
||||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||||
|
|
||||||
list(APPEND LIBS "numa")
|
list(APPEND LIBS dnnl numa)
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# Define extension targets
|
|
||||||
#
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# _C extension
|
# _C extension
|
||||||
@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/cpu/torch_bindings.cpp")
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
|
|
||||||
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
|
set(VLLM_EXT_SRC
|
||||||
|
"csrc/cpu/quant.cpp"
|
||||||
|
${VLLM_EXT_SRC})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Define extension targets
|
||||||
|
#
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_C
|
_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
|
|||||||
@@ -350,6 +350,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
|||||||
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
|
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
|
||||||
${GPU_INCLUDE_DIRECTORIES})
|
${GPU_INCLUDE_DIRECTORIES})
|
||||||
|
|
||||||
|
# TODO: is torch_python_LIBRARY needed?
|
||||||
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
|
||||||
${GPU_LIBRARIES})
|
${GPU_LIBRARIES})
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,8 @@ namespace vec_op {
|
|||||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
#else
|
#else
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||||
std::cout << #NAME << " invoked." << std::endl;
|
RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
|||||||
explicit BF16Vec16(const FP32Vec16 &);
|
explicit BF16Vec16(const FP32Vec16 &);
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||||
|
|
||||||
|
void save(void* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm256_mask_storeu_epi16(ptr, mask, reg);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
|
||||||
|
return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 max(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(_mm512_max_ps(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 abs() const {
|
||||||
|
return FP32Vec16(_mm512_abs_ps(reg));
|
||||||
|
}
|
||||||
|
|
||||||
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
||||||
|
|
||||||
|
float reduce_max() const { return _mm512_reduce_max_ps(reg); }
|
||||||
|
|
||||||
template <int group_size> float reduce_sub_sum(int idx) {
|
template <int group_size> float reduce_sub_sum(int idx) {
|
||||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||||
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
||||||
@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
||||||
|
|
||||||
|
void save(float* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm512_mask_storeu_ps(ptr, mask, reg);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
#else
|
#else
|
||||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||||
@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
struct INT8Vec16: public Vec<INT8Vec16> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 16;
|
||||||
|
union AliasReg {
|
||||||
|
__m128i reg;
|
||||||
|
int8_t values[VEC_ELEM_NUM];
|
||||||
|
};
|
||||||
|
|
||||||
|
__m128i reg;
|
||||||
|
|
||||||
|
explicit INT8Vec16(const FP32Vec16& vec) : reg(
|
||||||
|
_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
|
||||||
|
) {}
|
||||||
|
|
||||||
|
void save(int8_t* ptr) const {
|
||||||
|
_mm_storeu_epi8(ptr, reg);
|
||||||
|
}
|
||||||
|
|
||||||
|
void save(int8_t* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm_mask_storeu_epi8(ptr, mask, reg);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T> struct VecType { using vec_type = void; };
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|||||||
168
csrc/cpu/dnnl_helper.hpp
Normal file
168
csrc/cpu/dnnl_helper.hpp
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
#ifndef DNNL_HELPER_HPP
|
||||||
|
#define DNNL_HELPER_HPP
|
||||||
|
|
||||||
|
#include <c10/util/BFloat16.h>
|
||||||
|
|
||||||
|
#include "oneapi/dnnl/dnnl.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename T>
|
||||||
|
struct DNNLType {
|
||||||
|
static constexpr dnnl::memory::data_type type =
|
||||||
|
dnnl::memory::data_type::undef;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<int8_t> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<int32_t> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<float> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<c10::BFloat16> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
constexpr inline dnnl::memory::data_type get_dnnl_type() {
|
||||||
|
return DNNLType<std::decay_t<T>>::type;
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
template <bool InputNoScale>
|
||||||
|
class DNNLPrimitiveHelper {
|
||||||
|
public:
|
||||||
|
// I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
|
||||||
|
// A: [M, K], row-major
|
||||||
|
// B: [K, N], column-major
|
||||||
|
// C: [M, N], row-major
|
||||||
|
// bias: [N], row-major, optional
|
||||||
|
// a_scales: [MS]
|
||||||
|
// b_scales: [NS]
|
||||||
|
// Note: Due to the limitation of oneDNN
|
||||||
|
// (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
|
||||||
|
// not supported.
|
||||||
|
template <typename OutputT, typename BiasT>
|
||||||
|
static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
|
||||||
|
const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
|
||||||
|
dnnl_dim_t K, const float* a_scales,
|
||||||
|
const float* b_scales, dnnl_dim_t MS,
|
||||||
|
dnnl_dim_t NS) {
|
||||||
|
auto&& OutputType = get_dnnl_type<OutputT>();
|
||||||
|
auto&& BiasType = get_dnnl_type<BiasT>();
|
||||||
|
|
||||||
|
dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
|
||||||
|
dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
|
||||||
|
dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
|
||||||
|
|
||||||
|
dnnl::primitive_attr attr;
|
||||||
|
if constexpr (!InputNoScale) {
|
||||||
|
if (MS == 1) {
|
||||||
|
// per-tensor
|
||||||
|
attr.set_scales_mask(DNNL_ARG_SRC, 0);
|
||||||
|
} else {
|
||||||
|
// per-token
|
||||||
|
TORCH_CHECK(false, "per-token quantization is unsupported.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NS == 1) {
|
||||||
|
// per-tensor
|
||||||
|
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
|
||||||
|
} else {
|
||||||
|
// per-channel
|
||||||
|
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
dnnl::matmul::primitive_desc matmul_pd;
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
|
||||||
|
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
|
||||||
|
bias_md, c_md, attr);
|
||||||
|
} else {
|
||||||
|
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
|
||||||
|
c_md, attr);
|
||||||
|
}
|
||||||
|
dnnl::matmul matmul(matmul_pd);
|
||||||
|
|
||||||
|
auto& engine = default_engine();
|
||||||
|
|
||||||
|
dnnl::memory a_m(a_md, engine, (void*)a);
|
||||||
|
dnnl::memory b_m(b_md, engine, (void*)b);
|
||||||
|
dnnl::memory c_m(c_md, engine, (void*)c);
|
||||||
|
dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
|
||||||
|
(void*)a_scales);
|
||||||
|
dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
|
||||||
|
(void*)b_scales);
|
||||||
|
|
||||||
|
auto& stream = default_stream();
|
||||||
|
if constexpr (InputNoScale) {
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({N}, BiasType, {1});
|
||||||
|
dnnl::memory bias_m(bias_md, engine, (void*)bias);
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_BIAS, bias_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({N}, BiasType, {1});
|
||||||
|
dnnl::memory bias_m(bias_md, engine, (void*)bias);
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_BIAS, bias_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stream.wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
static dnnl::engine& default_engine() {
|
||||||
|
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
|
||||||
|
return engine;
|
||||||
|
}
|
||||||
|
|
||||||
|
static dnnl::stream& default_stream() {
|
||||||
|
static dnnl::stream stream(default_engine());
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
294
csrc/cpu/quant.cpp
Normal file
294
csrc/cpu/quant.cpp
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
#include "cpu_types.hpp"
|
||||||
|
#include "dnnl_helper.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename scalar_t>
|
||||||
|
struct KernelVecType {
|
||||||
|
using load_vec_type = void;
|
||||||
|
using cvt_vec_type = void;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<float> {
|
||||||
|
using load_vec_type = vec_op::FP32Vec16;
|
||||||
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::BFloat16> {
|
||||||
|
using load_vec_type = vec_op::BF16Vec16;
|
||||||
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
template <typename scalar_t>
|
||||||
|
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
const float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
|
||||||
|
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
|
||||||
|
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
|
||||||
|
|
||||||
|
constexpr float i8_min =
|
||||||
|
static_cast<float>(std::numeric_limits<int8_t>::min());
|
||||||
|
constexpr float i8_max =
|
||||||
|
static_cast<float>(std::numeric_limits<int8_t>::max());
|
||||||
|
const cvt_vec_t inv_scale(1.0 / *scale);
|
||||||
|
const cvt_vec_t i8_min_vec(i8_min);
|
||||||
|
const cvt_vec_t i8_max_vec(i8_max);
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < num_tokens; ++i) {
|
||||||
|
int j = 0;
|
||||||
|
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
|
||||||
|
if (j + vec_elem_num == hidden_size) {
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
} else {
|
||||||
|
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
|
||||||
|
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
|
||||||
|
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < num_tokens; ++i) {
|
||||||
|
cvt_vec_t max_abs(0.0);
|
||||||
|
{
|
||||||
|
int j = 0;
|
||||||
|
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
max_abs = max_abs.max(elems_fp32.abs());
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
|
||||||
|
if (j + vec_elem_num == hidden_size) {
|
||||||
|
max_abs = max_abs.max(elems_fp32.abs());
|
||||||
|
} else {
|
||||||
|
max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float scale_val = max_abs.reduce_max() / 127.0f;
|
||||||
|
scale[i] = scale_val;
|
||||||
|
const cvt_vec_t inv_scale(1.0 / scale_val);
|
||||||
|
|
||||||
|
{
|
||||||
|
int j = 0;
|
||||||
|
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
|
||||||
|
if (j + vec_elem_num == hidden_size) {
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
} else {
|
||||||
|
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool Bias, typename scalar_t>
|
||||||
|
void dynamic_output_scale_impl(const float* input, scalar_t* output,
|
||||||
|
const float* scale, const scalar_t* bias,
|
||||||
|
const int num_tokens, const int hidden_size) {
|
||||||
|
CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
|
||||||
|
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
|
||||||
|
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
|
||||||
|
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < num_tokens; ++i) {
|
||||||
|
int j = 0;
|
||||||
|
cvt_vec_t token_scale_vec(scale[i]);
|
||||||
|
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
|
||||||
|
cvt_vec_t elems_fp32(input + i * hidden_size + j);
|
||||||
|
elems_fp32 = elems_fp32 * token_scale_vec;
|
||||||
|
|
||||||
|
if constexpr (Bias) {
|
||||||
|
load_vec_t bias_vec(bias + j);
|
||||||
|
cvt_vec_t bias_vec_fp32(bias_vec);
|
||||||
|
elems_fp32 = elems_fp32 + bias_vec_fp32;
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems_out(elems_fp32);
|
||||||
|
elems_out.save(output + i * hidden_size + j);
|
||||||
|
}
|
||||||
|
|
||||||
|
cvt_vec_t elems_fp32(input + i * hidden_size + j);
|
||||||
|
elems_fp32 = elems_fp32 * token_scale_vec;
|
||||||
|
|
||||||
|
if constexpr (Bias) {
|
||||||
|
load_vec_t bias_vec(bias + j);
|
||||||
|
cvt_vec_t bias_vec_fp32(bias_vec);
|
||||||
|
elems_fp32 = elems_fp32 + bias_vec_fp32;
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems_out(elems_fp32);
|
||||||
|
|
||||||
|
if (j + vec_elem_num == hidden_size) {
|
||||||
|
elems_out.save(output + i * hidden_size + j);
|
||||||
|
} else {
|
||||||
|
elems_out.save(output + i * hidden_size + j, hidden_size - j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template <typename scalar_t>
|
||||||
|
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
const float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void dynamic_output_scale_impl() {
|
||||||
|
TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// int8 x int8 scaled matrix multiply on CPU, writing the result into `c`.
//
// Two paths are taken depending on the activation scale:
//   * per-tensor (a_scales has one element): oneDNN applies both scales and
//     the optional bias inside the matmul and writes `c` directly.
//   * per-token (a_scales has one element per row of `a`): the matmul runs
//     with the weight scales only into a temporary fp32 tensor, and
//     dynamic_output_scale_impl then applies the per-row scale and the
//     optional bias while converting to the dtype of `c`.
//
// Raises (via TORCH_CHECK) when dtypes, shapes, strides or scale sizes do not
// satisfy the checks below.
void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
                    const torch::Tensor& a,         // [M, IC], row-major
                    const torch::Tensor& b,         // [IC, OC], column-major
                    const torch::Tensor& a_scales,  // [1] or [M]
                    const torch::Tensor& b_scales,  // [1] or [OC]
                    const c10::optional<torch::Tensor>& bias  // [OC]
) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)

  // Dtype and shape compatibility.
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm only supports INT8 inputs.")
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));

  // Memory layout.
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());

  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }

  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
    const bool per_tensor_activation = (a_scales.numel() == 1);

    if (per_tensor_activation) {
      // per-tensor: everything is fused into the oneDNN matmul.
      if (bias.has_value()) {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      } else {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            static_cast<void*>(nullptr), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      }
      return;
    }

    // per-token
    // Note: oneDNN doesn't support per-token activation quantization, so the
    // matmul only applies the weight scales and the activation scale is
    // applied afterwards on the fp32 result.
    torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
    DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
        a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
        tmp_fp32_out.data_ptr<float>(), static_cast<void*>(nullptr), a.size(0),
        b.size(1), a.size(1), static_cast<float*>(nullptr),
        b_scales.data_ptr<float>(), 0, b_scales.numel());

    if (bias.has_value()) {
      dynamic_output_scale_impl<true>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
          c.size(1));
    } else {
      dynamic_output_scale_impl<false>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), static_cast<scalar_t*>(nullptr),
          c.size(0), c.size(1));
    }
  });
}
|
||||||
|
|
||||||
|
// static-per-tensor quantization.
|
||||||
|
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||||
|
const torch::Tensor& input, // [..., hidden_size]
|
||||||
|
const torch::Tensor& scale) {
|
||||||
|
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
TORCH_CHECK(scale.numel() == 1);
|
||||||
|
|
||||||
|
const int hidden_size = input.size(-1);
|
||||||
|
const int num_tokens = input.numel() / hidden_size;
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
|
||||||
|
static_scaled_int8_quant_impl(
|
||||||
|
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
|
||||||
|
scale.data_ptr<float>(), num_tokens, hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// dynamic-per-token quantization.
|
||||||
|
void dynamic_scaled_int8_quant(
|
||||||
|
torch::Tensor& out, // [..., hidden_size]
|
||||||
|
const torch::Tensor& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& scale // [..., 1]
|
||||||
|
) {
|
||||||
|
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
|
||||||
|
int const hidden_size = input.size(-1);
|
||||||
|
int const num_tokens = input.numel() / hidden_size;
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
|
||||||
|
dynamic_scaled_int8_quant_impl(
|
||||||
|
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
|
||||||
|
scale.data_ptr<float>(), num_tokens, hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
@@ -4,7 +4,12 @@
|
|||||||
|
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
void init_cpu_threads_env(const std::string& cpu_ids);
|
std::string init_cpu_threads_env(const std::string& cpu_ids);
|
||||||
|
|
||||||
|
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||||
|
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||||
|
const torch::Tensor& b_scales,
|
||||||
|
const c10::optional<torch::Tensor>& bias);
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||||
// vLLM custom ops
|
// vLLM custom ops
|
||||||
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// PagedAttention V2.
|
// PagedAttention V2.
|
||||||
ops.def(
|
ops.def(
|
||||||
"paged_attention_v2("
|
"paged_attention_v2("
|
||||||
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
|
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
|
||||||
" Tensor tmp_out, Tensor query, Tensor key_cache,"
|
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
|
||||||
" Tensor value_cache, int num_kv_heads, float scale,"
|
" Tensor value_cache, int num_kv_heads, float scale,"
|
||||||
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
||||||
" int max_seq_len, Tensor? alibi_slopes,"
|
" int max_seq_len, Tensor? alibi_slopes,"
|
||||||
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
" Tensor! key, int head_size,"
|
" Tensor! key, int head_size,"
|
||||||
" Tensor cos_sin_cache, bool is_neox) -> ()");
|
" Tensor cos_sin_cache, bool is_neox) -> ()");
|
||||||
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
||||||
|
|
||||||
|
// Quantization
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Compute int8 quantized tensor for given scaling factor.
|
||||||
|
ops.def(
|
||||||
|
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
|
||||||
|
// Compute int8 quantized tensor and scaling factor
|
||||||
|
ops.def(
|
||||||
|
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
|
||||||
|
&dynamic_scaled_int8_quant);
|
||||||
|
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
|
// quantization.
|
||||||
|
ops.def(
|
||||||
|
"cutlass_scaled_mm(Tensor! out, Tensor a,"
|
||||||
|
" Tensor b, Tensor a_scales,"
|
||||||
|
" Tensor b_scales, Tensor? bias) -> ()");
|
||||||
|
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||||
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Copy the cache blocks from src to dst.
|
// Copy the cache blocks from src to dst.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
|
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||||
"block_mapping) -> ()");
|
"Tensor block_mapping) -> ()");
|
||||||
cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks);
|
cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks);
|
||||||
|
|
||||||
// Reshape the key and value tensors and cache them.
|
// Reshape the key and value tensors and cache them.
|
||||||
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||||
// CPU utils
|
// CPU utils
|
||||||
utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
|
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
#include "cpu_types.hpp"
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
void init_cpu_threads_env(const std::string& cpu_ids) {
|
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||||
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
|
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
|
||||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||||
std::vector<int> omp_cpu_ids;
|
std::vector<int> omp_cpu_ids;
|
||||||
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
torch::set_num_threads((int)omp_cpu_ids.size());
|
torch::set_num_threads((int)omp_cpu_ids.size());
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
|
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
|
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> thread_core_mapping;
|
||||||
|
thread_core_mapping.reserve(omp_cpu_ids.size());
|
||||||
|
omp_lock_t writelock;
|
||||||
|
omp_init_lock(&writelock);
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
#pragma omp parallel for schedule(static, 1)
|
||||||
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
|
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
|
||||||
cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
|
cpu_set_t mask;
|
||||||
size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
|
CPU_ZERO(&mask);
|
||||||
CPU_ZERO_S(size, mask);
|
CPU_SET(omp_cpu_ids[i], &mask);
|
||||||
CPU_SET_S(omp_cpu_ids[i], size, mask);
|
int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
|
||||||
sched_setaffinity(0, sizeof(cpu_set_t), mask);
|
if (ret == -1) {
|
||||||
CPU_FREE(mask);
|
TORCH_CHECK(false,
|
||||||
|
"sched_setaffinity failed. errno: " + std::to_string(errno));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
omp_set_lock(&writelock);
|
||||||
|
thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
|
||||||
|
omp_unset_lock(&writelock);
|
||||||
|
}
|
||||||
|
|
||||||
|
omp_destroy_lock(&writelock);
|
||||||
|
|
||||||
numa_free_nodemask(omp_cpu_mask);
|
numa_free_nodemask(omp_cpu_mask);
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "OMP threads binding of Process " << getpid() << ":\n";
|
||||||
|
std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
|
||||||
|
[](auto&& a, auto&& b) { return a.second < b.second; });
|
||||||
|
for (auto&& item : thread_core_mapping) {
|
||||||
|
ss << "\t"
|
||||||
|
<< "OMP tid: " << item.first << ", core " << item.second << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
"g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
|
"g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
|
||||||
"size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
|
"size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
|
||||||
"bool replicate_input, bool apply_weights) -> Tensor");
|
"bool replicate_input, bool apply_weights) -> Tensor");
|
||||||
|
|
||||||
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
|
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
26
csrc/ops.h
26
csrc/ops.h
@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
|||||||
|
|
||||||
void gelu_quick(torch::Tensor& out, torch::Tensor& input);
|
void gelu_quick(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
|
||||||
|
int64_t block_size, torch::Tensor& input_tokens,
|
||||||
|
torch::Tensor& sampled_token_ids,
|
||||||
|
torch::Tensor& input_positions,
|
||||||
|
torch::Tensor& seq_lens,
|
||||||
|
torch::Tensor& slot_mapping,
|
||||||
|
torch::Tensor& block_tables);
|
||||||
|
|
||||||
|
void advance_step_flashinfer(
|
||||||
|
int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables);
|
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
|
||||||
|
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||||
|
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||||
@@ -123,9 +134,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
int64_t size_k, int64_t size_n,
|
int64_t size_k, int64_t size_n,
|
||||||
int64_t num_bits);
|
int64_t num_bits);
|
||||||
|
|
||||||
|
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
torch::Tensor& perm, c10::SymInt size_k,
|
||||||
|
c10::SymInt size_n, int64_t num_bits);
|
||||||
|
|
||||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
||||||
int64_t size_n, int64_t num_bits);
|
int64_t size_n, int64_t num_bits);
|
||||||
|
|
||||||
|
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
c10::SymInt size_k, c10::SymInt size_n,
|
||||||
|
int64_t num_bits);
|
||||||
|
|
||||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||||
int64_t n);
|
int64_t n);
|
||||||
|
|
||||||
@@ -170,9 +189,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
|||||||
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
torch::Tensor& scales);
|
torch::Tensor& scales);
|
||||||
|
|
||||||
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
|
||||||
torch::Tensor lookup_table);
|
|
||||||
|
|
||||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||||
torch::Tensor b_gptq_qzeros,
|
torch::Tensor b_gptq_qzeros,
|
||||||
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
||||||
|
|||||||
@@ -12,12 +12,10 @@ namespace prepare_inputs {
|
|||||||
|
|
||||||
//
|
//
|
||||||
template <int const num_threads>
|
template <int const num_threads>
|
||||||
__global__ void advance_step_kernel(int num_seqs, int num_queries,
|
__global__ void advance_step_flashattn_kernel(
|
||||||
int block_size, long* input_tokens_ptr,
|
int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
|
||||||
long const* sampled_token_ids_ptr,
|
long const* sampled_token_ids_ptr, long* input_positions_ptr,
|
||||||
long* input_positions_ptr,
|
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
|
||||||
int* seq_lens_ptr, long* slot_mapping_ptr,
|
|
||||||
int const* block_tables_ptr,
|
|
||||||
int64_t const block_tables_stride) {
|
int64_t const block_tables_stride) {
|
||||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||||
|
|
||||||
@@ -79,7 +77,82 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void advance_step(int num_seqs, int num_queries, int block_size,
|
// Advances every query by one decode step. One thread handles one query and
// updates, for that query:
//   input_tokens            <- sampled token id
//   seq_lens                <- previous length + 1
//   input_positions         <- new length - 1
//   paged_kv_last_page_len  <- offset of the new position in its block + 1
//   slot_mapping            <- block_table[block index] * block_size + offset
//   block_table_bound       <- div_ceil(new length, block_size)
// `num_threads` is the per-block thread count used to derive the query id
// from blockIdx.x / threadIdx.x; threads whose id is >= num_queries do
// nothing.
__global__ void advance_step_flashinfer_kernel(
    int num_threads, int num_seqs, int num_queries, int block_size,
    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
    int const* block_tables_ptr, int64_t const block_tables_stride,
    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
  int query_block_count = div_ceil(num_queries, num_threads);

  // Blocks beyond the ones needed to cover all queries have no work.
  if (!(blockIdx.x < query_block_count)) {
    return;
  }

  int query_id = blockIdx.x * num_threads + threadIdx.x;
  if (query_id >= num_queries) {
    return;
  }

  // Update input_tokens
  input_tokens_ptr[query_id] = sampled_token_ids_ptr[query_id];

  int new_seq_len = seq_lens_ptr[query_id] + 1;
  int new_input_pos = new_seq_len - 1;

  // Update seq_lens
  seq_lens_ptr[query_id] = new_seq_len;
  // Update input_positions
  input_positions_ptr[query_id] = new_input_pos;

  // Row of the block table that belongs to this query.
  int const* query_block_table =
      block_tables_ptr + block_tables_stride * query_id;

  int page_index = new_input_pos / block_size;
  int offset_in_page = new_input_pos % block_size;

  // Update paged_kv_last_page_len
  paged_kv_last_page_len_ptr[query_id] = offset_in_page + 1;

  // Update slot_mapping
  int slot = query_block_table[page_index] * block_size + offset_in_page;
  slot_mapping_ptr[query_id] = slot;

  // Number of block-table entries in use by this query after the step.
  block_table_bound_ptr[query_id] = div_ceil(new_seq_len, block_size);
}
|
||||||
|
|
||||||
|
// Fills paged_kv_indptr[1 .. num_queries] with the inclusive running sum of
// block_table_bound: entry idx + 1 receives the sum of
// block_table_bound[0 .. idx]. Each thread recomputes its own sum from the
// start; entry 0 of paged_kv_indptr is not written here.
__global__ void advance_step_flashinfer_indptr_kernel(
    int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
    int* block_table_bound_ptr) {
  int query_id = blockIdx.x * num_threads + threadIdx.x;

  // Threads past the last query have nothing to write.
  if (query_id >= num_queries) {
    return;
  }

  // Update paged_kv_indptr
  int running_total = 0;
  int k = 0;
  while (k <= query_id) {
    running_total += block_table_bound_ptr[k];
    ++k;
  }
  paged_kv_indptr_ptr[query_id + 1] = running_total;
}
|
||||||
|
|
||||||
|
// Copies the in-use block-table entries of each query into the flat
// paged_kv_indices array. Every thread maps to one (row, col) cell of the
// block table, derived from its flat index and block_tables_stride; cells
// with col below block_table_bound[row] are copied to
// paged_kv_indices[paged_kv_indptr[row] + col].
// Rows in (num_queries, num_seqs] additionally get their paged_kv_indptr
// entry set to paged_kv_indptr[num_queries].
__global__ void advance_step_flashinfer_indices_kernel(
    int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
    int64_t const block_tables_stride, int* paged_kv_indices_ptr,
    int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
  int flat_idx = blockIdx.x * num_threads + threadIdx.x;
  int row = flat_idx / block_tables_stride;
  int col = flat_idx % block_tables_stride;

  // block_table_bound is only read for rows that belong to a real query.
  bool const is_live_entry =
      row < num_queries && col < block_table_bound_ptr[row];
  if (is_live_entry) {
    int const dst = paged_kv_indptr_ptr[row] + col;
    paged_kv_indices_ptr[dst] =
        block_tables_ptr[row * block_tables_stride + col];
  }

  // if cudagraph, fill padded seqs with the last valid seq's indptr
  bool const is_padded_row = row > num_queries && row <= num_seqs;
  if (is_padded_row) {
    paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
  }
}
|
||||||
|
|
||||||
|
void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
|
||||||
torch::Tensor& input_tokens, // type: long
|
torch::Tensor& input_tokens, // type: long
|
||||||
torch::Tensor& sampled_token_ids, // type: long
|
torch::Tensor& sampled_token_ids, // type: long
|
||||||
torch::Tensor& input_positions, // type: long
|
torch::Tensor& input_positions, // type: long
|
||||||
@@ -88,7 +161,7 @@ void advance_step(int num_seqs, int num_queries, int block_size,
|
|||||||
torch::Tensor& block_tables) { // type: int
|
torch::Tensor& block_tables) { // type: int
|
||||||
|
|
||||||
if (logging) {
|
if (logging) {
|
||||||
printf("advance_step:\n");
|
printf("advance_step_flashattn:\n");
|
||||||
printf(" num_seqs = %d\n", num_seqs);
|
printf(" num_seqs = %d\n", num_seqs);
|
||||||
printf(" num_queries = %d\n", num_queries);
|
printf(" num_queries = %d\n", num_queries);
|
||||||
printf(" block_size = %d\n", block_size);
|
printf(" block_size = %d\n", block_size);
|
||||||
@@ -108,7 +181,8 @@ void advance_step(int num_seqs, int num_queries, int block_size,
|
|||||||
int blocks;
|
int blocks;
|
||||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||||
|
|
||||||
advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
|
advance_step_flashattn_kernel<max_threads>
|
||||||
|
<<<blocks, max_threads, 0, stream>>>(
|
||||||
num_seqs, num_queries, block_size,
|
num_seqs, num_queries, block_size,
|
||||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||||
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||||
@@ -119,13 +193,114 @@ void advance_step(int num_seqs, int num_queries, int block_size,
|
|||||||
block_tables.stride(0));
|
block_tables.stride(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void advance_step_flashinfer(
|
||||||
|
int num_seqs, int num_queries, int block_size,
|
||||||
|
torch::Tensor& input_tokens, // type: long
|
||||||
|
torch::Tensor& sampled_token_ids, // type: long
|
||||||
|
torch::Tensor& input_positions, // type: long
|
||||||
|
torch::Tensor& seq_lens, // type: int
|
||||||
|
torch::Tensor& slot_mapping, // type: long
|
||||||
|
torch::Tensor& block_tables, // type: int
|
||||||
|
torch::Tensor& paged_kv_indices, // type: int
|
||||||
|
torch::Tensor& paged_kv_indptr, // type: int
|
||||||
|
torch::Tensor& paged_kv_last_page_len, // type: int
|
||||||
|
torch::Tensor& block_table_bound) { // type: int
|
||||||
|
|
||||||
|
if (logging) {
|
||||||
|
printf("advance_step_flashinfer:\n");
|
||||||
|
printf(" num_seqs = %d\n", num_seqs);
|
||||||
|
printf(" num_queries = %d\n", num_queries);
|
||||||
|
printf(" block_size = %d\n", block_size);
|
||||||
|
printf(" block_tables.stride(0) = %d\n", block_tables.stride(0));
|
||||||
|
}
|
||||||
|
// Verify all tensors
|
||||||
|
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
||||||
|
// verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
|
||||||
|
// at::kLong);
|
||||||
|
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
|
||||||
|
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
|
||||||
|
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
|
||||||
|
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
|
||||||
|
|
||||||
|
verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
|
||||||
|
verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
|
||||||
|
verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
|
||||||
|
at::kInt);
|
||||||
|
|
||||||
|
verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
|
||||||
|
|
||||||
|
int dev = sampled_token_ids.get_device();
|
||||||
|
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
|
||||||
|
|
||||||
|
int blocks;
|
||||||
|
int threads;
|
||||||
|
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||||
|
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
|
||||||
|
if (logging) {
|
||||||
|
printf("launching kernel with %d blocks\n", blocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(will): support arbitrary block_tables stride
|
||||||
|
if ((blocks * threads) / block_tables.stride(0) < num_queries) {
|
||||||
|
TORCH_CHECK(false,
|
||||||
|
"multi-step: not enough threads to map block_table to"
|
||||||
|
"FlashInfer's paged_kv_indices on GPU. Try reducing the number "
|
||||||
|
"of seqs,",
|
||||||
|
" increasing the block size or take smaller steps.",
|
||||||
|
" num_queries = ", num_queries,
|
||||||
|
" block_tables.stride(0) = ", block_tables.stride(0),
|
||||||
|
" blocks = ", blocks, " max_threads = ", threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
|
||||||
|
threads, num_seqs, num_queries, block_size,
|
||||||
|
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||||
|
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||||
|
reinterpret_cast<long*>(input_positions.data_ptr()),
|
||||||
|
reinterpret_cast<int*>(seq_lens.data_ptr()),
|
||||||
|
reinterpret_cast<long*>(slot_mapping.data_ptr()),
|
||||||
|
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||||
|
block_tables.stride(0),
|
||||||
|
reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
|
||||||
|
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||||
|
|
||||||
|
advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
|
||||||
|
threads, num_seqs, num_queries,
|
||||||
|
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||||
|
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||||
|
|
||||||
|
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
|
||||||
|
threads, num_seqs, num_queries,
|
||||||
|
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||||
|
block_tables.stride(0),
|
||||||
|
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
|
||||||
|
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||||
|
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace prepare_inputs
|
} // namespace prepare_inputs
|
||||||
|
|
||||||
void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
|
||||||
|
int64_t block_size, torch::Tensor& input_tokens,
|
||||||
|
torch::Tensor& sampled_token_ids,
|
||||||
|
torch::Tensor& input_positions,
|
||||||
|
torch::Tensor& seq_lens,
|
||||||
|
torch::Tensor& slot_mapping,
|
||||||
|
torch::Tensor& block_tables) {
|
||||||
|
prepare_inputs::advance_step_flashattn(
|
||||||
|
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||||
|
input_positions, seq_lens, slot_mapping, block_tables);
|
||||||
|
}
|
||||||
|
|
||||||
|
void advance_step_flashinfer(
|
||||||
|
int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
|
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
|
||||||
prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
|
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||||
sampled_token_ids, input_positions, seq_lens,
|
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
|
||||||
slot_mapping, block_tables);
|
prepare_inputs::advance_step_flashinfer(
|
||||||
|
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||||
|
input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
|
||||||
|
paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
|
||||||
}
|
}
|
||||||
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
c10::SymInt size_k, c10::SymInt size_n,
|
||||||
|
int64_t num_bits) {
|
||||||
|
int const pack_factor = 32 / num_bits;
|
||||||
|
auto options = torch::TensorOptions()
|
||||||
|
.dtype(b_q_weight.dtype())
|
||||||
|
.device(b_q_weight.device());
|
||||||
|
return torch::empty_symint(
|
||||||
|
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
|||||||
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
torch::Tensor& perm, c10::SymInt size_k,
|
||||||
|
c10::SymInt size_n, int64_t num_bits) {
|
||||||
|
int const pack_factor = 32 / num_bits;
|
||||||
|
auto options = torch::TensorOptions()
|
||||||
|
.dtype(b_q_weight.dtype())
|
||||||
|
.device(b_q_weight.device());
|
||||||
|
return torch::empty_symint(
|
||||||
|
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,216 +0,0 @@
|
|||||||
#include <torch/all.h>
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <cuda_fp16.h>
|
|
||||||
|
|
||||||
// half-tensor
|
|
||||||
#include <c10/cuda/CUDAStream.h>
|
|
||||||
#include <ATen/cuda/CUDATensorMethods.cuh>
|
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
|
||||||
|
|
||||||
#define BLOCKWIDTH 128
|
|
||||||
#define BLOCKHEIGHT4 16
|
|
||||||
|
|
||||||
namespace vllm {
|
|
||||||
namespace squeezellm {
|
|
||||||
|
|
||||||
__device__ inline unsigned int as_unsigned(int i) {
|
|
||||||
return *reinterpret_cast<unsigned int*>(&i);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4-bit matvec kernel (LUT-based)
|
|
||||||
__global__ void NUQ4MatMulKernel(
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
const half2* __restrict__ vec,
|
|
||||||
#else
|
|
||||||
const __half2* __restrict__ vec,
|
|
||||||
#endif
|
|
||||||
const int* __restrict__ mat,
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
half2* __restrict__ mul,
|
|
||||||
#else
|
|
||||||
float2* __restrict__ mul,
|
|
||||||
#endif
|
|
||||||
const __half* __restrict__ lookup_table, int height, int width, int batch,
|
|
||||||
int vec_height) {
|
|
||||||
|
|
||||||
const int blockwidth2 = BLOCKWIDTH / 2;
|
|
||||||
|
|
||||||
int row = BLOCKHEIGHT4 * blockIdx.x;
|
|
||||||
int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
__shared__ half2 blockvec[blockwidth2];
|
|
||||||
#else
|
|
||||||
__shared__ __half2 blockvec[blockwidth2];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
__shared__ __half deq2[16][BLOCKWIDTH];
|
|
||||||
int off = threadIdx.x;
|
|
||||||
int column_offset = col * 16;
|
|
||||||
for (int val = 0; val < 16; val += 1) {
|
|
||||||
int lut_index = column_offset + val;
|
|
||||||
deq2[val][off] = lookup_table[lut_index];
|
|
||||||
}
|
|
||||||
|
|
||||||
__half res;
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
half2 res2;
|
|
||||||
half2 tmp2;
|
|
||||||
#else
|
|
||||||
__half2 res2;
|
|
||||||
__half2 tmp2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int i;
|
|
||||||
int k;
|
|
||||||
|
|
||||||
unsigned int tmp1;
|
|
||||||
unsigned int lut_index1, lut_index2;
|
|
||||||
|
|
||||||
for (int b = 0; b < batch; ++b) {
|
|
||||||
i = width * row + col;
|
|
||||||
res = __int2half_rd(0);
|
|
||||||
k = 0;
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
if (threadIdx.x < blockwidth2)
|
|
||||||
blockvec[threadIdx.x] =
|
|
||||||
vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
|
|
||||||
threadIdx.x];
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
while (k < blockwidth2) {
|
|
||||||
tmp1 = as_unsigned(mat[i]);
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
res2 = {};
|
|
||||||
tmp2 = {};
|
|
||||||
#else
|
|
||||||
res2.x = __half_as_ushort(__float2half(0));
|
|
||||||
res2.y = __half_as_ushort(__float2half(0));
|
|
||||||
tmp2.x = __half_as_ushort(__float2half(0));
|
|
||||||
tmp2.y = __half_as_ushort(__float2half(0));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
lut_index1 = tmp1 & 0xF;
|
|
||||||
lut_index2 = (tmp1 >> 4) & 0xF;
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
tmp2.x = deq2[lut_index1][off];
|
|
||||||
tmp2.y = deq2[lut_index2][off];
|
|
||||||
#else
|
|
||||||
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
|
|
||||||
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
|
|
||||||
#endif
|
|
||||||
res2 = __hfma2(tmp2, blockvec[k + 0], res2);
|
|
||||||
|
|
||||||
lut_index1 = (tmp1 >> 8) & 0xF;
|
|
||||||
lut_index2 = (tmp1 >> 12) & 0xF;
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
tmp2.x = deq2[lut_index1][off];
|
|
||||||
tmp2.y = deq2[lut_index2][off];
|
|
||||||
#else
|
|
||||||
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
|
|
||||||
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
|
|
||||||
#endif
|
|
||||||
res2 = __hfma2(tmp2, blockvec[k + 1], res2);
|
|
||||||
|
|
||||||
lut_index1 = (tmp1 >> 16) & 0xF;
|
|
||||||
lut_index2 = (tmp1 >> 20) & 0xF;
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
tmp2.x = deq2[lut_index1][off];
|
|
||||||
tmp2.y = deq2[lut_index2][off];
|
|
||||||
#else
|
|
||||||
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
|
|
||||||
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
|
|
||||||
#endif
|
|
||||||
res2 = __hfma2(tmp2, blockvec[k + 2], res2);
|
|
||||||
|
|
||||||
lut_index1 = (tmp1 >> 24) & 0xF;
|
|
||||||
lut_index2 = (tmp1 >> 28) & 0xF;
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
tmp2.x = deq2[lut_index1][off];
|
|
||||||
tmp2.y = deq2[lut_index2][off];
|
|
||||||
#else
|
|
||||||
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
|
|
||||||
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
|
|
||||||
#endif
|
|
||||||
res2 = __hfma2(tmp2, blockvec[k + 3], res2);
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
res = __hadd(__hadd(res2.x, res2.y), res);
|
|
||||||
#else
|
|
||||||
res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
|
|
||||||
res);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
i += width;
|
|
||||||
k += 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
// col%2 -> only set one of the two values
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
half2 res3 = {};
|
|
||||||
if (col % 2 == 0) {
|
|
||||||
res3.x = res;
|
|
||||||
} else {
|
|
||||||
res3.y = res;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
__half2 res3;
|
|
||||||
res3.x = __half_as_ushort(__float2half(0));
|
|
||||||
res3.y = __half_as_ushort(__float2half(0));
|
|
||||||
if (col % 2 == 0) {
|
|
||||||
res3.x = __half_as_ushort(res);
|
|
||||||
} else {
|
|
||||||
res3.y = __half_as_ushort(res);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
atomicAdd(&mul[b * width / 2 + col / 2], res3);
|
|
||||||
#else
|
|
||||||
int tmp_addr = b * width / 2 + col / 2;
|
|
||||||
atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
|
|
||||||
atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace squeezellm
|
|
||||||
} // namespace vllm
|
|
||||||
|
|
||||||
// 4-bit matvec kernel (LUT-based)
|
|
||||||
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
|
||||||
torch::Tensor lookup_table) {
|
|
||||||
int height = mat.size(0);
|
|
||||||
int width = mat.size(1);
|
|
||||||
|
|
||||||
int batch = vec.size(0);
|
|
||||||
int vec_height = vec.size(1);
|
|
||||||
|
|
||||||
dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
|
|
||||||
(width + BLOCKWIDTH - 1) / BLOCKWIDTH);
|
|
||||||
dim3 threads(BLOCKWIDTH);
|
|
||||||
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
(half2*)vec.data_ptr<at::Half>(),
|
|
||||||
#else
|
|
||||||
(__half2*)vec.data_ptr<at::Half>(),
|
|
||||||
#endif
|
|
||||||
mat.data_ptr<int>(),
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
(half2*)mul.data_ptr<at::Half>(),
|
|
||||||
(__half*)lookup_table.data_ptr<at::Half>(),
|
|
||||||
#else
|
|
||||||
(float2*)mul.data_ptr<float>(),
|
|
||||||
(__half*)lookup_table.data_ptr<at::Half>(),
|
|
||||||
#endif
|
|
||||||
height, width, batch, vec_height);
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef BLOCKWIDTH
|
|
||||||
#undef BLOCKHEIGHT4
|
|
||||||
@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// PagedAttention V2.
|
// PagedAttention V2.
|
||||||
ops.def(
|
ops.def(
|
||||||
"paged_attention_v2("
|
"paged_attention_v2("
|
||||||
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
|
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
|
||||||
" Tensor tmp_out, Tensor query, Tensor key_cache,"
|
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
|
||||||
" Tensor value_cache, int num_kv_heads, float scale,"
|
" Tensor value_cache, int num_kv_heads, float scale,"
|
||||||
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
||||||
" int max_seq_len, Tensor? alibi_slopes,"
|
" int max_seq_len, Tensor? alibi_slopes,"
|
||||||
@@ -73,8 +73,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
|
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
|
||||||
|
|
||||||
// prepare_inputs advance_step
|
// prepare_inputs advance_step
|
||||||
ops.def("advance_step", &advance_step);
|
ops.def(
|
||||||
ops.impl("advance_step", torch::kCUDA, &advance_step);
|
"advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
|
||||||
|
"Tensor! input_tokens, Tensor sampled_token_ids, "
|
||||||
|
"Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
|
||||||
|
"Tensor block_tables) -> ()");
|
||||||
|
ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
|
||||||
|
|
||||||
|
ops.def(
|
||||||
|
"advance_step_flashinfer("
|
||||||
|
" int num_seqs, int num_queries, int block_size,"
|
||||||
|
" Tensor! input_tokens, Tensor sampled_token_ids,"
|
||||||
|
" Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
|
||||||
|
" Tensor block_tables, Tensor! paged_kv_indices,"
|
||||||
|
" Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
|
||||||
|
" Tensor! block_table_bounds"
|
||||||
|
") -> ()");
|
||||||
|
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
|
||||||
|
|
||||||
// Layernorm
|
// Layernorm
|
||||||
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
||||||
@@ -110,27 +125,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// Quantization ops
|
// Quantization ops
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
// Quantized GEMM for AQLM.
|
// Quantized GEMM for AQLM.
|
||||||
ops.def("aqlm_gemm", &aqlm_gemm);
|
ops.def(
|
||||||
|
"aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
|
||||||
|
"Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
|
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
|
||||||
|
|
||||||
// Decompression method for AQLM.
|
// Decompression method for AQLM.
|
||||||
ops.def("aqlm_dequant", &aqlm_dequant);
|
ops.def(
|
||||||
|
"aqlm_dequant(Tensor codes, Tensor codebooks, "
|
||||||
|
"int[] codebook_partition_sizes) -> Tensor");
|
||||||
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
|
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
|
||||||
|
|
||||||
// Quantized GEMM for AWQ.
|
// Quantized GEMM for AWQ.
|
||||||
ops.def("awq_gemm", &awq_gemm);
|
ops.def(
|
||||||
|
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
|
||||||
|
"Tensor _zeros, int split_k_iters) -> Tensor");
|
||||||
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
|
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
|
||||||
|
|
||||||
// Dequantization for AWQ.
|
// Dequantization for AWQ.
|
||||||
ops.def("awq_dequantize", &awq_dequantize);
|
ops.def(
|
||||||
|
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
|
||||||
|
"Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
|
||||||
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
|
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
|
||||||
|
|
||||||
|
// Note about marlin kernel 'workspace' arguments:
|
||||||
|
// Technically these should be mutable since they are modified by the kernel.
|
||||||
|
// But since they are set back to zero once the kernel is finished we can
|
||||||
|
// hand wave and say that they have no net effect.
|
||||||
|
//
|
||||||
|
// The reason to mark 'workspace' as immutable is so that they don't interfere
|
||||||
|
// with using ScalarType arguments in the ops. If they are marked as mutable,
|
||||||
|
// pytorch throws an assert in
|
||||||
|
// 'torch._higher_order_ops._register_effectful_op' that prevents these
|
||||||
|
// kernels from being torch.compile'd.
|
||||||
|
// See the following document for more info on custom types and ops that use
|
||||||
|
// custom types:
|
||||||
|
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
|
||||||
|
|
||||||
// Marlin (Dense) Optimized Quantized GEMM for GPTQ.
|
// Marlin (Dense) Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("marlin_gemm", &marlin_gemm);
|
ops.def(
|
||||||
|
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
|
||||||
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
|
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
|
||||||
|
|
||||||
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
|
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
|
ops.def(
|
||||||
|
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
|
||||||
|
"Tensor b_scales, Tensor workspace, "
|
||||||
|
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
|
||||||
|
"int size_m, int size_n, int size_k) -> Tensor");
|
||||||
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
|
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
|
||||||
|
|
||||||
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
|
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
|
||||||
@@ -149,35 +193,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
|
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
|
||||||
|
|
||||||
// gptq_marlin Optimized Quantized GEMM for GPTQ.
|
// gptq_marlin Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
|
ops.def(
|
||||||
|
"gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
|
||||||
|
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
|
||||||
|
"int size_m, int size_n, int size_k, bool is_k_full, "
|
||||||
|
"bool has_zp, bool use_fp32_reduce) -> Tensor");
|
||||||
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
|
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
|
||||||
|
|
||||||
// gptq_marlin repack from GPTQ.
|
// gptq_marlin repack from GPTQ.
|
||||||
ops.def("gptq_marlin_repack", &gptq_marlin_repack);
|
ops.def(
|
||||||
|
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
|
||||||
|
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
|
||||||
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
||||||
|
ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
|
||||||
|
|
||||||
// awq_marlin repack from AWQ.
|
// awq_marlin repack from AWQ.
|
||||||
ops.def("awq_marlin_repack", &awq_marlin_repack);
|
ops.def(
|
||||||
|
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
|
||||||
|
"SymInt size_n, int num_bits) -> Tensor");
|
||||||
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
|
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
|
||||||
|
ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
|
||||||
|
|
||||||
// Dequantization for GGML.
|
// Dequantization for GGML.
|
||||||
ops.def("ggml_dequantize", &ggml_dequantize);
|
ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
|
||||||
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
||||||
|
|
||||||
// mmvq kernel for GGML.
|
// mmvq kernel for GGML.
|
||||||
ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
|
ops.def(
|
||||||
|
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
|
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
|
||||||
|
|
||||||
// mmq kernel for GGML.
|
// mmq kernel for GGML.
|
||||||
ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
|
ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
|
||||||
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
|
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
|
||||||
|
|
||||||
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
|
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
|
||||||
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
|
ops.def(
|
||||||
|
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor! workspace, int num_bits, int size_m, int size_n, "
|
||||||
|
"int size_k) -> Tensor");
|
||||||
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
|
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
|
||||||
|
|
||||||
// marlin_qqq_gemm for QQQ.
|
// marlin_qqq_gemm for QQQ.
|
||||||
ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
|
ops.def(
|
||||||
|
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
|
||||||
|
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
|
||||||
|
"Tensor! workspace, int size_m, int size_n, "
|
||||||
|
"int size_k) -> Tensor");
|
||||||
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
|
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
|
||||||
|
|
||||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
@@ -199,16 +263,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
|
|
||||||
// Check if cutlass scaled_mm is supported for CUDA devices of the given
|
// Check if cutlass scaled_mm is supported for CUDA devices of the given
|
||||||
// capability
|
// capability
|
||||||
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
|
ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
|
||||||
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
|
ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
|
||||||
&cutlass_scaled_mm_supports_fp8);
|
|
||||||
// Mamba selective scan kernel
|
// Mamba selective scan kernel
|
||||||
ops.def(
|
ops.def(
|
||||||
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
||||||
"Tensor! A, Tensor! B, Tensor! C,"
|
"Tensor! A, Tensor! B, Tensor! C,"
|
||||||
"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
|
"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
|
||||||
"bool delta_softplus,"
|
"bool delta_softplus,"
|
||||||
"Tensor? index_, Tensor? x) -> Tensor[]");
|
"Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
|
||||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||||
|
|
||||||
ops.def(
|
ops.def(
|
||||||
@@ -230,19 +294,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Quantized GEMM for GPTQ.
|
// Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_gemm", &gptq_gemm);
|
// Note: even though the C++ inferred schema is correct for this op, it seems
|
||||||
|
// to prevent the meta function registry.
|
||||||
|
ops.def(
|
||||||
|
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
|
||||||
|
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
|
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
|
||||||
|
|
||||||
// Post processing for GPTQ.
|
// Post processing for GPTQ.
|
||||||
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
|
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
|
||||||
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
|
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
|
||||||
|
|
||||||
// Quantized GEMM for SqueezeLLM.
|
|
||||||
ops.def(
|
|
||||||
"squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
|
|
||||||
"lookup_table) -> ()");
|
|
||||||
ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
|
|
||||||
|
|
||||||
// Compute FP8 quantized tensor for given scaling factor.
|
// Compute FP8 quantized tensor for given scaling factor.
|
||||||
ops.def(
|
ops.def(
|
||||||
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
|
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
|
||||||
@@ -256,8 +319,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
|
|
||||||
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
|
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
|
||||||
ops.def(
|
ops.def(
|
||||||
"dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
|
"dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
|
||||||
"scale, Tensor? scale_ub) -> "
|
"Tensor! scale, Tensor? scale_ub) -> "
|
||||||
"()");
|
"()");
|
||||||
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
|
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
|
||||||
&dynamic_per_token_scaled_fp8_quant);
|
&dynamic_per_token_scaled_fp8_quant);
|
||||||
@@ -294,8 +357,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Copy the cache blocks from src to dst.
|
// Copy the cache blocks from src to dst.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
|
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||||
"block_mapping) -> ()");
|
"Tensor block_mapping) -> ()");
|
||||||
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
|
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
|
||||||
|
|
||||||
// Reshape the key and value tensors and cache them.
|
// Reshape the key and value tensors and cache them.
|
||||||
@@ -320,8 +383,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Convert the key and value cache to fp8 data type.
|
// Convert the key and value cache to fp8 data type.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
|
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
|
||||||
"kv_cache_dtype) -> ()");
|
"str kv_cache_dtype) -> ()");
|
||||||
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
|
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -329,24 +392,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
|||||||
// Cuda utils
|
// Cuda utils
|
||||||
|
|
||||||
// Gets the specified device attribute.
|
// Gets the specified device attribute.
|
||||||
cuda_utils.def("get_device_attribute", &get_device_attribute);
|
cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
|
||||||
cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
|
cuda_utils.impl("get_device_attribute", &get_device_attribute);
|
||||||
|
|
||||||
// Gets the maximum shared memory per block device attribute.
|
// Gets the maximum shared memory per block device attribute.
|
||||||
cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
|
cuda_utils.def(
|
||||||
&get_max_shared_memory_per_block_device_attribute);
|
"get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
|
||||||
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
|
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
|
||||||
torch::kCUDA,
|
|
||||||
&get_max_shared_memory_per_block_device_attribute);
|
&get_max_shared_memory_per_block_device_attribute);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
||||||
// Custom all-reduce kernels
|
// Custom all-reduce kernels
|
||||||
custom_ar.def("init_custom_ar", &init_custom_ar);
|
custom_ar.def(
|
||||||
|
"init_custom_ar(Tensor meta, Tensor rank_data, "
|
||||||
|
"str[] handles, int[] offsets, int rank, "
|
||||||
|
"bool full_nvlink) -> int");
|
||||||
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
|
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
|
||||||
|
|
||||||
custom_ar.def("should_custom_ar", &should_custom_ar);
|
custom_ar.def(
|
||||||
|
"should_custom_ar(Tensor inp, int max_size, int world_size, "
|
||||||
|
"bool full_nvlink) -> bool");
|
||||||
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
|
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
|
||||||
|
|
||||||
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
|
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
|
||||||
@@ -358,21 +425,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
|||||||
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
|
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
|
||||||
|
|
||||||
custom_ar.def("dispose", &dispose);
|
custom_ar.def("dispose", &dispose);
|
||||||
custom_ar.impl("dispose", torch::kCPU, &dispose);
|
|
||||||
|
|
||||||
custom_ar.def("meta_size", &meta_size);
|
custom_ar.def("meta_size", &meta_size);
|
||||||
custom_ar.impl("meta_size", torch::kCPU, &meta_size);
|
|
||||||
|
|
||||||
custom_ar.def("register_buffer", &register_buffer);
|
custom_ar.def(
|
||||||
|
"register_buffer(int fa, Tensor t, str[] handles, "
|
||||||
|
"int[] offsets) -> ()");
|
||||||
custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
|
custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
|
||||||
|
|
||||||
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
|
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
|
||||||
custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
|
|
||||||
&get_graph_buffer_ipc_meta);
|
|
||||||
|
|
||||||
custom_ar.def("register_graph_buffers", &register_graph_buffers);
|
custom_ar.def("register_graph_buffers", &register_graph_buffers);
|
||||||
custom_ar.impl("register_graph_buffers", torch::kCPU,
|
|
||||||
&register_graph_buffers);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,5 @@ pydantic >= 2.8
|
|||||||
torch
|
torch
|
||||||
py-cpuinfo
|
py-cpuinfo
|
||||||
transformers
|
transformers
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
|
||||||
mistral_common >= 1.3.4
|
mistral_common >= 1.3.4
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||||
@@ -5,6 +5,7 @@ vLLM Meetups
|
|||||||
|
|
||||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||||
|
|
||||||
|
- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
|
||||||
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
|
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
|
||||||
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
|
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
|
||||||
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
|
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ autodoc_mock_imports = [
|
|||||||
"aiohttp",
|
"aiohttp",
|
||||||
"compressed_tensors",
|
"compressed_tensors",
|
||||||
"cpuinfo",
|
"cpuinfo",
|
||||||
|
"cv2",
|
||||||
"torch",
|
"torch",
|
||||||
"transformers",
|
"transformers",
|
||||||
"psutil",
|
"psutil",
|
||||||
|
|||||||
@@ -18,13 +18,27 @@ Traces can be visualized using https://ui.perfetto.dev/.
|
|||||||
|
|
||||||
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
||||||
|
|
||||||
Example commands:
|
.. tip::
|
||||||
|
|
||||||
|
When the profiler is stopped, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, it takes about 10 minutes to flush on an H100.
|
||||||
|
Set the environment variable ``VLLM_RPC_GET_DATA_TIMEOUT_MS`` to a large value before you start the server, for example 30 minutes:
|
||||||
|
``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
|
||||||
|
|
||||||
|
Example commands and usage:
|
||||||
|
===========================
|
||||||
|
|
||||||
|
Offline Inference:
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
|
||||||
|
|
||||||
|
|
||||||
OpenAI Server:
|
OpenAI Server:
|
||||||
|
--------------
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
|
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
|
||||||
|
|
||||||
benchmark_serving.py:
|
benchmark_serving.py:
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still
|
|||||||
|
|
||||||
With more logging, hopefully you can find the root cause of the issue.
|
With more logging, hopefully you can find the root cause of the issue.
|
||||||
|
|
||||||
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
|
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
|
||||||
|
|
||||||
Here are some common issues that can cause hangs:
|
Here are some common issues that can cause hangs:
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,9 @@ Offline Batched Inference
|
|||||||
|
|
||||||
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
|
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
|
||||||
|
|
||||||
Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process.
|
Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
|
||||||
|
The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
|
||||||
|
The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The
|
|||||||
]
|
]
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
|
Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
|
|||||||
@@ -107,3 +107,55 @@ The following is an example request
|
|||||||
"max_tokens": 7,
|
"max_tokens": 7,
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
}' | jq
|
}' | jq
|
||||||
|
|
||||||
|
|
||||||
|
Dynamically serving LoRA Adapters
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
|
||||||
|
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
|
||||||
|
to change models on-the-fly is needed.
|
||||||
|
|
||||||
|
Note: Enabling this feature in production environments is risky, as users may participate in model adapter management.
|
||||||
|
|
||||||
|
To enable dynamic LoRA loading and unloading, ensure that the environment variable ``VLLM_ALLOW_RUNTIME_LORA_UPDATING``
|
||||||
|
is set to ``True``. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
|
||||||
|
|
||||||
|
|
||||||
|
Loading a LoRA Adapter:
|
||||||
|
|
||||||
|
To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
|
||||||
|
details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
|
||||||
|
|
||||||
|
Example request to load a LoRA adapter:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
curl -X POST http://localhost:8000/v1/load_lora_adapter \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"lora_name": "sql_adapter",
|
||||||
|
"lora_path": "/path/to/sql-lora-adapter"
|
||||||
|
}'
|
||||||
|
|
||||||
|
Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
|
||||||
|
cannot be found or loaded, an appropriate error message will be returned.
|
||||||
|
|
||||||
|
Unloading a LoRA Adapter:
|
||||||
|
|
||||||
|
To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
|
||||||
|
with the name or ID of the adapter to be unloaded.
|
||||||
|
|
||||||
|
Example request to unload a LoRA adapter:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"lora_name": "sql_adapter"
|
||||||
|
}'
|
||||||
|
|||||||
@@ -161,6 +161,46 @@ A variety of speculative models of this type are available on HF hub:
|
|||||||
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
|
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
|
||||||
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
|
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
|
||||||
|
|
||||||
|
Lossless guarantees of Speculative Decoding
|
||||||
|
-------------------------------------------
|
||||||
|
In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
|
||||||
|
speculative decoding, breaking down the guarantees into three key areas:
|
||||||
|
|
||||||
|
1. **Theoretical Losslessness**
|
||||||
|
- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might
|
||||||
|
cause slight variations in output distributions, as discussed
|
||||||
|
in `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_
|
||||||
|
|
||||||
|
2. **Algorithmic Losslessness**
|
||||||
|
- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
|
||||||
|
|
||||||
|
- **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target
|
||||||
|
distribution. `View Test Code <https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252>`_
|
||||||
|
|
||||||
|
- **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
||||||
|
without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
||||||
|
provides a lossless guarantee. Almost all of the tests in `this directory <https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e>`_
|
||||||
|
verify this property using `this assertion implementation <https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291>`_
|
||||||
|
|
||||||
|
3. **vLLM Logprob Stability**
|
||||||
|
- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
|
||||||
|
same request across runs. For more details, see the FAQ section
|
||||||
|
titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
|
||||||
|
|
||||||
|
|
||||||
|
**Conclusion**
|
||||||
|
|
||||||
|
While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
|
||||||
|
can occur due to the following factors:
|
||||||
|
|
||||||
|
- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
|
||||||
|
|
||||||
|
- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
|
||||||
|
due to non-deterministic behavior in batched operations or numerical instability.
|
||||||
|
|
||||||
|
**Mitigation Strategies**
|
||||||
|
|
||||||
|
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
|
||||||
|
|
||||||
Resources for vLLM contributors
|
Resources for vLLM contributors
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|||||||
@@ -194,12 +194,12 @@ Multimodal Language Models
|
|||||||
|
|
||||||
* - Architecture
|
* - Architecture
|
||||||
- Models
|
- Models
|
||||||
- Supported Modalities
|
- Modalities
|
||||||
- Example HuggingFace Models
|
- Example HuggingFace Models
|
||||||
- :ref:`LoRA <lora>`
|
- :ref:`LoRA <lora>`
|
||||||
* - :code:`Blip2ForConditionalGeneration`
|
* - :code:`Blip2ForConditionalGeneration`
|
||||||
- BLIP-2
|
- BLIP-2
|
||||||
- Image
|
- Image\ :sup:`E`
|
||||||
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
|
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`ChameleonForConditionalGeneration`
|
* - :code:`ChameleonForConditionalGeneration`
|
||||||
@@ -214,44 +214,75 @@ Multimodal Language Models
|
|||||||
-
|
-
|
||||||
* - :code:`InternVLChatModel`
|
* - :code:`InternVLChatModel`
|
||||||
- InternVL2
|
- InternVL2
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
|
- :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`LlavaForConditionalGeneration`
|
* - :code:`LlavaForConditionalGeneration`
|
||||||
- LLaVA-1.5
|
- LLaVA-1.5
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
|
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`LlavaNextForConditionalGeneration`
|
* - :code:`LlavaNextForConditionalGeneration`
|
||||||
- LLaVA-NeXT
|
- LLaVA-NeXT
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
|
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
|
||||||
-
|
-
|
||||||
|
* - :code:`LlavaNextVideoForConditionalGeneration`
|
||||||
|
- LLaVA-NeXT-Video
|
||||||
|
- Video
|
||||||
|
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
|
||||||
|
-
|
||||||
|
* - :code:`MiniCPMV`
|
||||||
|
- MiniCPM-V
|
||||||
|
- Image\ :sup:`+`
|
||||||
|
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
|
||||||
|
-
|
||||||
* - :code:`PaliGemmaForConditionalGeneration`
|
* - :code:`PaliGemmaForConditionalGeneration`
|
||||||
- PaliGemma
|
- PaliGemma
|
||||||
- Image
|
- Image\ :sup:`E`
|
||||||
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
|
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`Phi3VForCausalLM`
|
* - :code:`Phi3VForCausalLM`
|
||||||
- Phi-3-Vision, Phi-3.5-Vision
|
- Phi-3-Vision, Phi-3.5-Vision
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
|
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
|
||||||
-
|
-
|
||||||
* - :code:`MiniCPMV`
|
* - :code:`PixtralForConditionalGeneration`
|
||||||
- MiniCPM-V
|
- Pixtral
|
||||||
- Image
|
- Image\ :sup:`+`
|
||||||
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
|
- :code:`mistralai/Pixtral-12B-2409`
|
||||||
|
-
|
||||||
|
* - :code:`QWenLMHeadModel`
|
||||||
|
- Qwen-VL
|
||||||
|
- Image\ :sup:`E+`
|
||||||
|
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
|
||||||
|
-
|
||||||
|
* - :code:`Qwen2VLForConditionalGeneration`
|
||||||
|
- Qwen2-VL (see note)
|
||||||
|
- Image\ :sup:`+` / Video\ :sup:`+`
|
||||||
|
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`UltravoxModel`
|
* - :code:`UltravoxModel`
|
||||||
- Ultravox
|
- Ultravox
|
||||||
- Audio
|
- Audio\ :sup:`E+`
|
||||||
- :code:`fixie-ai/ultravox-v0_3`
|
- :code:`fixie-ai/ultravox-v0_3`
|
||||||
-
|
-
|
||||||
|
|
||||||
|
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
|
||||||
|
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
|
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
|
||||||
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
|
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
|
||||||
|
This can be installed by running the following command:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
||||||
|
|||||||
@@ -9,26 +9,23 @@ This document shows you how to run and serve these models using vLLM.
|
|||||||
.. important::
|
.. important::
|
||||||
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
|
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
|
||||||
|
|
||||||
Currently, the support for vision language models on vLLM has the following limitations:
|
|
||||||
|
|
||||||
* Only single image input is supported per text prompt.
|
|
||||||
|
|
||||||
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
|
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
|
||||||
|
|
||||||
Offline Batched Inference
|
Offline Inference
|
||||||
-------------------------
|
-----------------
|
||||||
|
|
||||||
To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
|
Single-image input
|
||||||
|
^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||||
|
|
||||||
.. important::
|
.. note::
|
||||||
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
||||||
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
|
the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.
|
||||||
internally for each model.
|
|
||||||
|
|
||||||
|
|
||||||
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
|
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
|
||||||
|
|
||||||
@@ -86,61 +83,117 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
|
|||||||
|
|
||||||
A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
|
A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
|
||||||
|
|
||||||
|
Multi-image input
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
Online OpenAI Vision API Compatible Inference
|
Multi-image input is only supported for a subset of VLMs, as shown :ref:`here <supported_vlms>`.
|
||||||
----------------------------------------------
|
|
||||||
|
To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
trust_remote_code=True, # Required to load Phi-3.5-vision
|
||||||
|
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
|
||||||
|
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
|
||||||
|
)
|
||||||
|
|
||||||
|
Instead of passing in a single image, you can pass in a list of images.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
|
prompt = "<|user|>\n<image_1>\n<image_2>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
|
||||||
|
|
||||||
|
# Load the images using PIL.Image
|
||||||
|
image1 = PIL.Image.open(...)
|
||||||
|
image2 = PIL.Image.open(...)
|
||||||
|
|
||||||
|
outputs = llm.generate({
|
||||||
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": {
|
||||||
|
"image": [image1, image2]
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
|
||||||
|
A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
|
||||||
|
|
||||||
|
Online Inference
|
||||||
|
----------------
|
||||||
|
|
||||||
|
OpenAI Vision API
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
|
You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
|
||||||
|
|
||||||
.. note::
|
Below is an example of how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server.
|
||||||
Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
|
|
||||||
added in the future.
|
|
||||||
|
|
||||||
Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server.
|
|
||||||
|
|
||||||
.. important::
|
|
||||||
Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template
|
|
||||||
is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the
|
|
||||||
HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
|
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
|
||||||
|
--trust-remote-code --limit-mm-per-prompt image=2
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
|
||||||
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
|
a chat template is **required** to launch the API server.
|
||||||
internally for each model.
|
|
||||||
|
Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
|
||||||
|
The chat template can be inferred based on the documentation on the model's HuggingFace repo.
|
||||||
|
For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
|
||||||
|
|
||||||
To consume the server, you can use the OpenAI client like in the example below:
|
To consume the server, you can use the OpenAI client like in the example below:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
openai_api_key = "EMPTY"
|
openai_api_key = "EMPTY"
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
api_key=openai_api_key,
|
api_key=openai_api_key,
|
||||||
base_url=openai_api_base,
|
base_url=openai_api_base,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Single-image input inference
|
||||||
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
|
|
||||||
chat_response = client.chat.completions.create(
|
chat_response = client.chat.completions.create(
|
||||||
model="llava-hf/llava-1.5-7b-hf",
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
messages=[{
|
messages=[{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||||
# since the prompt will be processed automatically by the API server.
|
# since the prompt will be processed automatically by the API server.
|
||||||
{"type": "text", "text": "What's in this image?"},
|
{"type": "text", "text": "What’s in this image?"},
|
||||||
{
|
{"type": "image_url", "image_url": {"url": image_url}},
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {
|
|
||||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
],
|
||||||
}],
|
}],
|
||||||
)
|
)
|
||||||
print("Chat response:", chat_response)
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
|
||||||
|
# Multi-image input inference
|
||||||
|
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||||
|
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||||
|
|
||||||
|
chat_response = client.chat.completions.create(
|
||||||
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
messages=[{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "What are the animals in these images?"},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url_duck}},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url_lion}},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
|
||||||
|
|
||||||
A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
|
A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
|
||||||
|
|
||||||
|
|||||||
@@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations
|
|||||||
- ✗
|
- ✗
|
||||||
- ✗
|
- ✗
|
||||||
- ✗
|
- ✗
|
||||||
* - SqueezeLLM
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
^^^^^^
|
^^^^^^
|
||||||
|
|||||||
@@ -10,3 +10,22 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
|
|||||||
Q: Which model to use for offline inference embedding?
|
Q: Which model to use for offline inference embedding?
|
||||||
|
|
||||||
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8b and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models.
|
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. In contrast, models such as Llama-3-8b and Mistral-7B-Instruct-v0.3 are generation models rather than embedding models.
|
||||||
|
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
Q: Can the output of a prompt vary across runs in vLLM?
|
||||||
|
|
||||||
|
A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to
|
||||||
|
numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details,
|
||||||
|
see the `Numerical Accuracy section <https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations>`_.
|
||||||
|
|
||||||
|
In vLLM, the same requests might be batched differently due to factors such as other concurrent requests,
|
||||||
|
changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations,
|
||||||
|
can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
|
||||||
|
different tokens being sampled. Once a different token is sampled, further divergence is likely.
|
||||||
|
|
||||||
|
**Mitigation Strategies**
|
||||||
|
|
||||||
|
- For improved stability and reduced variance, use `float32`. Note that this will require more memory.
|
||||||
|
- If using `bfloat16`, switching to `float16` can also help.
|
||||||
|
- Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur.
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various
|
|||||||
|
|
||||||
python3 benchmarks/benchmark_throughput.py --help
|
python3 benchmarks/benchmark_throughput.py --help
|
||||||
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
|
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
|
||||||
[--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
|
[--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
|
||||||
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
|
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
|
||||||
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
|
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
|
||||||
[--quantization-param-path KV_CACHE_quantization_param_path]
|
[--quantization-param-path KV_CACHE_quantization_param_path]
|
||||||
@@ -76,7 +76,7 @@ optional arguments:
|
|||||||
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
|
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
|
||||||
--model MODEL
|
--model MODEL
|
||||||
--tokenizer TOKENIZER
|
--tokenizer TOKENIZER
|
||||||
--quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
|
--quantization {awq,gptq,None}, -q {awq,gptq,None}
|
||||||
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
|
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
|
||||||
--n N Number of generated sequences per prompt.
|
--n N Number of generated sequences per prompt.
|
||||||
--use-beam-search
|
--use-beam-search
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
### Quantizer Utilities
|
### Quantizer Utilities
|
||||||
`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
|
`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
|
||||||
`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
|
from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
|
||||||
|
|
||||||
### Prerequisite
|
### Prerequisite
|
||||||
|
|
||||||
|
|||||||
165
examples/offline_inference_pixtral.py
Normal file
165
examples/offline_inference_pixtral.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
# ruff: noqa
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from vllm import LLM
|
||||||
|
from vllm.sampling_params import SamplingParams
|
||||||
|
|
||||||
|
# This script is an offline demo for running Pixtral.
|
||||||
|
#
|
||||||
|
# If you want to run a server/client setup, please follow this code:
|
||||||
|
#
|
||||||
|
# - Server:
|
||||||
|
#
|
||||||
|
# ```bash
|
||||||
|
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||||
|
# ```
|
||||||
|
#
|
||||||
|
# - Client:
|
||||||
|
#
|
||||||
|
# ```bash
|
||||||
|
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
|
||||||
|
# --header 'Content-Type: application/json' \
|
||||||
|
# --header 'Authorization: Bearer token' \
|
||||||
|
# --data '{
|
||||||
|
# "model": "mistralai/Pixtral-12B-2409",
|
||||||
|
# "messages": [
|
||||||
|
# {
|
||||||
|
# "role": "user",
|
||||||
|
# "content": [
|
||||||
|
# {"type" : "text", "text": "Describe this image in detail please."},
|
||||||
|
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
|
||||||
|
# {"type" : "text", "text": "and this one as well. Answer in French."},
|
||||||
|
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
|
||||||
|
# ]
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# }'
|
||||||
|
# ```
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python offline_inference_pixtral.py simple
|
||||||
|
# python offline_inference_pixtral.py advanced
|
||||||
|
|
||||||
|
|
||||||
|
def run_simple_demo():
    """Describe a single remote image with Pixtral through ``LLM.chat``.

    Builds one user turn that holds a text instruction followed by an image
    URL, sends it to the model, and prints the text of the first completion.
    """
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
    llm = LLM(model="mistralai/Pixtral-12B-2409", tokenizer_mode="mistral")

    text_part = {
        "type": "text",
        "text": "Describe this image in one sentence.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": "https://picsum.photos/id/237/200/300"},
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]

    results = llm.chat(conversation, sampling_params=sampling_params)
    first_completion = results[0].outputs[0]

    print(first_completion.text)
|
||||||
|
|
||||||
|
|
||||||
|
def run_advanced_demo():
    """Run a multi-turn, multi-image Pixtral conversation and print the reply.

    The engine is limited to ``max_img_per_msg`` images per prompt and its
    context length is set to ``max_img_per_msg * max_tokens_per_img``.
    """
    max_img_per_msg = 5
    max_tokens_per_img = 4096

    def image_part(url):
        # Chat content entry that refers to an image by URL.
        return {"type": "image_url", "image_url": {"url": url}}

    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
    llm = LLM(
        model="mistralai/Pixtral-12B-2409",
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
    )

    first_url = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
    second_url = "https://picsum.photos/seed/picsum/200/300"
    third_url = "https://picsum.photos/id/32/512/512"

    # Turn 1: instruction plus two images.
    opening_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the following image."},
            image_part(first_url),
            image_part(second_url),
        ],
    }
    # Turn 2: a canned assistant reply that seeds the history.
    assistant_turn = {
        "role": "assistant",
        "content": "The images show nature.",
    }
    # Turn 3: plain-text follow-up from the user.
    follow_up_turn = {
        "role": "user",
        "content": "More details please and answer only in French!.",
    }
    # Turn 4: an image-only user message.
    extra_image_turn = {
        "role": "user",
        "content": [image_part(third_url)],
    }

    conversation = [
        opening_turn,
        assistant_turn,
        follow_up_turn,
        extra_image_turn,
    ]

    results = llm.chat(messages=conversation, sampling_params=sampling_params)
    print(results[0].outputs[0].text)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Read the demo mode from the command line and run the matching demo."""
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode.")
    parser.add_argument(
        "mode",
        choices=["simple", "advanced"],
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
    mode = parser.parse_args().mode

    if mode == "simple":
        print("Running simple demo...")
        run_simple_demo()
        return

    if mode == "advanced":
        print("Running advanced demo...")
        run_advanced_demo()


if __name__ == "__main__":
    main()
|
||||||
@@ -9,12 +9,9 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# Input image and question
|
|
||||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
|
||||||
question = "What is the content of this image?"
|
|
||||||
|
|
||||||
|
|
||||||
# LLaVA-1.5
|
# LLaVA-1.5
|
||||||
def run_llava(question):
|
def run_llava(question):
|
||||||
@@ -30,7 +27,16 @@ def run_llava(question):
|
|||||||
def run_llava_next(question):
|
def run_llava_next(question):
|
||||||
|
|
||||||
prompt = f"[INST] <image>\n{question} [/INST]"
|
prompt = f"[INST] <image>\n{question} [/INST]"
|
||||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
|
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
|
||||||
|
stop_token_ids = None
|
||||||
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
|
# LlaVA-NeXT-Video
|
||||||
|
# Currently only supports video input
|
||||||
|
def run_llava_next_video(question):
|
||||||
|
prompt = f"USER: <video>\n{question} ASSISTANT:"
|
||||||
|
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
|
||||||
stop_token_ids = None
|
stop_token_ids = None
|
||||||
return llm, prompt, stop_token_ids
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
@@ -159,9 +165,41 @@ def run_blip2(question):
|
|||||||
return llm, prompt, stop_token_ids
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
|
# Qwen
|
||||||
|
def run_qwen_vl(question):
    """Build the Qwen-VL engine and a single-image prompt for ``question``.

    Returns:
        The ``(llm, prompt, stop_token_ids)`` triple; ``stop_token_ids`` is
        ``None`` for this model.
    """
    engine = LLM(model="Qwen/Qwen-VL", trust_remote_code=True, max_num_seqs=5)

    # The question comes first, followed by the numbered image placeholder.
    prompt = f"{question}Picture 1: <img></img>\n"

    return engine, prompt, None
|
||||||
|
|
||||||
|
|
||||||
|
# Qwen2-VL
|
||||||
|
def run_qwen2_vl(question):
    """Build the Qwen2-VL engine and a single-image chat prompt.

    Returns:
        The ``(llm, prompt, stop_token_ids)`` triple; ``stop_token_ids`` is
        ``None`` for this model.
    """
    engine = LLM(model="Qwen/Qwen2-VL-7B-Instruct", max_num_seqs=5)

    # System turn, user turn (image placeholder + question), then the opening
    # of the assistant turn.
    segments = [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>",
        f"{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]

    return engine, "".join(segments), None
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
"llava": run_llava,
|
"llava": run_llava,
|
||||||
"llava-next": run_llava_next,
|
"llava-next": run_llava_next,
|
||||||
|
"llava-next-video": run_llava_next_video,
|
||||||
"fuyu": run_fuyu,
|
"fuyu": run_fuyu,
|
||||||
"phi3_v": run_phi3v,
|
"phi3_v": run_phi3v,
|
||||||
"paligemma": run_paligemma,
|
"paligemma": run_paligemma,
|
||||||
@@ -169,14 +207,54 @@ model_example_map = {
|
|||||||
"minicpmv": run_minicpmv,
|
"minicpmv": run_minicpmv,
|
||||||
"blip-2": run_blip2,
|
"blip-2": run_blip2,
|
||||||
"internvl_chat": run_internvl,
|
"internvl_chat": run_internvl,
|
||||||
|
"qwen_vl": run_qwen_vl,
|
||||||
|
"qwen2_vl": run_qwen2_vl,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_multi_modal_input(args):
    """Load the demo asset and question that match ``args.modality``.

    Args:
        args: Parsed command-line namespace. ``modality`` selects the asset;
            ``num_frames`` is read only for the video modality.

    Returns:
        A dict holding the loaded media under ``"data"`` and the prompt text
        under ``"question"``.

    Raises:
        ValueError: If ``args.modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    modality = args.modality

    if modality == "image":
        # Input image and question
        cherry_blossom = ImageAsset("cherry_blossom").pil_image
        return {
            "data": cherry_blossom.convert("RGB"),
            "question": "What is the content of this image?",
        }

    if modality == "video":
        # Input video and question
        frames = VideoAsset(
            name="sample_demo_1.mp4",
            num_frames=args.num_frames,
        ).np_ndarrays
        return {
            "data": frames,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {modality} is not supported.")
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
model = args.model_type
|
model = args.model_type
|
||||||
if model not in model_example_map:
|
if model not in model_example_map:
|
||||||
raise ValueError(f"Model type {model} is not supported.")
|
raise ValueError(f"Model type {model} is not supported.")
|
||||||
|
|
||||||
|
modality = args.modality
|
||||||
|
mm_input = get_multi_modal_input(args)
|
||||||
|
data = mm_input["data"]
|
||||||
|
question = mm_input["question"]
|
||||||
|
|
||||||
llm, prompt, stop_token_ids = model_example_map[model](question)
|
llm, prompt, stop_token_ids = model_example_map[model](question)
|
||||||
|
|
||||||
# We set temperature to 0.2 so that outputs can be different
|
# We set temperature to 0.2 so that outputs can be different
|
||||||
@@ -191,7 +269,7 @@ def main(args):
|
|||||||
inputs = {
|
inputs = {
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"multi_modal_data": {
|
"multi_modal_data": {
|
||||||
"image": image
|
modality: data
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,7 +278,7 @@ def main(args):
|
|||||||
inputs = [{
|
inputs = [{
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"multi_modal_data": {
|
"multi_modal_data": {
|
||||||
"image": image
|
modality: data
|
||||||
},
|
},
|
||||||
} for _ in range(args.num_prompts)]
|
} for _ in range(args.num_prompts)]
|
||||||
|
|
||||||
@@ -223,8 +301,15 @@ if __name__ == "__main__":
|
|||||||
help='Huggingface "model_type".')
|
help='Huggingface "model_type".')
|
||||||
parser.add_argument('--num-prompts',
|
parser.add_argument('--num-prompts',
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=4,
|
||||||
help='Number of prompts to run.')
|
help='Number of prompts to run.')
|
||||||
|
parser.add_argument('--modality',
|
||||||
|
type=str,
|
||||||
|
default="image",
|
||||||
|
help='Modality of the input.')
|
||||||
|
parser.add_argument('--num-frames',
|
||||||
|
type=int,
|
||||||
|
default=16,
|
||||||
|
help='Number of frames to extract from the video.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
243
examples/offline_inference_vision_language_multi_image.py
Normal file
243
examples/offline_inference_vision_language_multi_image.py
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
"""
|
||||||
|
This example shows how to use vLLM for running offline inference with
|
||||||
|
multi-image input on vision language models, using the chat template defined
|
||||||
|
by the model.
|
||||||
|
"""
|
||||||
|
from argparse import Namespace
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from transformers import AutoProcessor, AutoTokenizer
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.multimodal.utils import fetch_image
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
QUESTION = "What is the content of each image?"
|
||||||
|
IMAGE_URLS = [
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_qwenvl_chat(question: str, image_urls: List[str]):
    """Prepare Qwen-VL-Chat for a multi-image question.

    Args:
        question: Text appended after the image placeholders.
        image_urls: Image URLs; only their count is used here, to size the
            per-prompt image limit and to number the placeholders.

    Returns:
        ``(llm, prompt, stop_token_ids, None, chat_template)``. The fourth
        slot (pre-loaded image data) is left as ``None``.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    picture_lines = [
        f"Picture {idx}: <img></img>\n"
        for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "".join(picture_lines)

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    user_turn = {'role': 'user', 'content': f"{placeholders}\n{question}"}
    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    # Generation stops on any of these special tokens.
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = list(map(tokenizer.convert_tokens_to_ids, stop_tokens))

    return llm, prompt, stop_token_ids, None, chat_template
|
||||||
|
|
||||||
|
|
||||||
|
def load_phi3v(question: str, image_urls: List[str]):
    """Prepare Phi-3.5-vision for a multi-image question.

    Args:
        question: Text placed after the image placeholders.
        image_urls: Image URLs; only their count is used here, to size the
            per-prompt image limit and to number the placeholders.

    Returns:
        ``(llm, prompt, None, None, None)``: no stop token ids, pre-loaded
        image data, or chat template are supplied for this model.
    """
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One numbered tag per image, each on its own line.
    image_tags = [
        f"<|image_{idx}|>" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_tags)

    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return llm, prompt, None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def load_internvl(question: str, image_urls: List[str]):
    """Prepare InternVL2-2B for a multi-image question.

    Args:
        question: Text placed after the image placeholders.
        image_urls: Image URLs; only their count is used here, to size the
            per-prompt image limit and to number the placeholders.

    Returns:
        ``(llm, prompt, stop_token_ids, None, None)``: no pre-loaded image
        data or chat template are supplied for this model.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_num_seqs=5,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_lines)
    user_turn = {'role': 'user', 'content': f"{placeholders}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = list(map(tokenizer.convert_tokens_to_ids, stop_tokens))

    return llm, prompt, stop_token_ids, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def load_qwen2_vl(question, image_urls: List[str]):
    """Prepare Qwen2-VL-7B-Instruct for a multi-image question.

    If the optional ``qwen_vl_utils`` package is importable, its
    ``process_vision_info`` produces the image inputs and a 4096-token context
    is used. Otherwise the images are fetched directly and the context is
    raised to 32768 tokens.

    Args:
        question: Text placed after the images in the user turn.
        image_urls: URLs of the images to include.

    Returns:
        ``(llm, prompt, None, image_data, None)``: no stop token ids or chat
        template are supplied for this model.
    """
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print('WARNING: `qwen-vl-utils` not installed, input images will not '
              'be automatically resized. You can enable this functionality by '
              '`pip install qwen-vl-utils`.')
        process_vision_info = None

    have_vision_utils = process_vision_info is not None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    llm = LLM(
        model=model_name,
        max_num_seqs=5,
        max_model_len=4096 if have_vision_utils else 32768,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant.",
    }
    # Images first, then the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [system_turn, {"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if have_vision_utils:
        image_data, _ = process_vision_info(messages)
    else:
        image_data = [fetch_image(url) for url in image_urls]

    return llm, prompt, None, image_data, None
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of supported models: maps each `--model-type` choice to the loader
# function that builds the engine and prompt for that model.
model_example_map = {
    "phi3_v": load_phi3v,
    "internvl_chat": load_internvl,
    "qwen2_vl": load_qwen2_vl,
    "qwen_vl_chat": load_qwenvl_chat,
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_generate(model, question: str, image_urls: List[str]):
    """Answer ``question`` about the images through ``LLM.generate``.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Question passed to the loader to build the prompt.
        image_urls: URLs of the images; fetched here when the loader did not
            return image data itself.
    """
    loader = model_example_map[model]
    llm, prompt, stop_token_ids, image_data, _ = loader(question, image_urls)

    if image_data is None:
        image_data = list(map(fetch_image, image_urls))

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=128,
        stop_token_ids=stop_token_ids,
    )

    request = {
        "prompt": prompt,
        "multi_modal_data": {"image": image_data},
    }
    outputs = llm.generate(request, sampling_params=sampling_params)

    for request_output in outputs:
        print(request_output.outputs[0].text)
|
||||||
|
|
||||||
|
|
||||||
|
def run_chat(model: str, question: str, image_urls: List[str]):
    """Answer ``question`` about the images through ``LLM.chat``.

    The loader's prompt and image data are ignored: the chat message is built
    here from the question text followed by one ``image_url`` entry per URL.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Question text placed first in the user message.
        image_urls: URLs referenced by the user message.
    """
    loader = model_example_map[model]
    llm, _, stop_token_ids, _, chat_template = loader(question, image_urls)

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=128,
        stop_token_ids=stop_token_ids,
    )

    content = [{"type": "text", "text": question}]
    content.extend({
        "type": "image_url",
        "image_url": {"url": image_url},
    } for image_url in image_urls)

    outputs = llm.chat(
        [{"role": "user", "content": content}],
        sampling_params=sampling_params,
        chat_template=chat_template,
    )

    for request_output in outputs:
        print(request_output.outputs[0].text)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Namespace):
    """Run the multi-image demo with the selected model and method.

    Args:
        args: Parsed arguments; ``model_type`` and ``method`` are read.

    Raises:
        ValueError: If ``args.method`` is neither ``"generate"`` nor
            ``"chat"``.
    """
    model = args.model_type
    method = args.method

    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")

    runner = run_generate if method == "generate" else run_chat
    runner(model, QUESTION, IMAGE_URLS)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input')
    parser.add_argument(
        '--model-type',
        '-m',
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )

    args = parser.parse_args()
    main(args)
|
||||||
33
examples/offline_inference_with_profiler.py
Normal file
33
examples/offline_inference_with_profiler.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
# enable torch profiler, can also be set on cmd line
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

llm.start_profile()
try:
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
finally:
    # Stop the profiler even when generation raises, so a failed run does not
    # leave profiling switched on.
    llm.stop_profile()

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
@@ -27,9 +27,10 @@ client = OpenAI(
|
|||||||
models = client.models.list()
|
models = client.models.list()
|
||||||
model = models.data[0].id
|
model = models.data[0].id
|
||||||
|
|
||||||
|
# Single-image input inference
|
||||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
|
|
||||||
# Use image url in the payload
|
## Use image url in the payload
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
messages=[{
|
messages=[{
|
||||||
"role":
|
"role":
|
||||||
@@ -52,10 +53,10 @@ chat_completion_from_url = client.chat.completions.create(
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print(f"Chat completion output:{result}")
|
print("Chat completion output:", result)
|
||||||
|
|
||||||
|
|
||||||
# Use base64 encoded image in the payload
|
## Use base64 encoded image in the payload
|
||||||
def encode_image_base64_from_url(image_url: str) -> str:
|
def encode_image_base64_from_url(image_url: str) -> str:
|
||||||
"""Encode an image retrieved from a remote url to base64 format."""
|
"""Encode an image retrieved from a remote url to base64 format."""
|
||||||
|
|
||||||
@@ -122,4 +123,4 @@ chat_completion_from_url = client.chat.completions.create(
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print(f"Chat completion output:{result}")
|
print("Chat completion output:", result)
|
||||||
|
|||||||
@@ -97,12 +97,13 @@
|
|||||||
{{- '{' }}
|
{{- '{' }}
|
||||||
{{- '"name": "' }}
|
{{- '"name": "' }}
|
||||||
{{- tool_call.name }}
|
{{- tool_call.name }}
|
||||||
{{- '"}' }}
|
{{- '"' }}
|
||||||
{{- ', ' }}
|
|
||||||
{%- if tool_call.arguments is defined %}
|
{%- if tool_call.arguments is defined %}
|
||||||
|
{{- ', ' }}
|
||||||
{{- '"arguments": ' }}
|
{{- '"arguments": ' }}
|
||||||
{{- tool_call.arguments|tojson }}
|
{{- tool_call.arguments|tojson }}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
|
{{- '}' }}
|
||||||
{{- '\n</tool_call>' }}
|
{{- '\n</tool_call>' }}
|
||||||
{%- endfor %}
|
{%- endfor %}
|
||||||
{{- '<|im_end|>\n' }}
|
{{- '<|im_end|>\n' }}
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ exclude = [
|
|||||||
|
|
||||||
[tool.codespell]
|
[tool.codespell]
|
||||||
ignore-words-list = "dout, te, indicies, subtile"
|
ignore-words-list = "dout, te, indicies, subtile"
|
||||||
skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
|
skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
|
||||||
|
|
||||||
[tool.isort]
|
[tool.isort]
|
||||||
use_parentheses = true
|
use_parentheses = true
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
# Dependencies for Ray accelerated DAG
|
|
||||||
cupy-cuda12x
|
|
||||||
ray >= 2.32
|
|
||||||
@@ -7,11 +7,11 @@ py-cpuinfo
|
|||||||
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
|
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
|
||||||
tokenizers >= 0.19.1 # Required for Llama 3.
|
tokenizers >= 0.19.1 # Required for Llama 3.
|
||||||
protobuf # Required by LlamaTokenizer.
|
protobuf # Required by LlamaTokenizer.
|
||||||
fastapi
|
fastapi >= 0.114.1
|
||||||
aiohttp
|
aiohttp
|
||||||
openai >= 1.0 # Ensure modern openai package (ensure types module present)
|
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
|
||||||
uvicorn[standard]
|
uvicorn[standard]
|
||||||
pydantic >= 2.8 # Required for OpenAI server.
|
pydantic >= 2.9 # Required for fastapi >= 0.113.0
|
||||||
pillow # Required for image processing
|
pillow # Required for image processing
|
||||||
prometheus_client >= 0.18.0
|
prometheus_client >= 0.18.0
|
||||||
prometheus-fastapi-instrumentator >= 7.0.0
|
prometheus-fastapi-instrumentator >= 7.0.0
|
||||||
@@ -25,5 +25,7 @@ pyzmq
|
|||||||
msgspec
|
msgspec
|
||||||
gguf == 0.9.1
|
gguf == 0.9.1
|
||||||
importlib_metadata
|
importlib_metadata
|
||||||
mistral_common >= 1.3.4
|
mistral_common >= 1.4.0
|
||||||
pyyaml
|
pyyaml
|
||||||
|
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||||
|
einops # Required for Qwen2-VL.
|
||||||
|
|||||||
@@ -1,6 +1,3 @@
|
|||||||
# Needed for Ray accelerated DAG tests
|
|
||||||
-r requirements-adag.txt
|
|
||||||
|
|
||||||
# testing
|
# testing
|
||||||
pytest
|
pytest
|
||||||
tensorizer>=2.9.0
|
tensorizer>=2.9.0
|
||||||
@@ -14,9 +11,10 @@ awscli
|
|||||||
einops # required for MPT, qwen-vl and Mamba
|
einops # required for MPT, qwen-vl and Mamba
|
||||||
httpx
|
httpx
|
||||||
librosa # required for audio test
|
librosa # required for audio test
|
||||||
|
opencv-python # required for video test
|
||||||
peft
|
peft
|
||||||
requests
|
requests
|
||||||
ray
|
ray[adag]>=2.35
|
||||||
sentence-transformers # required for embedding
|
sentence-transformers # required for embedding
|
||||||
soundfile # required for audio test
|
soundfile # required for audio test
|
||||||
compressed-tensors==0.4.0 # required for compressed-tensors
|
compressed-tensors==0.4.0 # required for compressed-tensors
|
||||||
|
|||||||
6
setup.py
6
setup.py
@@ -170,14 +170,17 @@ class cmake_build_ext(build_ext):
|
|||||||
|
|
||||||
if is_sccache_available():
|
if is_sccache_available():
|
||||||
cmake_args += [
|
cmake_args += [
|
||||||
|
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
|
||||||
]
|
]
|
||||||
elif is_ccache_available():
|
elif is_ccache_available():
|
||||||
cmake_args += [
|
cmake_args += [
|
||||||
|
'-DCMAKE_C_COMPILER_LAUNCHER=ccache',
|
||||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
|
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
|
||||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
|
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
|
||||||
|
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
|
||||||
]
|
]
|
||||||
|
|
||||||
# Pass the python executable to cmake so it can find an exact
|
# Pass the python executable to cmake so it can find an exact
|
||||||
@@ -502,6 +505,7 @@ setup(
|
|||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
extras_require={
|
extras_require={
|
||||||
"tensorizer": ["tensorizer>=2.9.0"],
|
"tensorizer": ["tensorizer>=2.9.0"],
|
||||||
|
"video": ["opencv-python"], # Required for video processing
|
||||||
"audio": ["librosa", "soundfile"] # Required for audio processing
|
"audio": ["librosa", "soundfile"] # Required for audio processing
|
||||||
},
|
},
|
||||||
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import os
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
|
||||||
worker_use_ray: bool):
|
|
||||||
script_path = Path(__file__).parent.joinpath(
|
script_path = Path(__file__).parent.joinpath(
|
||||||
"api_server_async_engine.py").absolute()
|
"api_server_async_engine.py").absolute()
|
||||||
commands = [
|
commands = [
|
||||||
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
|||||||
str(tokenizer_pool_size)
|
str(tokenizer_pool_size)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
|
|
||||||
# to prevent `--engine-use-ray` raises an exception due to it deprecation
|
|
||||||
env_vars = os.environ.copy()
|
|
||||||
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
|
||||||
|
|
||||||
if engine_use_ray:
|
|
||||||
commands.append("--engine-use-ray")
|
|
||||||
if worker_use_ray:
|
if worker_use_ray:
|
||||||
commands.append("--worker-use-ray")
|
commands.append("--worker-use-ray")
|
||||||
uvicorn_process = subprocess.Popen(commands, env=env_vars)
|
uvicorn_process = subprocess.Popen(commands)
|
||||||
yield
|
yield
|
||||||
uvicorn_process.terminate()
|
uvicorn_process.terminate()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||||
@pytest.mark.parametrize("worker_use_ray", [False, True])
|
@pytest.mark.parametrize("worker_use_ray", [False, True])
|
||||||
@pytest.mark.parametrize("engine_use_ray", [False, True])
|
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||||
def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
|
worker_use_ray: bool):
|
||||||
engine_use_ray: bool):
|
|
||||||
"""
|
"""
|
||||||
Run the API server and test it.
|
Run the API server and test it.
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
|
import uuid
|
||||||
from asyncio import CancelledError
|
from asyncio import CancelledError
|
||||||
|
from copy import copy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
@@ -12,6 +14,7 @@ from vllm import SamplingParams
|
|||||||
from vllm.config import ParallelConfig
|
from vllm.config import ParallelConfig
|
||||||
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
||||||
from vllm.outputs import RequestOutput as RealRequestOutput
|
from vllm.outputs import RequestOutput as RealRequestOutput
|
||||||
|
from vllm.sampling_params import RequestOutputKind
|
||||||
|
|
||||||
from ..conftest import cleanup
|
from ..conftest import cleanup
|
||||||
from ..utils import wait_for_gpu_memory_to_clear
|
from ..utils import wait_for_gpu_memory_to_clear
|
||||||
@@ -72,14 +75,12 @@ class MockEngine:
|
|||||||
|
|
||||||
|
|
||||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||||
|
_engine_class = MockEngine
|
||||||
def _init_engine(self, *args, **kwargs):
|
|
||||||
return MockEngine()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_new_requests_event():
|
async def test_new_requests_event():
|
||||||
engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
|
engine = MockAsyncLLMEngine(worker_use_ray=False)
|
||||||
engine.start_background_loop()
|
engine.start_background_loop()
|
||||||
await asyncio.sleep(0.01)
|
await asyncio.sleep(0.01)
|
||||||
assert engine.engine.step_calls == 0
|
assert engine.engine.step_calls == 0
|
||||||
@@ -112,16 +113,11 @@ async def test_new_requests_event():
|
|||||||
assert engine.engine.add_request_calls == 3
|
assert engine.engine.add_request_calls == 3
|
||||||
assert engine.engine.step_calls == old_step_calls + 1
|
assert engine.engine.step_calls == old_step_calls + 1
|
||||||
|
|
||||||
# Allow deprecated engine_use_ray to not raise exception
|
engine = MockAsyncLLMEngine(worker_use_ray=True)
|
||||||
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
|
||||||
|
|
||||||
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
|
|
||||||
assert engine.get_model_config() is not None
|
assert engine.get_model_config() is not None
|
||||||
assert engine.get_tokenizer() is not None
|
assert engine.get_tokenizer() is not None
|
||||||
assert engine.get_decoding_config() is not None
|
assert engine.get_decoding_config() is not None
|
||||||
|
|
||||||
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
|
|
||||||
|
|
||||||
|
|
||||||
def start_engine():
|
def start_engine():
|
||||||
wait_for_gpu_memory_to_clear(
|
wait_for_gpu_memory_to_clear(
|
||||||
@@ -130,8 +126,17 @@ def start_engine():
|
|||||||
timeout_s=60,
|
timeout_s=60,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
|
||||||
|
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
|
||||||
|
|
||||||
return AsyncLLMEngine.from_engine_args(
|
return AsyncLLMEngine.from_engine_args(
|
||||||
AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
|
AsyncEngineArgs(model="facebook/opt-125m",
|
||||||
|
enforce_eager=True,
|
||||||
|
num_scheduler_steps=num_scheduler_steps))
|
||||||
|
|
||||||
|
|
||||||
|
def uid() -> str:
|
||||||
|
return str(uuid.uuid4())
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture(scope="module")
|
@pytest_asyncio.fixture(scope="module")
|
||||||
@@ -156,57 +161,177 @@ def should_do_global_cleanup_after_test(request) -> bool:
|
|||||||
@pytest.mark.asyncio(scope="module")
|
@pytest.mark.asyncio(scope="module")
|
||||||
async def test_asyncio_run(async_engine):
|
async def test_asyncio_run(async_engine):
|
||||||
|
|
||||||
|
scheduler_config = await async_engine.get_scheduler_config()
|
||||||
|
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||||
|
|
||||||
async def run(prompt: str):
|
async def run(prompt: str):
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
temperature=0,
|
temperature=0,
|
||||||
max_tokens=32,
|
max_tokens=32,
|
||||||
|
min_tokens=32,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
output_count = 0
|
||||||
|
final_output = None
|
||||||
async for output in async_engine.generate(prompt,
|
async for output in async_engine.generate(prompt,
|
||||||
sampling_params,
|
sampling_params,
|
||||||
request_id=prompt):
|
request_id=uid()):
|
||||||
|
output_count += 1
|
||||||
final_output = output
|
final_output = output
|
||||||
return final_output
|
return final_output, output_count
|
||||||
|
|
||||||
results = await asyncio.gather(
|
results = await asyncio.gather(
|
||||||
run("test0"),
|
run("test0"),
|
||||||
run("test1"),
|
run("test0"),
|
||||||
)
|
)
|
||||||
assert len(results) == 2
|
assert len(results) == 2
|
||||||
|
first, second = results
|
||||||
|
|
||||||
|
# remove nondeterministic fields for comparison
|
||||||
|
first[0].metrics = None
|
||||||
|
second[0].metrics = None
|
||||||
|
first[0].request_id = None
|
||||||
|
second[0].request_id = None
|
||||||
|
|
||||||
|
assert str(first) == str(second)
|
||||||
|
|
||||||
|
output_count = results[0][1]
|
||||||
|
if num_scheduler_steps == 1:
|
||||||
|
assert output_count == 32
|
||||||
|
else:
|
||||||
|
assert 1 < output_count < 32
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio(scope="module")
|
||||||
|
async def test_output_kinds(async_engine):
|
||||||
|
"""Test that output_kind works as expected and that
|
||||||
|
results are equivalent across different kinds."""
|
||||||
|
|
||||||
|
scheduler_config = await async_engine.get_scheduler_config()
|
||||||
|
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
temperature=0,
|
||||||
|
max_tokens=32,
|
||||||
|
min_tokens=32,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run(prompt: str, kind: RequestOutputKind):
|
||||||
|
params = copy(sampling_params)
|
||||||
|
params.output_kind = kind
|
||||||
|
|
||||||
|
output_count = 0
|
||||||
|
final_output = None
|
||||||
|
async for output in async_engine.generate(prompt,
|
||||||
|
params,
|
||||||
|
request_id=uid()):
|
||||||
|
output_count += 1
|
||||||
|
final_output = output
|
||||||
|
|
||||||
|
assert final_output is not None
|
||||||
|
return (final_output.prompt_token_ids,
|
||||||
|
final_output.outputs[0].token_ids,
|
||||||
|
final_output.outputs[0].text, output_count)
|
||||||
|
|
||||||
|
async def run_deltas(prompt: str):
|
||||||
|
params = copy(sampling_params)
|
||||||
|
params.output_kind = RequestOutputKind.DELTA
|
||||||
|
|
||||||
|
prompt_tokens = None
|
||||||
|
output_tokens: List[int] = []
|
||||||
|
output_text = ""
|
||||||
|
output_count = 0
|
||||||
|
async for output in async_engine.generate(prompt,
|
||||||
|
params,
|
||||||
|
request_id=uid()):
|
||||||
|
token_ids = output.outputs[0].token_ids
|
||||||
|
text = output.outputs[0].text
|
||||||
|
|
||||||
|
# Ensure we get prompt ids iff we haven't yet received output tokens
|
||||||
|
if output_tokens:
|
||||||
|
assert 1 <= len(token_ids) <= num_scheduler_steps
|
||||||
|
assert text
|
||||||
|
assert not output.prompt_token_ids
|
||||||
|
else:
|
||||||
|
assert output.prompt_token_ids
|
||||||
|
prompt_tokens = output.prompt_token_ids
|
||||||
|
|
||||||
|
output_tokens.extend(token_ids)
|
||||||
|
output_text += text
|
||||||
|
|
||||||
|
output_count += 1
|
||||||
|
return prompt_tokens, output_tokens, output_text, output_count
|
||||||
|
|
||||||
|
results = await asyncio.gather(
|
||||||
|
run("common input prompt", RequestOutputKind.CUMULATIVE),
|
||||||
|
run("common input prompt", RequestOutputKind.FINAL_ONLY),
|
||||||
|
run_deltas("common input prompt"))
|
||||||
|
|
||||||
|
# Make sure outputs are the same
|
||||||
|
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
|
||||||
|
assert len(prompt_set) == 1
|
||||||
|
|
||||||
|
text_set = set(text for _, _, text, _ in results)
|
||||||
|
assert len(text_set) == 1
|
||||||
|
|
||||||
|
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
|
||||||
|
assert len(tokens_set) == 1
|
||||||
|
|
||||||
|
cumulative, final, deltas = results
|
||||||
|
|
||||||
|
# output message counts
|
||||||
|
assert cumulative[3] == deltas[3]
|
||||||
|
|
||||||
|
if num_scheduler_steps == 1:
|
||||||
|
assert cumulative[3] == 32
|
||||||
|
else:
|
||||||
|
assert 1 < cumulative[3] < 32
|
||||||
|
|
||||||
|
assert final[3] == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio(scope="module")
|
@pytest.mark.asyncio(scope="module")
|
||||||
async def test_cancellation(async_engine):
|
async def test_cancellation(async_engine):
|
||||||
|
scheduler_config = await async_engine.get_scheduler_config()
|
||||||
|
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
temperature=0,
|
temperature=0,
|
||||||
min_tokens=10,
|
min_tokens=13,
|
||||||
max_tokens=10,
|
max_tokens=13,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
stop_at = 5 if num_scheduler_steps == 1 else 1
|
||||||
|
|
||||||
|
request_id = uid()
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
with pytest.raises(CancelledError):
|
with pytest.raises(CancelledError):
|
||||||
async for output in async_engine.generate("test2",
|
async for output in async_engine.generate("test2",
|
||||||
sampling_params,
|
sampling_params,
|
||||||
request_id="test2"):
|
request_id=request_id):
|
||||||
assert not output.finished
|
assert not output.finished
|
||||||
i += 1
|
i += 1
|
||||||
if i == 5:
|
if i == stop_at:
|
||||||
await async_engine.abort("test2")
|
await async_engine.abort(request_id)
|
||||||
|
|
||||||
assert i == 5
|
assert i == stop_at
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio(scope="module")
|
@pytest.mark.asyncio(scope="module")
|
||||||
async def test_delayed_generator(async_engine):
|
async def test_delayed_generator(async_engine):
|
||||||
|
scheduler_config = await async_engine.get_scheduler_config()
|
||||||
|
|
||||||
|
if scheduler_config.num_scheduler_steps != 1:
|
||||||
|
pytest.skip("no need to test this one with multistep")
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
temperature=0,
|
temperature=0,
|
||||||
min_tokens=10,
|
min_tokens=10,
|
||||||
max_tokens=10,
|
max_tokens=10,
|
||||||
)
|
)
|
||||||
|
|
||||||
stream = async_engine.generate("test3",
|
stream = async_engine.generate("test3", sampling_params, request_id=uid())
|
||||||
sampling_params,
|
|
||||||
request_id="test3")
|
|
||||||
i = 0
|
i = 0
|
||||||
final_output: Optional[RealRequestOutput] = None
|
final_output: Optional[RealRequestOutput] = None
|
||||||
async for output in stream:
|
async for output in stream:
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
|
from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
|
||||||
|
load_chat_template)
|
||||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
@@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
|
|||||||
add_generation_prompt=add_generation_prompt)
|
add_generation_prompt=add_generation_prompt)
|
||||||
|
|
||||||
# Call the function and get the result
|
# Call the function and get the result
|
||||||
result = apply_chat_template(
|
result = apply_hf_chat_template(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
conversation=mock_request.messages,
|
conversation=mock_request.messages,
|
||||||
chat_template=mock_request.chat_template or template_content,
|
chat_template=mock_request.chat_template or template_content,
|
||||||
|
|||||||
@@ -19,16 +19,11 @@ def server():
|
|||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
"--enforce-eager",
|
"--enforce-eager",
|
||||||
"--engine-use-ray",
|
|
||||||
"--chat-template",
|
"--chat-template",
|
||||||
str(chatml_jinja_path),
|
str(chatml_jinja_path),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Allow `--engine-use-ray`, otherwise the launch of the server throw
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
# an error due to try to use a deprecated feature
|
|
||||||
env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args,
|
|
||||||
env_dict=env_dict) as remote_server:
|
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
@@ -3,12 +3,16 @@
|
|||||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
|
import re
|
||||||
import weakref
|
import weakref
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
from vllm.utils import is_hip
|
from vllm.utils import is_hip
|
||||||
|
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
|
||||||
|
|
||||||
from ..models.utils import check_outputs_equal
|
from ..models.utils import check_outputs_equal
|
||||||
|
|
||||||
@@ -64,3 +68,29 @@ def test_models(
|
|||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_with_failure(vllm_runner) -> None:
|
||||||
|
try:
|
||||||
|
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
|
||||||
|
side_effect=ValueError()):
|
||||||
|
with pytest.raises(ValueError) as exc_info:
|
||||||
|
vllm_runner("facebook/opt-125m",
|
||||||
|
dtype="half",
|
||||||
|
enforce_eager=False,
|
||||||
|
gpu_memory_utilization=0.7)
|
||||||
|
matches = re.search(r"input dumped to (.+).pkl",
|
||||||
|
str(exc_info.value))
|
||||||
|
assert matches is not None
|
||||||
|
filename = f"{matches.group(1)}.pkl"
|
||||||
|
|
||||||
|
with open(filename, "rb") as filep:
|
||||||
|
inputs = pickle.load(filep)
|
||||||
|
|
||||||
|
if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
|
||||||
|
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
|
||||||
|
f"{list(inputs.keys())}")
|
||||||
|
assert isinstance(inputs["arg_1"],
|
||||||
|
ModelInputForGPUWithSamplingMetadata)
|
||||||
|
finally:
|
||||||
|
os.remove(filename)
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ def test_chunked_prefill_recompute(
|
|||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
max_num_seqs=max_num_seqs,
|
max_num_seqs=max_num_seqs,
|
||||||
worker_use_ray=worker_use_ray,
|
worker_use_ray=worker_use_ray,
|
||||||
|
disable_log_stats=False,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
|
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||||
|
|||||||
@@ -16,5 +16,7 @@ def test_full_graph(model):
|
|||||||
"The future of AI is",
|
"The future of AI is",
|
||||||
]
|
]
|
||||||
sampling_params = SamplingParams(temperature=0)
|
sampling_params = SamplingParams(temperature=0)
|
||||||
llm = LLM(model="meta-llama/Meta-Llama-3-8B")
|
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
|
||||||
|
enforce_eager=True,
|
||||||
|
load_format="dummy")
|
||||||
llm.generate(prompts, sampling_params)
|
llm.generate(prompts, sampling_params)
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from typing import Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther
|
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||||
|
|
||||||
|
|
||||||
class MyMod(torch.nn.Module):
|
class MyMod(torch.nn.Module):
|
||||||
@@ -13,7 +13,7 @@ class MyMod(torch.nn.Module):
|
|||||||
return x * 2
|
return x * 2
|
||||||
|
|
||||||
|
|
||||||
class MyWrapper(TorchCompileWrapperWithCustomDispacther):
|
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||||
|
|
||||||
def __init__(self, model):
|
def __init__(self, model):
|
||||||
self.model = model
|
self.model = model
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import TokenizerPoolConfig
|
from vllm.config import TokenizerPoolConfig
|
||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
from vllm.distributed import (destroy_distributed_environment,
|
from vllm.distributed import (destroy_distributed_environment,
|
||||||
@@ -44,6 +45,7 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
|
|||||||
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
|
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
|
||||||
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
|
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
|
||||||
List[List[Tuple[np.ndarray, int]]]]
|
List[List[Tuple[np.ndarray, int]]]]
|
||||||
|
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
|
||||||
|
|
||||||
|
|
||||||
def _read_prompts(filename: str) -> List[str]:
|
def _read_prompts(filename: str) -> List[str]:
|
||||||
@@ -85,8 +87,35 @@ class _ImageAssets(_ImageAssetsBase):
|
|||||||
return [prompts["stop_sign"], prompts["cherry_blossom"]]
|
return [prompts["stop_sign"], prompts["cherry_blossom"]]
|
||||||
|
|
||||||
|
|
||||||
|
class _VideoAssetPrompts(TypedDict):
|
||||||
|
sample_demo_1: str
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info < (3, 9):
|
||||||
|
# UserList cannot be subscripted
|
||||||
|
class _VideoAssetsBase(UserList):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
|
||||||
|
class _VideoAssetsBase(UserList[VideoAsset]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class _VideoAssets(_VideoAssetsBase):
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__([
|
||||||
|
VideoAsset("sample_demo_1.mp4"),
|
||||||
|
])
|
||||||
|
|
||||||
|
def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
|
||||||
|
return [prompts["sample_demo_1"]]
|
||||||
|
|
||||||
|
|
||||||
IMAGE_ASSETS = _ImageAssets()
|
IMAGE_ASSETS = _ImageAssets()
|
||||||
"""Singleton instance of :class:`_ImageAssets`."""
|
"""Singleton instance of :class:`_ImageAssets`."""
|
||||||
|
VIDEO_ASSETS = _VideoAssets()
|
||||||
|
"""Singleton instance of :class:`_VideoAssets`."""
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -202,6 +231,11 @@ def image_assets() -> _ImageAssets:
|
|||||||
return IMAGE_ASSETS
|
return IMAGE_ASSETS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def video_assets() -> _VideoAssets:
|
||||||
|
return VIDEO_ASSETS
|
||||||
|
|
||||||
|
|
||||||
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
|
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
|
||||||
|
|
||||||
|
|
||||||
@@ -278,7 +312,8 @@ class HfRunner:
|
|||||||
def generate(
|
def generate(
|
||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[List[int]], List[str]]]:
|
) -> List[Tuple[List[List[int]], List[str]]]:
|
||||||
if images:
|
if images:
|
||||||
@@ -292,6 +327,8 @@ class HfRunner:
|
|||||||
}
|
}
|
||||||
if images is not None and images[i] is not None:
|
if images is not None and images[i] is not None:
|
||||||
processor_kwargs["images"] = images[i]
|
processor_kwargs["images"] = images[i]
|
||||||
|
if videos is not None and videos[i] is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
@@ -314,7 +351,7 @@ class HfRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[int], str]]:
|
) -> List[Tuple[List[int], str]]:
|
||||||
outputs = self.generate(prompts,
|
outputs = self.generate(prompts,
|
||||||
@@ -351,7 +388,8 @@ class HfRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[List[torch.Tensor]]:
|
) -> List[List[torch.Tensor]]:
|
||||||
all_logprobs: List[List[torch.Tensor]] = []
|
all_logprobs: List[List[torch.Tensor]] = []
|
||||||
@@ -362,6 +400,8 @@ class HfRunner:
|
|||||||
}
|
}
|
||||||
if images is not None and images[i] is not None:
|
if images is not None and images[i] is not None:
|
||||||
processor_kwargs["images"] = images[i]
|
processor_kwargs["images"] = images[i]
|
||||||
|
if videos is not None and videos[i] is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
@@ -433,8 +473,9 @@ class HfRunner:
|
|||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[List[Tuple[np.ndarray, int]]] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
|
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
|
||||||
all_logprobs: List[List[Dict[int, float]]] = []
|
all_logprobs: List[List[Dict[int, float]]] = []
|
||||||
@@ -454,6 +495,8 @@ class HfRunner:
|
|||||||
processor_kwargs["audio"] = audio
|
processor_kwargs["audio"] = audio
|
||||||
processor_kwargs["sampling_rate"] = sr
|
processor_kwargs["sampling_rate"] = sr
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
|
|
||||||
@@ -615,8 +658,8 @@ class VllmRunner:
|
|||||||
outputs.append((req_sample_output_ids, req_sample_output_strs))
|
outputs.append((req_sample_output_ids, req_sample_output_strs))
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _final_steps_generate_w_logprobs(
|
def _final_steps_generate_w_logprobs(
|
||||||
self,
|
|
||||||
req_outputs: List[RequestOutput],
|
req_outputs: List[RequestOutput],
|
||||||
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
||||||
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
|
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
|
||||||
@@ -634,12 +677,16 @@ class VllmRunner:
|
|||||||
sampling_params: SamplingParams,
|
sampling_params: SamplingParams,
|
||||||
images: Optional[PromptImageInput] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[PromptAudioInput] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[PromptVideoInput] = None,
|
||||||
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
||||||
assert sampling_params.logprobs is not None
|
assert sampling_params.logprobs is not None
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
assert len(prompts) == len(images)
|
assert len(prompts) == len(images)
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
assert len(prompts) == len(videos)
|
||||||
|
|
||||||
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
|
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
|
||||||
if images is not None:
|
if images is not None:
|
||||||
for i, image in enumerate(images):
|
for i, image in enumerate(images):
|
||||||
@@ -649,6 +696,11 @@ class VllmRunner:
|
|||||||
for i, audio in enumerate(audios):
|
for i, audio in enumerate(audios):
|
||||||
inputs[i]["multi_modal_data"] = {"audio": audio}
|
inputs[i]["multi_modal_data"] = {"audio": audio}
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
for i, video in enumerate(videos):
|
||||||
|
inputs[i]["multi_modal_data"] = {"video": video}
|
||||||
|
print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
|
||||||
|
|
||||||
req_outputs = self.model.generate(inputs,
|
req_outputs = self.model.generate(inputs,
|
||||||
sampling_params=sampling_params)
|
sampling_params=sampling_params)
|
||||||
return self._final_steps_generate_w_logprobs(req_outputs)
|
return self._final_steps_generate_w_logprobs(req_outputs)
|
||||||
@@ -671,7 +723,7 @@ class VllmRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
) -> List[Tuple[List[int], str]]:
|
) -> List[Tuple[List[int], str]]:
|
||||||
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
||||||
outputs = self.generate(prompts, greedy_params, images=images)
|
outputs = self.generate(prompts, greedy_params, images=images)
|
||||||
@@ -685,6 +737,7 @@ class VllmRunner:
|
|||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
images: Optional[PromptImageInput] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[PromptAudioInput] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[PromptVideoInput] = None,
|
||||||
stop_token_ids: Optional[List[int]] = None,
|
stop_token_ids: Optional[List[int]] = None,
|
||||||
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
||||||
greedy_logprobs_params = SamplingParams(temperature=0.0,
|
greedy_logprobs_params = SamplingParams(temperature=0.0,
|
||||||
@@ -694,7 +747,8 @@ class VllmRunner:
|
|||||||
outputs = self.generate_w_logprobs(prompts,
|
outputs = self.generate_w_logprobs(prompts,
|
||||||
greedy_logprobs_params,
|
greedy_logprobs_params,
|
||||||
images=images,
|
images=images,
|
||||||
audios=audios)
|
audios=audios,
|
||||||
|
videos=videos)
|
||||||
|
|
||||||
return [(output_ids, output_str, output_logprobs)
|
return [(output_ids, output_str, output_logprobs)
|
||||||
for output_ids, output_str, output_logprobs in outputs]
|
for output_ids, output_str, output_logprobs in outputs]
|
||||||
|
|||||||
@@ -35,9 +35,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
|
|||||||
if model.startswith("llava-hf/llava-1.5"):
|
if model.startswith("llava-hf/llava-1.5"):
|
||||||
from ..models.test_llava import models, run_test
|
from ..models.test_llava import models, run_test
|
||||||
elif model.startswith("llava-hf/llava-v1.6"):
|
elif model.startswith("llava-hf/llava-v1.6"):
|
||||||
from ..models.test_llava_next import models, run_test
|
from ..models.test_llava_next import run_test # type: ignore[no-redef]
|
||||||
|
from ..models.test_llava_next import models
|
||||||
elif model.startswith("facebook/chameleon"):
|
elif model.startswith("facebook/chameleon"):
|
||||||
from ..models.test_chameleon import models, run_test
|
from ..models.test_chameleon import run_test # type: ignore[no-redef]
|
||||||
|
from ..models.test_chameleon import models
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Unsupported model: {model}")
|
raise NotImplementedError(f"Unsupported model: {model}")
|
||||||
|
|
||||||
|
|||||||
@@ -18,23 +18,29 @@ logger = init_logger("test_pipeline_parallel")
|
|||||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
|
@pytest.mark.parametrize(
|
||||||
|
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
|
||||||
"MODEL_NAME, DIST_BACKEND"),
|
"MODEL_NAME, DIST_BACKEND"),
|
||||||
[
|
[
|
||||||
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
])
|
# TODO: Enable internVL2 in a separate test if needed
|
||||||
|
# (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
|
||||||
|
# (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
|
||||||
|
# (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
|
||||||
|
],
|
||||||
|
)
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
|
||||||
DIST_BACKEND):
|
TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
|
||||||
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
|
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
|
||||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||||
"multiprocessing distributed backend")
|
"multiprocessing distributed backend")
|
||||||
@@ -43,6 +49,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
"float16",
|
"float16",
|
||||||
|
"--max-model-len",
|
||||||
|
"8192",
|
||||||
"--pipeline-parallel-size",
|
"--pipeline-parallel-size",
|
||||||
str(PP_SIZE),
|
str(PP_SIZE),
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -59,7 +67,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
tp_args = [
|
tp_args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
"bfloat16",
|
"float16",
|
||||||
|
"--max-model-len",
|
||||||
|
"8192",
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
|
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
|
||||||
"--distributed-executor-backend",
|
"--distributed-executor-backend",
|
||||||
@@ -71,6 +81,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
if EAGER_MODE:
|
if EAGER_MODE:
|
||||||
pp_args.append("--enforce-eager")
|
pp_args.append("--enforce-eager")
|
||||||
tp_args.append("--enforce-eager")
|
tp_args.append("--enforce-eager")
|
||||||
|
if TRUST_REMOTE_CODE:
|
||||||
|
pp_args.append("--trust-remote-code")
|
||||||
|
tp_args.append("--trust-remote-code")
|
||||||
pp_env = None
|
pp_env = None
|
||||||
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
|
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
|
||||||
and CHUNKED_PREFILL):
|
and CHUNKED_PREFILL):
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ def test_local_workers() -> None:
|
|||||||
workers[3].process.kill()
|
workers[3].process.kill()
|
||||||
|
|
||||||
# Other workers should get shut down here
|
# Other workers should get shut down here
|
||||||
worker_monitor.join(2)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
|
|||||||
# Clean shutdown
|
# Clean shutdown
|
||||||
worker_monitor.close()
|
worker_monitor.close()
|
||||||
|
|
||||||
worker_monitor.join(5)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
|
|||||||
workers[3].process.kill()
|
workers[3].process.kill()
|
||||||
|
|
||||||
# Other workers should get shut down here
|
# Other workers should get shut down here
|
||||||
worker_monitor.join(2)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
|
|||||||
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
|
|||||||
# token ids.
|
# token ids.
|
||||||
llm = LLM(model=model, skip_tokenizer_init=True)
|
llm = LLM(model=model, skip_tokenizer_init=True)
|
||||||
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
|
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
|
||||||
with pytest.raises(ValueError) as err:
|
|
||||||
|
with pytest.raises(ValueError, match="cannot pass text prompts when"):
|
||||||
llm.generate("abc", sampling_params)
|
llm.generate("abc", sampling_params)
|
||||||
assert "prompts must be None if" in str(err.value)
|
|
||||||
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
|
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
|
||||||
sampling_params=sampling_params)
|
sampling_params=sampling_params)
|
||||||
assert len(outputs) > 0
|
assert len(outputs) > 0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ def zephyr_lora_files():
|
|||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
|
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
|
||||||
lora_request = [
|
lora_request = [
|
||||||
LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
|
LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
|
||||||
for idx in range(len(PROMPTS))
|
for idx in range(len(PROMPTS))
|
||||||
]
|
]
|
||||||
# Multiple SamplingParams should be matched with each prompt
|
# Multiple SamplingParams should be matched with each prompt
|
||||||
|
|||||||
0
tests/entrypoints/offline_mode/__init__.py
Normal file
0
tests/entrypoints/offline_mode/__init__.py
Normal file
77
tests/entrypoints/offline_mode/test_offline_mode.py
Normal file
77
tests/entrypoints/offline_mode/test_offline_mode.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
"""Tests for HF_HUB_OFFLINE mode"""
|
||||||
|
import importlib
|
||||||
|
import sys
|
||||||
|
import weakref
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm import LLM
|
||||||
|
|
||||||
|
from ...conftest import cleanup
|
||||||
|
|
||||||
|
MODEL_NAME = "facebook/opt-125m"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def llm():
|
||||||
|
# pytest caches the fixture so we use weakref.proxy to
|
||||||
|
# enable garbage collection
|
||||||
|
llm = LLM(model=MODEL_NAME,
|
||||||
|
max_num_batched_tokens=4096,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
gpu_memory_utilization=0.10,
|
||||||
|
enforce_eager=True)
|
||||||
|
|
||||||
|
with llm.deprecate_legacy_api():
|
||||||
|
yield weakref.proxy(llm)
|
||||||
|
|
||||||
|
del llm
|
||||||
|
|
||||||
|
cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip_global_cleanup
|
||||||
|
def test_offline_mode(llm: LLM, monkeypatch):
|
||||||
|
# we use the llm fixture to ensure the model files are in-cache
|
||||||
|
del llm
|
||||||
|
|
||||||
|
# Set HF to offline mode and ensure we can still construct an LLM
|
||||||
|
try:
|
||||||
|
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
|
||||||
|
# Need to re-import huggingface_hub and friends to setup offline mode
|
||||||
|
_re_import_modules()
|
||||||
|
# Cached model files should be used in offline mode
|
||||||
|
LLM(model=MODEL_NAME,
|
||||||
|
max_num_batched_tokens=4096,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
gpu_memory_utilization=0.10,
|
||||||
|
enforce_eager=True)
|
||||||
|
finally:
|
||||||
|
# Reset the environment after the test
|
||||||
|
# NB: Assuming tests are run in online mode
|
||||||
|
monkeypatch.delenv("HF_HUB_OFFLINE")
|
||||||
|
_re_import_modules()
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _re_import_modules():
|
||||||
|
hf_hub_module_names = [
|
||||||
|
k for k in sys.modules if k.startswith("huggingface_hub")
|
||||||
|
]
|
||||||
|
transformers_module_names = [
|
||||||
|
k for k in sys.modules if k.startswith("transformers")
|
||||||
|
and not k.startswith("transformers_modules")
|
||||||
|
]
|
||||||
|
|
||||||
|
reload_exception = None
|
||||||
|
for module_name in hf_hub_module_names + transformers_module_names:
|
||||||
|
try:
|
||||||
|
importlib.reload(sys.modules[module_name])
|
||||||
|
except Exception as e:
|
||||||
|
reload_exception = e
|
||||||
|
# Try to continue clean up so that other tests are less likely to
|
||||||
|
# be affected
|
||||||
|
|
||||||
|
# Error this test if reloading a module failed
|
||||||
|
if reload_exception is not None:
|
||||||
|
raise reload_exception
|
||||||
@@ -8,7 +8,9 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
|
|||||||
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
|
||||||
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
||||||
|
|
||||||
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
||||||
|
|||||||
107
tests/entrypoints/openai/test_serving_engine.py
Normal file
107
tests/entrypoints/openai/test_serving_engine.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
from http import HTTPStatus
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import ModelConfig
|
||||||
|
from vllm.engine.protocol import AsyncEngineClient
|
||||||
|
from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||||
|
LoadLoraAdapterRequest,
|
||||||
|
UnloadLoraAdapterRequest)
|
||||||
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
|
|
||||||
|
MODEL_NAME = "meta-llama/Llama-2-7b"
|
||||||
|
LORA_LOADING_SUCCESS_MESSAGE = (
|
||||||
|
"Success: LoRA adapter '{lora_name}' added successfully.")
|
||||||
|
LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||||
|
"Success: LoRA adapter '{lora_name}' removed successfully.")
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_serving_engine_init():
|
||||||
|
mock_engine_client = MagicMock(spec=AsyncEngineClient)
|
||||||
|
mock_model_config = MagicMock(spec=ModelConfig)
|
||||||
|
# Set the max_model_len attribute to avoid missing attribute
|
||||||
|
mock_model_config.max_model_len = 2048
|
||||||
|
|
||||||
|
serving_engine = OpenAIServing(mock_engine_client,
|
||||||
|
mock_model_config,
|
||||||
|
served_model_names=[MODEL_NAME],
|
||||||
|
lora_modules=None,
|
||||||
|
prompt_adapters=None,
|
||||||
|
request_logger=None)
|
||||||
|
return serving_engine
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_success():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter",
|
||||||
|
lora_path="/path/to/adapter2")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
assert serving_engine.lora_requests[0].lora_name == "adapter"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_missing_fields():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_duplicate():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
|
||||||
|
lora_name='adapter1')
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_success():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="adapter1")
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
|
||||||
|
lora_name='adapter1')
|
||||||
|
assert len(serving_engine.lora_requests) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_missing_fields():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_not_found():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
@@ -3,8 +3,10 @@ from typing import Type
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
|
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
|
||||||
NewGELU, SiluAndMul)
|
NewGELU, QuickGELU,
|
||||||
|
SiluAndMul)
|
||||||
|
|
||||||
from .allclose_default import get_default_atol, get_default_rtol
|
from .allclose_default import get_default_atol, get_default_rtol
|
||||||
|
|
||||||
@@ -39,18 +41,28 @@ def test_act_and_mul(
|
|||||||
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
|
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
|
||||||
if activation == "silu":
|
if activation == "silu":
|
||||||
layer = SiluAndMul()
|
layer = SiluAndMul()
|
||||||
|
fn = torch.ops._C.silu_and_mul
|
||||||
elif activation == "gelu":
|
elif activation == "gelu":
|
||||||
layer = GeluAndMul(approximate="none")
|
layer = GeluAndMul(approximate="none")
|
||||||
|
fn = torch.ops._C.gelu_and_mul
|
||||||
elif activation == "gelu_tanh":
|
elif activation == "gelu_tanh":
|
||||||
layer = GeluAndMul(approximate="tanh")
|
layer = GeluAndMul(approximate="tanh")
|
||||||
|
fn = torch.ops._C.gelu_tanh_and_mul
|
||||||
out = layer(x)
|
out = layer(x)
|
||||||
ref_out = layer.forward_native(x)
|
ref_out = layer.forward_native(x)
|
||||||
# The SiLU and GELU implementations are equivalent to the native PyTorch
|
# The SiLU and GELU implementations are equivalent to the native PyTorch
|
||||||
# implementations, so we can do exact comparison.
|
# implementations, so we can do exact comparison.
|
||||||
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
|
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
|
||||||
|
|
||||||
|
d = x.shape[-1] // 2
|
||||||
|
output_shape = (x.shape[:-1] + (d, ))
|
||||||
|
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
|
||||||
|
opcheck(fn, (out, x))
|
||||||
|
|
||||||
@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
|
|
||||||
|
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
|
||||||
|
(NewGELU, torch.ops._C.gelu_new),
|
||||||
|
(QuickGELU, torch.ops._C.gelu_quick)])
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("d", D)
|
@pytest.mark.parametrize("d", D)
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@@ -70,10 +82,14 @@ def test_activation(
|
|||||||
torch.cuda.manual_seed(seed)
|
torch.cuda.manual_seed(seed)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
x = torch.randn(num_tokens, d, dtype=dtype)
|
x = torch.randn(num_tokens, d, dtype=dtype)
|
||||||
layer = activation()
|
layer = activation[0]()
|
||||||
|
fn = activation[1]
|
||||||
out = layer(x)
|
out = layer(x)
|
||||||
ref_out = layer.forward_native(x)
|
ref_out = layer.forward_native(x)
|
||||||
torch.testing.assert_close(out,
|
torch.testing.assert_close(out,
|
||||||
ref_out,
|
ref_out,
|
||||||
atol=get_default_atol(out),
|
atol=get_default_atol(out),
|
||||||
rtol=get_default_rtol(out))
|
rtol=get_default_rtol(out))
|
||||||
|
|
||||||
|
out = torch.empty_like(x)
|
||||||
|
opcheck(fn, (out, x))
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import torch
|
|||||||
from xformers import ops as xops
|
from xformers import ops as xops
|
||||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils import get_max_shared_memory_bytes, is_hip
|
from vllm.utils import get_max_shared_memory_bytes, is_hip
|
||||||
|
|
||||||
@@ -198,6 +199,13 @@ def test_paged_attention(
|
|||||||
k_scale,
|
k_scale,
|
||||||
v_scale,
|
v_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.paged_attention_v1,
|
||||||
|
(output, query, key_cache, value_cache, num_kv_heads, scale,
|
||||||
|
block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
|
||||||
|
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
|
|
||||||
elif version == "v2":
|
elif version == "v2":
|
||||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||||
assert PARTITION_SIZE % block_size == 0
|
assert PARTITION_SIZE % block_size == 0
|
||||||
@@ -230,6 +238,14 @@ def test_paged_attention(
|
|||||||
k_scale,
|
k_scale,
|
||||||
v_scale,
|
v_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.paged_attention_v2,
|
||||||
|
(output, exp_sums, max_logits, tmp_output, query, key_cache,
|
||||||
|
value_cache, num_kv_heads, scale, block_tables, seq_lens,
|
||||||
|
block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
|
||||||
|
k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise AssertionError(f"Unknown version: {version}")
|
raise AssertionError(f"Unknown version: {version}")
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import List, Tuple
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
||||||
@@ -87,6 +88,11 @@ def test_copy_blocks(
|
|||||||
block_mapping_tensor = torch.tensor(block_mapping,
|
block_mapping_tensor = torch.tensor(block_mapping,
|
||||||
dtype=torch.int64,
|
dtype=torch.int64,
|
||||||
device=device).view(-1, 2)
|
device=device).view(-1, 2)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C_cache_ops.copy_blocks,
|
||||||
|
(key_caches, value_caches, block_mapping_tensor),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
||||||
|
|
||||||
# Run the reference implementation.
|
# Run the reference implementation.
|
||||||
@@ -162,6 +168,10 @@ def test_reshape_and_cache(
|
|||||||
k_scale = v_scale = 1.0
|
k_scale = v_scale = 1.0
|
||||||
|
|
||||||
# Call the reshape_and_cache kernel.
|
# Call the reshape_and_cache kernel.
|
||||||
|
opcheck(torch.ops._C_cache_ops.reshape_and_cache,
|
||||||
|
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||||
|
k_scale, v_scale),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
|
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
|
||||||
kv_cache_dtype, k_scale, v_scale)
|
kv_cache_dtype, k_scale, v_scale)
|
||||||
|
|
||||||
@@ -269,6 +279,10 @@ def test_reshape_and_cache_flash(
|
|||||||
k_scale = v_scale = 1.0
|
k_scale = v_scale = 1.0
|
||||||
|
|
||||||
# Call the reshape_and_cache kernel.
|
# Call the reshape_and_cache kernel.
|
||||||
|
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
|
||||||
|
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||||
|
k_scale, v_scale),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
||||||
slot_mapping, kv_cache_dtype, k_scale, v_scale)
|
slot_mapping, kv_cache_dtype, k_scale, v_scale)
|
||||||
|
|
||||||
@@ -366,6 +380,14 @@ def test_swap_blocks(
|
|||||||
src_value_caches_clone = src_value_caches[0].clone()
|
src_value_caches_clone = src_value_caches[0].clone()
|
||||||
|
|
||||||
# Call the swap_blocks kernel.
|
# Call the swap_blocks kernel.
|
||||||
|
do_opcheck = (head_size == HEAD_SIZES[0])
|
||||||
|
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||||
|
(src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
|
||||||
|
cond=do_opcheck)
|
||||||
|
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||||
|
(src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
|
||||||
|
cond=do_opcheck)
|
||||||
|
|
||||||
ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
|
ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
|
||||||
block_mapping_tensor)
|
block_mapping_tensor)
|
||||||
ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
|
ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from typing import Optional, Type
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@@ -108,6 +109,9 @@ def cutlass_int8_gemm_helper(m: int,
|
|||||||
|
|
||||||
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm,
|
||||||
|
(out, a, b, scale_a, scale_b, bias))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
|
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
|
||||||
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
|
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
|
||||||
@@ -341,6 +345,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
|
|||||||
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
|
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
|
||||||
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
|
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
|
||||||
|
|
||||||
|
if azp_per_token:
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||||
|
(out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
|
||||||
|
func_bias))
|
||||||
|
else:
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||||
|
(out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
|
||||||
|
func_bias))
|
||||||
|
|
||||||
|
|
||||||
# Test working with a subset of A and B
|
# Test working with a subset of A and B
|
||||||
def test_cutlass_subset():
|
def test_cutlass_subset():
|
||||||
|
|||||||
@@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
|||||||
head_size,
|
head_size,
|
||||||
block_size,
|
block_size,
|
||||||
"NONE",
|
"NONE",
|
||||||
data_type=dtype)
|
data_type=dtype,
|
||||||
|
q_data_type=dtype)
|
||||||
output = wrapper.forward(query,
|
output = wrapper.forward(query,
|
||||||
kv_cache_fp8,
|
kv_cache_fp8,
|
||||||
logits_soft_cap=soft_cap,
|
logits_soft_cap=soft_cap,
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
|
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm._custom_ops import scaled_int8_quant
|
from vllm._custom_ops import scaled_int8_quant
|
||||||
|
|
||||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||||
@@ -12,6 +13,16 @@ SEEDS = [0]
|
|||||||
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
|
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
|
||||||
|
|
||||||
|
|
||||||
|
def opcheck_int8_quant(output, input, scale=None):
|
||||||
|
if scale is not None:
|
||||||
|
opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale))
|
||||||
|
else:
|
||||||
|
scale = torch.empty((input.numel() // input.shape[-1], 1),
|
||||||
|
device=input.device,
|
||||||
|
dtype=torch.float32)
|
||||||
|
opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@@ -34,6 +45,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
|||||||
ops_out, ref_out, atol=1,
|
ops_out, ref_out, atol=1,
|
||||||
rtol=0.0) # big atol to account for rounding errors
|
rtol=0.0) # big atol to account for rounding errors
|
||||||
|
|
||||||
|
opcheck_int8_quant(ops_out, x)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||||
@@ -58,3 +71,5 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
|||||||
torch.testing.assert_close(
|
torch.testing.assert_close(
|
||||||
out1, out2, atol=1,
|
out1, out2, atol=1,
|
||||||
rtol=0.0) # big atol to account for rounding errors
|
rtol=0.0) # big atol to account for rounding errors
|
||||||
|
|
||||||
|
opcheck_int8_quant(out2, x, scale)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
|
||||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||||
@@ -52,3 +53,10 @@ def test_rms_norm(
|
|||||||
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
|
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
|
||||||
else:
|
else:
|
||||||
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
||||||
|
|
||||||
|
if residual is not None:
|
||||||
|
opcheck(torch.ops._C.fused_add_rms_norm,
|
||||||
|
(x, residual, layer.weight.data, layer.variance_epsilon))
|
||||||
|
else:
|
||||||
|
opcheck(torch.ops._C.rms_norm,
|
||||||
|
(out, x, layer.weight.data, layer.variance_epsilon))
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from typing import Optional, Tuple
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
pack_rows, quantize_weights)
|
pack_rows, quantize_weights)
|
||||||
@@ -76,6 +77,8 @@ def machete_quantize_and_pack(w: torch.Tensor,
|
|||||||
w_q = w_q.t().contiguous().t() # convert to col major
|
w_q = w_q.t().contiguous().t() # convert to col major
|
||||||
w_q_machete = ops.machete_prepack_B(w_q, wtype)
|
w_q_machete = ops.machete_prepack_B(w_q, wtype)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
|
||||||
|
|
||||||
return w_ref, w_q_machete, w_s, w_zp
|
return w_ref, w_q_machete, w_s, w_zp
|
||||||
|
|
||||||
|
|
||||||
@@ -146,6 +149,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype,
|
|||||||
schedule=schedule,
|
schedule=schedule,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.machete_gemm,
|
||||||
|
(a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
|
||||||
|
w_zp, w_s), group_size, None, None, None, schedule))
|
||||||
|
|
||||||
# Relax atol as our reduction dim becomes larger (more rounding error)
|
# Relax atol as our reduction dim becomes larger (more rounding error)
|
||||||
# Relax atol when we have zeropoints since the way machete applies
|
# Relax atol when we have zeropoints since the way machete applies
|
||||||
# zeropoints (after scales) causes noise around 0
|
# zeropoints (after scales) causes noise around 0
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||||
@@ -73,12 +74,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
act_order, mnk_factors):
|
act_order, mnk_factors):
|
||||||
m_factor, n_factor, k_factor = mnk_factors
|
m_factor, n_factor, k_factor = mnk_factors
|
||||||
|
|
||||||
size_m = m_factor
|
|
||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
|
|
||||||
# Filter act_order
|
# Filter act_order
|
||||||
if act_order:
|
if act_order:
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
@@ -112,6 +110,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||||
weight_perm)
|
weight_perm)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.gptq_marlin_repack,
|
||||||
|
(q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
|
||||||
|
|
||||||
# Run Marlin repack GPU kernel
|
# Run Marlin repack GPU kernel
|
||||||
marlin_q_w_2 = ops.gptq_marlin_repack(
|
marlin_q_w_2 = ops.gptq_marlin_repack(
|
||||||
q_w_gptq,
|
q_w_gptq,
|
||||||
@@ -137,12 +138,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
mnk_factors):
|
mnk_factors):
|
||||||
m_factor, n_factor, k_factor = mnk_factors
|
m_factor, n_factor, k_factor = mnk_factors
|
||||||
|
|
||||||
size_m = m_factor
|
|
||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
|
|
||||||
# Normalize group_size
|
# Normalize group_size
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
group_size = size_k
|
group_size = size_k
|
||||||
@@ -165,6 +163,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||||
weight_perm)
|
weight_perm)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.awq_marlin_repack,
|
||||||
|
(q_w_awq, size_k, size_n, quant_type.size_bits))
|
||||||
|
|
||||||
# Run Marlin repack GPU kernel
|
# Run Marlin repack GPU kernel
|
||||||
marlin_q_w_2 = ops.awq_marlin_repack(
|
marlin_q_w_2 = ops.awq_marlin_repack(
|
||||||
q_w_awq,
|
q_w_awq,
|
||||||
@@ -204,9 +205,6 @@ def test_gptq_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
if act_order:
|
if act_order:
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
return
|
return
|
||||||
@@ -224,6 +222,13 @@ def test_gptq_marlin_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
GPTQ_MARLIN_MAX_PARALLEL)
|
GPTQ_MARLIN_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(
|
||||||
|
torch.ops._C.gptq_marlin_gemm,
|
||||||
|
(a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
|
||||||
|
workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1],
|
||||||
|
a_input.shape[1], is_k_full, False, use_fp32_reduce),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||||
|
|
||||||
output = ops.gptq_marlin_gemm(
|
output = ops.gptq_marlin_gemm(
|
||||||
a_input,
|
a_input,
|
||||||
marlin_q_w,
|
marlin_q_w,
|
||||||
@@ -245,7 +250,6 @@ def test_gptq_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -265,9 +269,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -279,6 +280,12 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
|
|
||||||
output_ref = torch.matmul(a_input, w_24_ref)
|
output_ref = torch.matmul(a_input, w_24_ref)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.gptq_marlin_24_gemm,
|
||||||
|
(a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
|
||||||
|
workspace_24.scratch, quant_type, a_input.shape[0],
|
||||||
|
b_weight.shape[1], a_input.shape[1]),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||||
|
|
||||||
output = ops.gptq_marlin_24_gemm(
|
output = ops.gptq_marlin_24_gemm(
|
||||||
a_input,
|
a_input,
|
||||||
marlin_24_q_w_comp,
|
marlin_24_q_w_comp,
|
||||||
@@ -294,7 +301,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -321,9 +327,6 @@ def test_fp8_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k), dtype=dtype)
|
a_input = rand_data((size_m, size_k), dtype=dtype)
|
||||||
b_weight = rand_data((size_k, size_n), dtype=dtype)
|
b_weight = rand_data((size_k, size_n), dtype=dtype)
|
||||||
|
|
||||||
@@ -353,6 +356,10 @@ def test_fp8_marlin_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
GPTQ_MARLIN_MAX_PARALLEL)
|
GPTQ_MARLIN_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.fp8_marlin_gemm,
|
||||||
|
(a_input, marlin_qweight, marlin_scales, workspace.scratch,
|
||||||
|
num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
|
||||||
|
|
||||||
output = ops.fp8_marlin_gemm(
|
output = ops.fp8_marlin_gemm(
|
||||||
a=a_input,
|
a=a_input,
|
||||||
b_q_weight=marlin_qweight,
|
b_q_weight=marlin_qweight,
|
||||||
@@ -368,7 +375,6 @@ def test_fp8_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -396,9 +402,6 @@ def test_awq_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -434,7 +437,6 @@ def test_awq_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -460,9 +462,6 @@ def test_marlin_qqq_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -479,6 +478,11 @@ def test_marlin_qqq_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
|
||||||
MARLIN_QQQ_MAX_PARALLEL)
|
MARLIN_QQQ_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.marlin_qqq_gemm,
|
||||||
|
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
|
||||||
|
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
|
||||||
|
b_weight.shape[1], a_input.shape[1]))
|
||||||
|
|
||||||
output = ops.marlin_qqq_gemm(
|
output = ops.marlin_qqq_gemm(
|
||||||
q_a,
|
q_a,
|
||||||
marlin_qqq_q_w,
|
marlin_qqq_q_w,
|
||||||
@@ -495,6 +499,5 @@ def test_marlin_qqq_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
Run `pytest tests/kernels/test_moe.py`.
|
Run `pytest tests/kernels/test_moe.py`.
|
||||||
"""
|
"""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from transformers import MixtralConfig
|
from transformers import MixtralConfig
|
||||||
@@ -9,7 +11,13 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
|||||||
|
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
|
||||||
|
fused_marlin_moe, single_marlin_moe)
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||||
|
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||||
|
marlin_quantize)
|
||||||
from vllm.model_executor.models.mixtral import MixtralMoE
|
from vllm.model_executor.models.mixtral import MixtralMoE
|
||||||
|
from vllm.scalar_type import scalar_types
|
||||||
|
|
||||||
|
|
||||||
def torch_moe(a, w1, w2, score, topk):
|
def torch_moe(a, w1, w2, score, topk):
|
||||||
@@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk):
|
|||||||
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
|
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
|
||||||
|
|
||||||
|
|
||||||
|
def torch_moe_single(a, w, score, topk):
|
||||||
|
B, D = a.shape
|
||||||
|
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
|
||||||
|
out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
|
||||||
|
score = torch.softmax(score, dim=-1, dtype=torch.float32)
|
||||||
|
_, topk_ids = torch.topk(score, topk)
|
||||||
|
topk_ids = topk_ids.view(-1)
|
||||||
|
for i in range(w.shape[0]):
|
||||||
|
mask = topk_ids == i
|
||||||
|
if mask.sum():
|
||||||
|
out[mask] = a[mask] @ w[i].transpose(0, 1)
|
||||||
|
return (out.view(B, -1, w.shape[1])).sum(dim=1)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
|
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
|
||||||
@pytest.mark.parametrize("n", [2048, 256, 1024])
|
@pytest.mark.parametrize("n", [2048, 256, 1024])
|
||||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||||
@@ -43,11 +65,11 @@ def test_fused_moe(
|
|||||||
topk: int,
|
topk: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
|
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||||
w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
|
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||||
|
|
||||||
score = torch.randn((m, e), device='cuda', dtype=dtype)
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
|
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
|
||||||
torch_output = torch_moe(a, w1, w2, score, topk)
|
torch_output = torch_moe(a, w1, w2, score, topk)
|
||||||
torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
|
torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
|
||||||
@@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype):
|
|||||||
vllm_states,
|
vllm_states,
|
||||||
rtol=mixtral_moe_tol[dtype],
|
rtol=mixtral_moe_tol[dtype],
|
||||||
atol=mixtral_moe_tol[dtype])
|
atol=mixtral_moe_tol[dtype])
|
||||||
|
|
||||||
|
|
||||||
|
def stack_and_dev(tensors: List[torch.Tensor]):
|
||||||
|
dev = tensors[0].device
|
||||||
|
return torch.stack(tensors, dim=0).to(dev)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_max_diff(output, output_ref):
|
||||||
|
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
|
||||||
|
torch.abs(output_ref))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||||
|
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||||
|
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||||
|
@pytest.mark.parametrize("e", [4, 8, 64])
|
||||||
|
@pytest.mark.parametrize("topk", [2, 6])
|
||||||
|
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||||
|
@pytest.mark.parametrize("act_order", [True, False])
|
||||||
|
def test_fused_marlin_moe(
|
||||||
|
m: int,
|
||||||
|
n: int,
|
||||||
|
k: int,
|
||||||
|
e: int,
|
||||||
|
topk: int,
|
||||||
|
group_size: int,
|
||||||
|
act_order: bool,
|
||||||
|
):
|
||||||
|
torch.manual_seed(7)
|
||||||
|
|
||||||
|
if topk > e:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter act_order
|
||||||
|
if act_order:
|
||||||
|
if group_size == -1:
|
||||||
|
return
|
||||||
|
if group_size in (k, n):
|
||||||
|
return
|
||||||
|
|
||||||
|
quant_type = scalar_types.uint4b8
|
||||||
|
dtype = torch.float16
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||||
|
for i in range(w2.shape[0]):
|
||||||
|
w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
|
||||||
|
|
||||||
|
w_ref1_l = []
|
||||||
|
qweight1_l = []
|
||||||
|
scales1_l = []
|
||||||
|
g_idx1_l = []
|
||||||
|
sort_indices1_l = []
|
||||||
|
|
||||||
|
for i in range(w1.shape[0]):
|
||||||
|
test_perm = torch.randperm(k)
|
||||||
|
w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
|
||||||
|
w1[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||||
|
test_perm)
|
||||||
|
w_ref1_l.append(w_ref1)
|
||||||
|
qweight1_l.append(qweight1)
|
||||||
|
scales1_l.append(scales1)
|
||||||
|
g_idx1_l.append(g_idx1)
|
||||||
|
sort_indices1_l.append(sort_indices1)
|
||||||
|
|
||||||
|
w_ref1 = stack_and_dev(w_ref1_l)
|
||||||
|
qweight1 = stack_and_dev(qweight1_l).contiguous()
|
||||||
|
scales1 = stack_and_dev(scales1_l)
|
||||||
|
g_idx1 = stack_and_dev(g_idx1_l)
|
||||||
|
sort_indices1 = stack_and_dev(sort_indices1_l)
|
||||||
|
|
||||||
|
w_ref2_l = []
|
||||||
|
qweight2_l = []
|
||||||
|
scales2_l = []
|
||||||
|
g_idx2_l = []
|
||||||
|
sort_indices2_l = []
|
||||||
|
|
||||||
|
for i in range(w2.shape[0]):
|
||||||
|
test_perm = torch.randperm(n)
|
||||||
|
w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
|
||||||
|
w2[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||||
|
test_perm)
|
||||||
|
w_ref2_l.append(w_ref2)
|
||||||
|
qweight2_l.append(qweight2)
|
||||||
|
scales2_l.append(scales2)
|
||||||
|
g_idx2_l.append(g_idx2)
|
||||||
|
sort_indices2_l.append(sort_indices2)
|
||||||
|
|
||||||
|
w_ref2 = stack_and_dev(w_ref2_l)
|
||||||
|
qweight2 = stack_and_dev(qweight2_l).contiguous()
|
||||||
|
scales2 = stack_and_dev(scales2_l)
|
||||||
|
g_idx2 = stack_and_dev(g_idx2_l)
|
||||||
|
sort_indices2 = stack_and_dev(sort_indices2_l)
|
||||||
|
|
||||||
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
|
|
||||||
|
topk_weights, topk_ids = fused_topk(a, score, topk, False)
|
||||||
|
|
||||||
|
triton_output = fused_moe(
|
||||||
|
a,
|
||||||
|
w_ref1.transpose(1, 2).contiguous(),
|
||||||
|
w_ref2.transpose(1, 2).contiguous(),
|
||||||
|
score,
|
||||||
|
topk,
|
||||||
|
renormalize=False,
|
||||||
|
)
|
||||||
|
marlin_output = fused_marlin_moe(
|
||||||
|
a,
|
||||||
|
qweight1,
|
||||||
|
qweight2,
|
||||||
|
score,
|
||||||
|
g_idx1,
|
||||||
|
g_idx2,
|
||||||
|
sort_indices1,
|
||||||
|
sort_indices2,
|
||||||
|
topk_weights,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=scales1,
|
||||||
|
w2_scale=scales2,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert compute_max_diff(marlin_output, triton_output) < 4e-2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip("This test is here for the sake of debugging, "
|
||||||
|
"don't run it in automated tests.")
|
||||||
|
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||||
|
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||||
|
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||||
|
@pytest.mark.parametrize("e", [4, 8, 64])
|
||||||
|
@pytest.mark.parametrize("topk", [2, 6])
|
||||||
|
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||||
|
@pytest.mark.parametrize("act_order", [True, False])
|
||||||
|
def test_marlin_moe_mmm(
|
||||||
|
m: int,
|
||||||
|
n: int,
|
||||||
|
k: int,
|
||||||
|
e: int,
|
||||||
|
topk: int,
|
||||||
|
group_size: int,
|
||||||
|
act_order: bool,
|
||||||
|
):
|
||||||
|
if topk > e:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter act_order
|
||||||
|
if act_order:
|
||||||
|
if group_size == -1:
|
||||||
|
return
|
||||||
|
if group_size == k:
|
||||||
|
return
|
||||||
|
|
||||||
|
quant_type = scalar_types.uint4b8
|
||||||
|
dtype = torch.float16
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
|
||||||
|
|
||||||
|
w_ref_l = []
|
||||||
|
qweights_l = []
|
||||||
|
scales_l = []
|
||||||
|
g_idx_l = []
|
||||||
|
sort_indices_l = []
|
||||||
|
|
||||||
|
for i in range(w.shape[0]):
|
||||||
|
test_perm = torch.randperm(k)
|
||||||
|
w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
|
||||||
|
w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
|
||||||
|
w_ref_l.append(w_ref)
|
||||||
|
qweights_l.append(qweight)
|
||||||
|
scales_l.append(scales)
|
||||||
|
g_idx_l.append(g_idx)
|
||||||
|
sort_indices_l.append(sort_indices)
|
||||||
|
|
||||||
|
w_ref = stack_and_dev(w_ref_l)
|
||||||
|
qweight = stack_and_dev(qweights_l).contiguous()
|
||||||
|
scales = stack_and_dev(scales_l)
|
||||||
|
g_idx = stack_and_dev(g_idx_l)
|
||||||
|
sort_indices = stack_and_dev(sort_indices_l)
|
||||||
|
|
||||||
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
|
marlin_output = single_marlin_moe(a,
|
||||||
|
qweight,
|
||||||
|
scales,
|
||||||
|
score,
|
||||||
|
g_idx,
|
||||||
|
sort_indices,
|
||||||
|
topk,
|
||||||
|
renormalize=False)
|
||||||
|
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
|
||||||
|
|
||||||
|
assert compute_max_diff(marlin_output, torch_output) < 1e-2
|
||||||
|
|||||||
@@ -3,7 +3,8 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import random
|
import random
|
||||||
from numbers import Number
|
from numbers import Number
|
||||||
from typing import Any, List, NamedTuple, Optional, Tuple, Union
|
from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
|
||||||
|
Union)
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -13,6 +14,21 @@ from vllm.attention.backends.xformers import XFormersBackend
|
|||||||
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
|
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
|
||||||
make_tensor_with_pad)
|
make_tensor_with_pad)
|
||||||
|
|
||||||
|
# For now, disable "test_aot_dispatch_dynamic" since there are some
|
||||||
|
# bugs related to this test in PyTorch 2.4.
|
||||||
|
DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
|
||||||
|
"test_schema",
|
||||||
|
"test_autograd_registration",
|
||||||
|
"test_faketensor",
|
||||||
|
)
|
||||||
|
|
||||||
|
ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
|
||||||
|
"test_schema",
|
||||||
|
"test_autograd_registration",
|
||||||
|
"test_faketensor",
|
||||||
|
"test_aot_dispatch_dynamic",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class QKVInputs(NamedTuple):
|
class QKVInputs(NamedTuple):
|
||||||
'''
|
'''
|
||||||
@@ -926,3 +942,19 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
|
|||||||
ideal_output = test_params.packed_qkvo.ideal_output
|
ideal_output = test_params.packed_qkvo.ideal_output
|
||||||
torch.testing.assert_close(ideal_output,
|
torch.testing.assert_close(ideal_output,
|
||||||
output_under_test.view_as(ideal_output))
|
output_under_test.view_as(ideal_output))
|
||||||
|
|
||||||
|
|
||||||
|
def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
|
||||||
|
torch._library.custom_ops.CustomOpDef],
|
||||||
|
args: Tuple[Any, ...],
|
||||||
|
kwargs: Optional[Dict[str, Any]] = None,
|
||||||
|
*,
|
||||||
|
test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
|
||||||
|
raise_exception: bool = True,
|
||||||
|
cond: bool = True) -> Dict[str, str]:
|
||||||
|
return torch.library.opcheck(
|
||||||
|
op,
|
||||||
|
args,
|
||||||
|
kwargs,
|
||||||
|
test_utils=test_utils,
|
||||||
|
raise_exception=raise_exception) if cond else {}
|
||||||
|
|||||||
1
tests/models/fixtures/pixtral_chat.json
Normal file
1
tests/models/fixtures/pixtral_chat.json
Normal file
File diff suppressed because one or more lines are too long
1
tests/models/fixtures/pixtral_chat_engine.json
Normal file
1
tests/models/fixtures/pixtral_chat_engine.json
Normal file
File diff suppressed because one or more lines are too long
@@ -7,26 +7,6 @@ import pytest
|
|||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
|
||||||
# In this test we hardcode prompts and generations for the model so we don't
|
|
||||||
# need to require the AQLM package as a dependency
|
|
||||||
example_prompts = [
|
|
||||||
'vLLM is a high-throughput and memory-efficient inference and serving '
|
|
||||||
'engine for LLMs.\n',
|
|
||||||
'Briefly describe the major milestones in the development of artificial '
|
|
||||||
'intelligence from 1950 to 2020.\n',
|
|
||||||
'Compare and contrast artificial intelligence with human intelligence in '
|
|
||||||
'terms of processing information.\n',
|
|
||||||
'Describe the basic components of a neural network and how it can be '
|
|
||||||
'trained.\n',
|
|
||||||
'Write a short story about a robot that dreams for the first time.\n',
|
|
||||||
'Analyze the impact of the COVID-19 pandemic on global economic structures '
|
|
||||||
'and future business models.\n',
|
|
||||||
'Explain the cultural significance of the Mona Lisa painting, and how its '
|
|
||||||
'perception might vary in Western versus Eastern societies.\n',
|
|
||||||
"Translate the following English sentence into Japanese, French, and "
|
|
||||||
"Swahili: 'The early bird catches the worm.'\n"
|
|
||||||
]
|
|
||||||
|
|
||||||
# These ground truth generations were generated using `transformers==4.38.1
|
# These ground truth generations were generated using `transformers==4.38.1
|
||||||
# aqlm==1.1.0 torch==2.2.0`
|
# aqlm==1.1.0 torch==2.2.0`
|
||||||
# and the below code:
|
# and the below code:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import types
|
import types
|
||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -9,7 +9,8 @@ from transformers import AutoConfig
|
|||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.utils import is_cpu
|
from vllm.utils import is_cpu
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||||
|
_ImageAssets)
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|||||||
"cherry_blossom":
|
"cherry_blossom":
|
||||||
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
})
|
})
|
||||||
|
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
"OpenGVLab/InternVL2-1B",
|
"OpenGVLab/InternVL2-1B",
|
||||||
@@ -64,13 +66,13 @@ def generate(
|
|||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
image_assets: _ImageAssets,
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
model: str,
|
model: str,
|
||||||
*,
|
*,
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
dtype: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
|
mm_limit: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
distributed_executor_backend: Optional[str] = None,
|
distributed_executor_backend: Optional[str] = None,
|
||||||
):
|
):
|
||||||
@@ -83,12 +85,6 @@ def run_test(
|
|||||||
Note, the text input is also adjusted to abide by vllm contract.
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
The text output is sanitized to be able to compare with hf.
|
The text output is sanitized to be able to compare with hf.
|
||||||
"""
|
"""
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
@@ -110,13 +106,21 @@ def run_test(
|
|||||||
self.max_num = self.config.max_dynamic_patch
|
self.max_num = self.config.max_dynamic_patch
|
||||||
self.image_size = self.vision_config.image_size
|
self.image_size = self.vision_config.image_size
|
||||||
|
|
||||||
def __call__(self, text: str, images: Image, **kwargs):
|
def __call__(self, text: str, images: Union[Image, List[Image]],
|
||||||
|
**kwargs):
|
||||||
from vllm.model_executor.models.internvl import (
|
from vllm.model_executor.models.internvl import (
|
||||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||||
pixel_values = image_to_pixel_values(
|
images = [images] if isinstance(images, Image) else images
|
||||||
images, self.image_size, self.min_num, self.max_num,
|
pixel_values = [
|
||||||
|
image_to_pixel_values(image, self.image_size, self.min_num,
|
||||||
|
self.max_num,
|
||||||
self.use_thumbnail).to(self.dtype)
|
self.use_thumbnail).to(self.dtype)
|
||||||
num_patches_list = [pixel_values.shape[0]]
|
for image in images
|
||||||
|
]
|
||||||
|
num_patches_list = [
|
||||||
|
pixel_value.shape[0] for pixel_value in pixel_values
|
||||||
|
]
|
||||||
|
pixel_values = torch.cat(pixel_values, dim=0)
|
||||||
for num_patches in num_patches_list:
|
for num_patches in num_patches_list:
|
||||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||||
* num_patches
|
* num_patches
|
||||||
@@ -130,6 +134,7 @@ def run_test(
|
|||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
|
limit_mm_per_prompt={"image": mm_limit},
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True) as vllm_model:
|
||||||
@@ -138,7 +143,7 @@ def run_test(
|
|||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
@@ -156,7 +161,7 @@ def run_test(
|
|||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=hf_images,
|
images=hf_images,
|
||||||
eos_token_id=eos_token_id)
|
eos_token_id=eos_token_id)
|
||||||
for prompts, hf_images in inputs_per_image
|
for prompts, hf_images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
@@ -264,15 +269,99 @@ if is_cpu():
|
|||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
inputs_per_image = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_image_size(image, factor) for factor in size_factors],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
|
||||||
run_test(
|
run_test(
|
||||||
hf_runner,
|
hf_runner,
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
image_assets,
|
inputs_per_image,
|
||||||
model,
|
model,
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
|
mm_limit=1,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.5, 0.75, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Compare HF and vLLM outputs when every prompt carries all images.

    One test case is built: for each size factor, the multi-image prompt is
    paired with the full list of asset images rescaled by that factor.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    # One prompt per size factor; each prompt gets every image at that scale.
    prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    images_per_prompt = []
    for factor in size_factors:
        images_per_prompt.append(
            [rescale_image_size(pil_image, factor) for pil_image in pil_images])

    inputs_per_case = [(prompts, images_per_prompt)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
|
||||||
|
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
|
||||||
|
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
@torch.inference_mode()
|
||||||
|
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
|
||||||
|
size_factors, dtype: str, max_tokens: int,
|
||||||
|
num_logprobs: int) -> None:
|
||||||
|
images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
|
||||||
|
|
||||||
|
inputs_batching = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_image_size(image, factor) for factor in size_factors],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
|
||||||
|
inputs_multi_images = [
|
||||||
|
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||||
|
[[rescale_image_size(image, factor) for image in images]
|
||||||
|
for factor in size_factors])
|
||||||
|
]
|
||||||
|
for inputs in [inputs_batching, inputs_multi_images]:
|
||||||
|
run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
inputs,
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
mm_limit=2,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type, overload
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
||||||
@@ -8,11 +8,14 @@ from vllm.multimodal.utils import rescale_image_size
|
|||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||||
|
_ImageAssets)
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
"stop_sign":
|
"stop_sign":
|
||||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
||||||
@@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|||||||
return hf_output_ids, hf_output_str, out_logprobs
|
return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
@@ -64,6 +68,78 @@ def run_test(
|
|||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
distributed_executor_backend: Optional[str] = None,
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Build per-image inputs from the assets and delegate to `_run_test`.

    Exactly one sizing strategy is applied: `size_factors` (relative
    rescaling) takes precedence over `sizes` (absolute resizing).

    Raises:
        ValueError: If neither `size_factors` nor `sizes` is given.
    """
    images = [asset.pil_image for asset in image_assets]

    # Each entry pairs N copies of the prompt with N variants of its image.
    inputs_per_image = []
    if size_factors is not None:
        for image, prompt in zip(images, HF_IMAGE_PROMPTS):
            rescaled = [
                rescale_image_size(image, factor) for factor in size_factors
            ]
            inputs_per_image.append(([prompt] * len(size_factors), rescaled))
    elif sizes is not None:
        for image, prompt in zip(images, HF_IMAGE_PROMPTS):
            resized = [image.resize(size) for size in sizes]
            inputs_per_image.append(([prompt] * len(sizes), resized))
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    _run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Inference result should be the same between hf and vllm.
|
"""Inference result should be the same between hf and vllm.
|
||||||
|
|
||||||
@@ -85,13 +161,6 @@ def run_test(
|
|||||||
else:
|
else:
|
||||||
mantis_processor = None
|
mantis_processor = None
|
||||||
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
@@ -100,15 +169,18 @@ def run_test(
|
|||||||
# max_model_len should be greater than image_feature_size
|
# max_model_len should be greater than image_feature_size
|
||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
|
max_model_len=4096,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True,
|
||||||
|
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
||||||
|
}) as vllm_model:
|
||||||
vllm_outputs_per_image = [
|
vllm_outputs_per_image = [
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
if mantis_processor is not None:
|
if mantis_processor is not None:
|
||||||
@@ -131,7 +203,7 @@ def run_test(
|
|||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
@@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
                                      model, dtype, max_tokens,
                                      num_logprobs) -> None:
    """Compare HF and vLLM on a batch mixing 1-, 2- and 4-image prompts."""
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    two_image_prompt = "USER: <image><image>\nDescribe 2 images.\nASSISTANT:"
    prompts = [
        two_image_prompt,
        two_image_prompt,
        "USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:",  # noqa: E501
        "USER: <image>\nWhat is the season?\nASSISTANT:",
    ]

    # Entry i holds the image(s) consumed by prompts[i].
    images_per_prompt = [
        [stop_sign, cherry_blossom],
        # Images with different sizes and aspect-ratios
        [
            rescale_image_size(stop_sign, 0.1),
            stop_sign,
        ],
        [
            stop_sign,
            rescale_image_size(stop_sign, 0.25),
            cherry_blossom.resize((183, 488)),
            cherry_blossom.resize((488, 183)),
        ],
        cherry_blossom,
    ]

    inputs = [(prompts, images_per_prompt)]

    _run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
@pytest.mark.parametrize("model", models)
|
||||||
def test_context_length_too_short(vllm_runner, image_assets, model):
|
def test_context_length_too_short(vllm_runner, image_assets, model):
|
||||||
images = [asset.pil_image for asset in image_assets]
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|||||||
236
tests/models/test_llava_next_video.py
Normal file
236
tests/models/test_llava_next_video.py
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
from typing import List, Optional, Tuple, Type, overload
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import transformers
|
||||||
|
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||||
|
|
||||||
|
from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
||||||
|
sample_frames_from_video)
|
||||||
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
|
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||||
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
|
# Mark every test in this module as a vision-language-model test.
pytestmark = pytest.mark.vlm

# System preamble placed in front of the user turn of every video prompt.
_PREFACE = ("A chat between a curious human and an artificial intelligence "
            "assistant. The assistant gives helpful, detailed, and polite "
            "answers to the human's questions.")

# Prompt text keyed by video asset name.
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "sample_demo_1":
    _PREFACE + "USER: <video>\nWhy is this video funny? ASSISTANT:",
})

# Model identifiers the tests in this module are parametrized over.
models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
|
||||||
|
|
||||||
|
|
||||||
|
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Args:
        vllm_output: Tuple of (token ids, decoded text, logprobs) produced by
            the vLLM runner.
        model: Model name or path used to load the config and tokenizer.

    Returns:
        Tuple of (token ids with runs of the video token collapsed to a
        single token, text with the leading space stripped and the decoded
        EOS token appended when the ids end with EOS, the logprobs unchanged).
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a video token only when it does not directly follow another one.
    # The explicit `idx == 0` check prevents `output_ids[idx - 1]` from
    # wrapping around to the last element for the first token.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if (token_id != video_token_id or idx == 0
            or output_ids[idx - 1] != video_token_id)
    ]

    assert output_str.startswith(" "), (
        f"Expected vLLM output to start with a space, got {output_str!r}")
    hf_output_str = output_str[1:]
    # Guard against an empty id list before looking at the final token.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HF produce close logprobs on video inputs.

    `num_frames` frames are sampled from each video asset. Each sampled video
    is then expanded into several variants, either rescaled by every entry of
    `size_factors` or resized to every entry of `sizes`; `size_factors` takes
    precedence when both are given. vLLM generates first, then HF, and the
    two result sets are compared with `check_logprobs_close`.

    Raises:
        ValueError: If neither `size_factors` nor `sizes` is given.
    """
    videos = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Each entry pairs N copies of the prompt with N variants of its video.
    if size_factors is not None:
        inputs_per_video = [(
            [prompt for _ in size_factors],
            [rescale_video_size(video, factor) for factor in size_factors],
        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
    elif sizes is not None:
        inputs_per_video = [(
            [prompt for _ in sizes],
            [resize_video(video, size) for size in sizes],
        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    # max_model_len should be greater than the video feature size
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_video = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                videos=video_batch)
            for prompts, video_batch in inputs_per_video
        ]

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_video = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    videos=video_batch)
            for prompts, video_batch in inputs_per_video
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
                                        vllm_outputs_per_video):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" misorders versions such as "4.5.0" or "4.100.0".
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No video
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
                dtype, max_tokens, num_logprobs, num_frames) -> None:
    """Inference result should be the same between hf and vllm.

    All the video fixtures for the test are under tests/videos.
    For huggingface runner, we provide the np.ndarray as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    run_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" misorders versions such as "4.5.0" or "4.100.0".
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
                            dtype, max_tokens, num_logprobs,
                            num_frames) -> None:
    """Compare HF and vLLM outputs on videos resized to fixed sizes.

    Each video is resized to every entry of `sizes`, which mixes large and
    small frames in both aspect-ratio orientations.
    """
    run_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
|
||||||
@@ -41,3 +41,43 @@ def test_models(
|
|||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Loading a model in HF format and in Mistral format should agree.

    The same model is run twice through vLLM, once with the HF
    tokenizer/weights/config settings and once with the Mistral ones, and
    the greedy logprobs of both runs are compared.
    """

    def greedy_logprobs(**format_kwargs):
        # Run one full generation pass with the given format settings.
        with vllm_runner(model, dtype=dtype, **format_kwargs) as llm:
            return llm.generate_greedy_logprobs(example_prompts, max_tokens,
                                                num_logprobs)

    hf_format_outputs = greedy_logprobs(
        tokenizer_mode="auto",
        load_format="safetensors",
        config_format="hf",
    )
    mistral_format_outputs = greedy_logprobs(
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
    )

    check_logprobs_close(
        outputs_0_lst=hf_format_outputs,
        outputs_1_lst=mistral_format_outputs,
        name_0="hf",
        name_1="mistral",
    )
|
||||||
|
|||||||
79
tests/models/test_modelopt.py
Normal file
79
tests/models/test_modelopt.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||||
|
Note: these tests will only pass on H100
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Context length passed to the engine for every model under test.
MAX_MODEL_LEN = 1024

MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]

# Golden generations for nvidia/Llama-3.1-8B-Instruct-FP8, one per example
# prompt, in prompt order.
_LLAMA_3_1_8B_FP8_EXPECTED_STRS = [
    "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
    'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
    'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
    'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
    '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
    'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
    'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
    'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる',
]

# Expected generations keyed by model name.
EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": _LLAMA_3_1_8B_FP8_EXPECTED_STRS,
}
|
||||||
|
|
||||||
|
|
||||||
|
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Greedy generations must exactly match the stored golden strings."""
    model = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Wrap every raw prompt as a single user turn of the chat template.
    formatted_prompts = []
    for prompt in example_prompts:
        conversation = [{"role": "user", "content": prompt}]
        formatted_prompts.append(
            tokenizer.apply_chat_template(conversation,
                                          tokenize=False,
                                          add_generation_prompt=True))

    params = SamplingParams(max_tokens=20, temperature=0)
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations: List[str] = [
        model.generate(formatted_prompt, params)[0].outputs[0].text
        for formatted_prompt in formatted_prompts
    ]
    del model

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i, generated_str in enumerate(generations):
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||||
@@ -1,16 +1,15 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import List, Optional, Tuple, Type, Union
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from PIL import Image
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import is_cpu, is_hip
|
from vllm.utils import is_cpu, is_hip
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
@@ -60,8 +59,7 @@ if is_hip():
|
|||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
inputs: List[Tuple[List[str], Union[List[Image.Image],
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
List[List[Image.Image]]]]],
|
|
||||||
model: str,
|
model: str,
|
||||||
*,
|
*,
|
||||||
dtype: str,
|
dtype: str,
|
||||||
|
|||||||
190
tests/models/test_pixtral.py
Normal file
190
tests/models/test_pixtral.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||||
|
|
||||||
|
Run `pytest tests/models/test_pixtral.py`.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from dataclasses import asdict
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from mistral_common.protocol.instruct.messages import ImageURLChunk
|
||||||
|
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||||
|
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||||
|
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||||
|
|
||||||
|
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
|
||||||
|
from vllm.multimodal import MultiModalDataBuiltins
|
||||||
|
from vllm.sequence import Logprob, SampleLogprobs
|
||||||
|
|
||||||
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||||
|
IMG_URLS = [
|
||||||
|
"https://picsum.photos/id/237/400/300",
|
||||||
|
"https://picsum.photos/id/231/200/300",
|
||||||
|
"https://picsum.photos/id/27/500/500",
|
||||||
|
"https://picsum.photos/id/17/150/600",
|
||||||
|
]
|
||||||
|
PROMPT = "Describe each image in one short sentence."
|
||||||
|
|
||||||
|
|
||||||
|
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
    """Build a one-message chat: the shared PROMPT text plus one image per URL.

    Args:
        urls: Image URLs; each becomes an ``image_url`` content chunk, in
            order, after the leading text chunk.

    Returns:
        A list holding a single ``user`` message whose ``content`` is the
        list of chunks.
    """
    content: List[Dict[str, Any]] = [{"type": "text", "text": PROMPT}]
    for url in urls:
        content.append({"type": "image_url", "image_url": {"url": url}})
    return [{"role": "user", "content": content}]
|
||||||
|
|
||||||
|
|
||||||
|
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
    """Build a token-level engine prompt with the images behind *urls*.

    The chat message from ``_create_msg_format`` is encoded with the
    "pixtral" Mistral tokenizer; every ``ImageURLChunk`` of the request is
    converted with ``image_from_chunk`` and attached as multi-modal data.

    Args:
        urls: Image URLs to include in the prompt.

    Returns:
        A TokensPrompt with ``prompt_token_ids`` and ``multi_modal_data`` set.
    """
    messages = _create_msg_format(urls)

    mistral_tokenizer = MistralTokenizer.from_model("pixtral")

    req = ChatCompletionRequest(messages=messages)  # type: ignore[type-var]
    encoded = mistral_tokenizer.encode_chat_completion(req)

    prompt = TokensPrompt(prompt_token_ids=encoded.tokens)

    # Only the image chunks of the (single) user message carry image data.
    prompt["multi_modal_data"] = MultiModalDataBuiltins(image=[
        image_from_chunk(chunk) for chunk in req.messages[0].content
        if isinstance(chunk, ImageURLChunk)
    ])

    return prompt
|
||||||
|
|
||||||
|
|
||||||
|
MSGS = [
|
||||||
|
_create_msg_format(IMG_URLS[:1]),
|
||||||
|
_create_msg_format(IMG_URLS[:2]),
|
||||||
|
_create_msg_format(IMG_URLS),
|
||||||
|
]
|
||||||
|
ENGINE_INPUTS = [
|
||||||
|
_create_engine_inputs(IMG_URLS[:1]),
|
||||||
|
_create_engine_inputs(IMG_URLS[:2]),
|
||||||
|
_create_engine_inputs(IMG_URLS),
|
||||||
|
]
|
||||||
|
|
||||||
|
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||||
|
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||||
|
|
||||||
|
MAX_MODEL_LEN = [8192, 65536]
|
||||||
|
FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json"
|
||||||
|
FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json"
|
||||||
|
|
||||||
|
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
|
||||||
|
|
||||||
|
|
||||||
|
# For the test author to store golden output in JSON
|
||||||
|
def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
    """Write ``(tokens, text, logprobs)`` triples to *filename* as JSON.

    Args:
        outputs: Triples of token ids, generated text and optional per-step
            logprobs; each logprob entry is converted with ``asdict``.
        filename: Path of the JSON file to create or overwrite.
    """
    records = []
    for token_ids, text, logprobs in outputs:
        steps = []
        # Missing logprobs (None) are stored as an empty list.
        for step_logprobs in logprobs or []:
            steps.append({
                token_id: asdict(entry)
                for token_id, entry in step_logprobs.items()
            })
        records.append((token_ids, text, steps))

    with open(filename, "w") as f:
        json.dump(records, f)
|
||||||
|
|
||||||
|
|
||||||
|
def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs:
    """Load golden ``(tokens, text, logprobs)`` triples from a JSON fixture.

    Args:
        filename: Path of a JSON file laid out as a list of
            ``[tokens, text, logprobs]`` entries, where ``logprobs`` is a
            list of ``{token_id: {Logprob fields}}`` objects.

    Returns:
        The triples with token-id keys converted back to ``int`` and each
        value rebuilt as a ``Logprob``.
    """
    with open(filename, "rb") as f:
        json_data = json.load(f)

    # JSON object keys are always strings, hence the int() conversion.
    # ``logprobs or []`` mirrors _dump_outputs_w_logprobs, which stores
    # missing logprobs as an empty list, and also tolerates a JSON null
    # instead of failing on iteration over None.
    return [(tokens, text,
             [{int(k): Logprob(**v)
               for k, v in token_logprobs.items()}
              for token_logprobs in (logprobs or [])])
            for tokens, text, logprobs in json_data]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(
    reason=
    "Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
    vllm_runner,
    max_model_len: int,
    model: str,
    dtype: str,
) -> None:
    """Chat with every message in MSGS and compare logprobs to the fixture.

    The reference values are loaded from FIXTURE_LOGPROBS_CHAT and compared
    with ``check_logprobs_close``.
    """
    expected_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            enable_chunked_prefill=False,
            max_model_len=max_model_len,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        # Flatten the per-message results into one list of request outputs.
        chat_outputs = [
            request_output for msg in MSGS
            for request_output in vllm_model.model.chat(
                msg, sampling_params=SAMPLING_PARAMS)
        ]

    actual_logprobs = vllm_runner._final_steps_generate_w_logprobs(
        chat_outputs)
    check_logprobs_close(outputs_0_lst=expected_logprobs,
                         outputs_1_lst=actual_logprobs,
                         name_0="h100_ref",
                         name_1="output")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(
    reason=
    "Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
    """Step an LLMEngine through three requests and compare logprobs.

    Two requests are queued before stepping and a third is added after the
    second step; finished outputs are compared with the values loaded from
    FIXTURE_LOGPROBS_ENGINE.
    """
    expected_logprobs = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model=model,
            tokenizer_mode="mistral",
            enable_chunked_prefill=False,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
            dtype=dtype,
        ))

    for engine_input in ENGINE_INPUTS[:2]:
        engine.add_request(uuid.uuid4().hex, engine_input, SAMPLING_PARAMS)

    finished_outputs = []
    num_steps = 0
    while True:
        step_outputs = engine.step()
        num_steps += 1
        finished_outputs.extend(request_output
                                for request_output in step_outputs
                                if request_output.finished)

        # The last request joins while the first two are already in flight.
        if num_steps == 2:
            engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
                               SAMPLING_PARAMS)
        if not engine.has_unfinished_requests():
            break

    actual_logprobs = vllm_runner._final_steps_generate_w_logprobs(
        finished_outputs)
    check_logprobs_close(outputs_0_lst=expected_logprobs,
                         outputs_1_lst=actual_logprobs,
                         name_0="h100_ref",
                         name_1="output")
|
||||||
@@ -1,44 +1,280 @@
|
|||||||
from typing import Type
|
import pathlib
|
||||||
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import torch
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
from ..conftest import HfRunner, VllmRunner
|
from vllm.config import ModelConfig
|
||||||
|
from vllm.inputs import InputContext, LLMInputs
|
||||||
|
from vllm.multimodal.base import MultiModalInputs
|
||||||
|
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
|
||||||
|
|
||||||
|
from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
||||||
|
VllmRunner, _ImageAssets)
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
models = ["qwen/qwen-vl"]
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
text_only_models = [
|
||||||
|
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||||
|
]
|
||||||
|
|
||||||
|
multimodal_models = ["Qwen/Qwen-VL"]
|
||||||
|
|
||||||
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign":
|
||||||
|
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||||
|
"cherry_blossom":
|
||||||
|
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||||
|
})
|
||||||
|
|
||||||
|
# Prompt with two empty "<img></img>" placeholders, one per image, used by
# the multi-image test. (An earlier assignment of this name was immediately
# overwritten and therefore dead; only the effective value is kept.)
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n"  # noqa: E501
|
||||||
|
### Multimodal preprocessing tests
|
||||||
|
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||||
|
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||||
|
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||||
|
# easy to read.
|
||||||
|
IMG_START_ID = 151857
|
||||||
|
IMG_END_ID = 151858
|
||||||
|
IMG_PAD_ID = 151859
|
||||||
|
TOKS_PER_IMG = 256
|
||||||
|
VIS_ENC_DIM = 4096
|
||||||
|
IMG_SIZE = 448
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
def build_model_context(model_name: str,
|
||||||
@pytest.mark.parametrize("max_tokens", [32])
|
tokenizer_name: Optional[str] = None,
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
trust_remote_code: bool = False):
|
||||||
@pytest.mark.parametrize("model", models)
|
"""Creates an InputContext for a given model.
|
||||||
def test_text_only_qwen_model(
|
|
||||||
|
Args:
|
||||||
|
model_name: Name of the model being considered.
|
||||||
|
tokenizer_name: Name of the tokenizer being considered.
|
||||||
|
trust_remote_code: Whether or not to allow loading remote code.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
InputContext for the model being considered.
|
||||||
|
"""
|
||||||
|
if tokenizer_name is None:
|
||||||
|
tokenizer_name = model_name
|
||||||
|
model_config = ModelConfig(
|
||||||
|
model_name,
|
||||||
|
tokenizer_name,
|
||||||
|
tokenizer_mode="auto",
|
||||||
|
trust_remote_code=trust_remote_code,
|
||||||
|
dtype="float32",
|
||||||
|
seed=0,
|
||||||
|
)
|
||||||
|
return InputContext(model_config)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def input_mapper_for_qwen():
    """Provide the Qwen multimodal input mapper function to tests."""
    # Imported inside the fixture (not at module level) to avoid
    # initializing CUDA during test collection.
    from vllm.model_executor.models import qwen as qwen_module
    return qwen_module.input_mapper_for_qwen
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def input_processor_for_qwen():
    """Provide the Qwen multimodal input processor function to tests."""
    # Imported inside the fixture (not at module level) to avoid
    # initializing CUDA during test collection.
    from vllm.model_executor.models import qwen as qwen_module
    return qwen_module.input_processor_for_qwen
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def qwen_vl_context() -> InputContext:
    """Provide an InputContext built for the "Qwen/Qwen-VL" model."""
    # Remote code is allowed when building the model context for Qwen-VL.
    context = build_model_context(
        model_name="Qwen/Qwen-VL",
        trust_remote_code=True,
    )
    return context
|
||||||
|
|
||||||
|
|
||||||
|
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||||
|
# input processor and mapper, respectively
|
||||||
|
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
                                       qwen_vl_context: InputContext,
                                       num_images: int):
    """Happy cases for image inputs to Qwen's multimodal input processor.

    Builds a prompt with ``num_images`` empty image placeholders plus random
    image embeddings, runs the input processor and checks the number of
    image start / end / pad token ids in the processed prompt.
    """
    prompt = "".join(f"Picture {num}: <img></img>\n"
                     for num in range(1, num_images + 1))
    # Use the shared VIS_ENC_DIM constant rather than repeating its literal
    # value, so the embedding size stays in sync with the other tests.
    image_embeds = torch.rand(num_images, TOKS_PER_IMG, VIS_ENC_DIM)
    inputs = LLMInputs(
        prompt=prompt,
        # When processing multimodal data for a multimodal model, the qwen
        # input processor will overwrite the provided prompt_token_ids with
        # the image prompts
        prompt_token_ids=None,
        multi_modal_data={"image": image_embeds},
    )
    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
    assert isinstance(proc_inputs, dict)

    # Each image should have one start / stop and a fixed context of 256
    proc_tokens = proc_inputs["prompt_token_ids"]
    assert proc_tokens.count(IMG_START_ID) == num_images
    assert proc_tokens.count(IMG_END_ID) == num_images
    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "img_data,expected_shape",
    [
        # single / multi-image
        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
        ([SAMPLE_IMAGE, SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
        # single / multi-image embeddings
        (
            torch.rand(TOKS_PER_IMG, VIS_ENC_DIM),
            (1, TOKS_PER_IMG, VIS_ENC_DIM),
        ),
        (
            torch.rand(1, TOKS_PER_IMG, VIS_ENC_DIM),
            (1, TOKS_PER_IMG, VIS_ENC_DIM),
        ),
        (
            torch.rand(2, TOKS_PER_IMG, VIS_ENC_DIM),
            (2, TOKS_PER_IMG, VIS_ENC_DIM),
        ),
    ])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
                                    qwen_vl_context: InputContext,
                                    img_data: Union[torch.Tensor, List[Image],
                                                    Image],
                                    expected_shape: List[int]):
    """Happy cases for image inputs to Qwen's multimodal input mapper.

    The mapper must return MultiModalInputs whose ``pixel_values`` entry has
    the expected shape, both for images and for image embeddings.
    """
    mapped = input_mapper_for_qwen(qwen_vl_context, img_data)
    assert isinstance(mapped, MultiModalInputs)
    assert "pixel_values" in mapped
    pixel_values = mapped["pixel_values"]
    assert pixel_values.shape == expected_shape
|
||||||
|
|
||||||
|
|
||||||
|
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||||
|
@pytest.mark.parametrize("mm_data", [
    {"image": torch.rand(5)},
    {"image": torch.rand(5, 5, 5, 5, 5)},
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
                                         qwen_vl_context: InputContext,
                                         mm_data: Dict[str, torch.Tensor]):
    """Test sad cases validated in Qwen's multimodal input processor.

    Image tensors with too few (1) or too many (5) dimensions must be
    rejected with a ValueError.
    """
    tokenizer_name = qwen_vl_context.model_config.tokenizer
    tokenizer = cached_get_tokenizer(tokenizer_name, trust_remote_code=True)
    prompt = "Picture 1: <img></img>\n"
    inputs = LLMInputs(
        prompt=prompt,
        prompt_token_ids=tokenizer.encode(prompt),
        multi_modal_data=mm_data,
    )
    # Should fail since we have too many or too few dimensions for embeddings
    with pytest.raises(ValueError):
        input_processor_for_qwen(qwen_vl_context, inputs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "img_data",
    [
        # Wrong context length
        torch.rand(1, TOKS_PER_IMG + 10, VIS_ENC_DIM),
        # Wrong visual encoder output size
        torch.rand(1, TOKS_PER_IMG, VIS_ENC_DIM + 10),
    ])
def test_input_mapper_invalid_mm_data(
    input_mapper_for_qwen,
    qwen_vl_context: InputContext,
    img_data: Union[torch.Tensor, List[Image], Image],
):
    """Sad cases validated in Qwen VL's multimodal input mapper.

    Embeddings whose token count or feature size is off must raise a
    ValueError.
    """
    expect_failure = pytest.raises(ValueError)
    with expect_failure:
        input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||||
|
|
||||||
|
|
||||||
|
### End-to-end generation tests
|
||||||
|
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
                         assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
    """Export image assets into a temp dir and point the prompt at them.

    Each asset is saved as ``<asset.name>.jpg`` under ``tmp_path`` and the
    empty ``<img></img>`` placeholders of the prompt are filled, in order,
    with the saved paths, so that the HF version of Qwen-VL can resolve the
    paths and load the images in its forward() call.

    Args:
        tmp_path: Tempdir for test under consideration.
        prompt: Prompt with image placeholders.
        assets: List of image assets whose len equals the num placeholders.

    Returns:
        The prompt with every placeholder replaced by ``<img>PATH</img>``.
    """
    placeholder = "<img></img>"
    # A mismatch between placeholders and assets means the calling test is
    # probably written incorrectly.
    assert prompt.count(placeholder) == len(assets)

    filled_prompt = prompt
    for asset in assets:
        saved_path = tmp_path / f"{asset.name}.jpg"
        asset.pil_image.save(saved_path)
        # A count of 1 consumes only the first remaining empty placeholder.
        filled_prompt = filled_prompt.replace(placeholder,
                                              f"<img>{saved_path}</img>", 1)
    return filled_prompt
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
example_prompts,
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
model: str,
|
model: str,
|
||||||
*,
|
*,
|
||||||
dtype: str,
|
dtype: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
|
mm_limit: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
):
|
):
|
||||||
# This test checks language inputs only, since the visual component
|
"""Inference result should be the same between hf and vllm.
|
||||||
# for qwen-vl is still unsupported in VLLM. In the near-future, the
|
|
||||||
# implementation and this test will be extended to consider
|
|
||||||
# visual inputs as well.
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
|
||||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
|
||||||
example_prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
)
|
|
||||||
|
|
||||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
All the image fixtures for the test are under tests/images.
|
||||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
For huggingface runner, we provide the PIL images as input.
|
||||||
example_prompts,
|
For vllm runner, we provide MultiModalDataDict objects
|
||||||
|
and corresponding MultiModalConfig as input.
|
||||||
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
|
The text output is sanitized to be able to compare with hf.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
|
# will hurt multiprocessing backend with fork method (the default method).
|
||||||
|
|
||||||
|
# max_model_len should be greater than image_feature_size
|
||||||
|
# Qwen encodes each image into a fixed content size of 256
|
||||||
|
with vllm_runner(model,
|
||||||
|
max_model_len=1024,
|
||||||
|
max_num_seqs=1,
|
||||||
|
dtype=dtype,
|
||||||
|
limit_mm_per_prompt={"image": mm_limit},
|
||||||
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
enforce_eager=True) as vllm_model:
|
||||||
|
vllm_outputs_per_image = [
|
||||||
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
)
|
images=images)
|
||||||
|
for prompts, images in inputs
|
||||||
|
]
|
||||||
|
|
||||||
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
|
hf_outputs_per_image = [
|
||||||
|
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
images=images)
|
||||||
|
for prompts, images in inputs
|
||||||
|
]
|
||||||
|
|
||||||
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
|
vllm_outputs_per_image):
|
||||||
|
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
@@ -46,3 +282,122 @@ def test_text_only_qwen_model(
|
|||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                        hf_runner: Type[HfRunner],
                                        vllm_runner: Type[VllmRunner],
                                        image_assets: _ImageAssets, model: str,
                                        size_factors: List[float], dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Tests multimodal models with single image prompts.

    For every image asset, its prompt is repeated once per size factor and
    paired with the image rescaled by that factor; HF and vLLM outputs are
    then compared through ``run_test``.
    """
    images = [asset.pil_image for asset in image_assets]

    # Export each asset to tmp_path and fill its prompt's placeholder.
    prompts = [
        get_prompt_with_path(tmp_path, prompt, [asset])
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    num_scales = len(size_factors)
    inputs = []
    for image, prompt in zip(images, prompts):
        rescaled = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs.append(([prompt] * num_scales, rescaled))

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
                                       hf_runner: Type[HfRunner],
                                       vllm_runner: Type[VllmRunner],
                                       image_assets: _ImageAssets, model: str,
                                       size_factors: List[float], dtype: str,
                                       max_tokens: int,
                                       num_logprobs: int) -> None:
    """Tests multimodal models with multi-image prompts.

    All image assets go into one prompt; that prompt is repeated once per
    size factor, each time paired with every image rescaled by the factor.
    """
    images = [asset.pil_image for asset in image_assets]
    # Put all of the images into one prompt.
    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
                                  image_assets)

    repeated_prompts = [prompt] * len(size_factors)
    rescaled_per_factor = []
    for factor in size_factors:
        rescaled_per_factor.append(
            [rescale_image_size(image, factor) for image in images])
    inputs = [(repeated_prompts, rescaled_per_factor)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Ensure that a text-only Qwen model can still be loaded and
|
||||||
|
# used for inference in VLLM without throwing.
|
||||||
|
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    vllm_runner: Type[VllmRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Smoke test: a text-only Qwen model loads and generates in vLLM.

    The generated outputs are not inspected; the test passes as long as
    loading and greedy generation do not raise.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.generate_greedy_logprobs(example_prompts,
                                            max_tokens,
                                            num_logprobs=num_logprobs)
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user