Compare commits
60 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3fd2b0d21c | ||
|
|
d394787e52 | ||
|
|
775f00f81e | ||
|
|
8baa454937 | ||
|
|
73202dbe77 | ||
|
|
7015417fd4 | ||
|
|
aea02f30de | ||
|
|
0b952af458 | ||
|
|
3b7fea770f | ||
|
|
cea95dfb94 | ||
|
|
6a512a00df | ||
|
|
efcf946a15 | ||
|
|
1230263e16 | ||
|
|
e497b8aeff | ||
|
|
94144e726c | ||
|
|
1d5e397aa4 | ||
|
|
22f3a4bc6c | ||
|
|
b1f3e18958 | ||
|
|
04e7c4e771 | ||
|
|
5faedf1b62 | ||
|
|
02751a7a42 | ||
|
|
f421f3cefb | ||
|
|
8c054b7a62 | ||
|
|
6234385f4a | ||
|
|
da1a844e61 | ||
|
|
a1d874224d | ||
|
|
6cd5e5b07e | ||
|
|
c7cb5c3335 | ||
|
|
f9b4a2d415 | ||
|
|
58fcc8545a | ||
|
|
08287ef675 | ||
|
|
4ef41b8476 | ||
|
|
cfe712bf1a | ||
|
|
b962ee1470 | ||
|
|
36bf8150cc | ||
|
|
e807125936 | ||
|
|
9f68e00d27 | ||
|
|
ce2702a923 | ||
|
|
795b662cff | ||
|
|
2f707fcb35 | ||
|
|
41e95c5247 | ||
|
|
12dd715807 | ||
|
|
29f49cd6e3 | ||
|
|
23f322297f | ||
|
|
9db52eab3d | ||
|
|
1447c97e75 | ||
|
|
de80783b69 | ||
|
|
e5cab71531 | ||
|
|
baa5467547 | ||
|
|
db3bf7c991 | ||
|
|
2febcf2777 | ||
|
|
2ee45281a5 | ||
|
|
9da25a88aa | ||
|
|
8685ba1a1e | ||
|
|
288a938872 | ||
|
|
e39ebf5cf5 | ||
|
|
ba262c4e5a | ||
|
|
4624d98dbd | ||
|
|
1afc931987 | ||
|
|
e01c2beb7d |
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
|
|||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
commands=$@
|
commands=$@
|
||||||
|
echo "Commands:$commands"
|
||||||
|
#ignore certain kernels tests
|
||||||
|
if [[ $commands == *" kernels "* ]]; then
|
||||||
|
commands="${commands} \
|
||||||
|
--ignore=kernels/test_attention.py \
|
||||||
|
--ignore=kernels/test_attention_selector.py \
|
||||||
|
--ignore=kernels/test_blocksparse_attention.py \
|
||||||
|
--ignore=kernels/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/test_cutlass.py \
|
||||||
|
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||||
|
--ignore=kernels/test_flash_attn.py \
|
||||||
|
--ignore=kernels/test_flashinfer.py \
|
||||||
|
--ignore=kernels/test_int8_quant.py \
|
||||||
|
--ignore=kernels/test_machete_gemm.py \
|
||||||
|
--ignore=kernels/test_mamba_ssm.py \
|
||||||
|
--ignore=kernels/test_marlin_gemm.py \
|
||||||
|
--ignore=kernels/test_moe.py \
|
||||||
|
--ignore=kernels/test_prefix_prefill.py \
|
||||||
|
--ignore=kernels/test_rand.py \
|
||||||
|
--ignore=kernels/test_sampler.py"
|
||||||
|
fi
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
PARALLEL_JOB_COUNT=8
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||||
#replace shard arguments
|
#replace shard arguments
|
||||||
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
|
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||||
|
echo "Shard ${GPU} commands:$commands"
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/kfd --device /dev/dri \
|
--device /dev/kfd --device /dev/dri \
|
||||||
--network host \
|
--network host \
|
||||||
|
|||||||
33
.buildkite/run-cpu-test-ppc64le.sh
Executable file
33
.buildkite/run-cpu-test-ppc64le.sh
Executable file
@@ -0,0 +1,33 @@
|
|||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f cpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
|
source /etc/environment
|
||||||
|
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||||
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
|
||||||
|
|
||||||
|
# Run basic model test
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
pip install pytest matplotlib einops transformers_stream_generator
|
||||||
|
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
|
# online inference
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||||
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model facebook/opt-125m \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--tokenizer facebook/opt-125m"
|
||||||
@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
|
|||||||
--ignore=tests/models/test_jamba.py \
|
--ignore=tests/models/test_jamba.py \
|
||||||
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
||||||
|
|
||||||
|
# Run compressed-tensor test
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
pytest -s -v \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
|
||||||
|
|
||||||
# online inference
|
# online inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
export VLLM_CPU_KVCACHE_SPACE=10
|
export VLLM_CPU_KVCACHE_SPACE=10
|
||||||
|
|||||||
@@ -158,6 +158,7 @@ steps:
|
|||||||
- python3 offline_inference_with_prefix.py
|
- python3 offline_inference_with_prefix.py
|
||||||
- python3 llm_engine_example.py
|
- python3 llm_engine_example.py
|
||||||
- python3 offline_inference_vision_language.py
|
- python3 offline_inference_vision_language.py
|
||||||
|
- python3 offline_inference_vision_language_multi_image.py
|
||||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference_encoder_decoder.py
|
- python3 offline_inference_encoder_decoder.py
|
||||||
|
|
||||||
@@ -216,7 +217,8 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# See https://github.com/vllm-project/vllm/issues/5152
|
# See https://github.com/vllm-project/vllm/issues/5152
|
||||||
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||||
- pytest -v -s spec_decode
|
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 30min each
|
- label: LoRA Test %N # 30min each
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -227,6 +229,7 @@ steps:
|
|||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: Kernels Test %N # 30min each
|
- label: Kernels Test %N # 30min each
|
||||||
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
@@ -368,6 +371,7 @@ steps:
|
|||||||
- label: LoRA Long Context (Distributed) # 11min
|
- label: LoRA Long Context (Distributed) # 11min
|
||||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora/test_long_context
|
- tests/lora/test_long_context
|
||||||
@@ -384,7 +388,18 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/weight_loading
|
- tests/weight_loading
|
||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
|
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 2
|
||||||
|
gpu: a100
|
||||||
|
optional: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/weight_loading
|
||||||
|
commands:
|
||||||
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|||||||
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
|
|||||||
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
|
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
<h3>Adding or changing kernels</h3>
|
||||||
|
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
|
||||||
|
<ul>
|
||||||
|
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
|
||||||
|
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
|
||||||
|
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
|
||||||
|
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
|
||||||
|
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
|
||||||
|
</ul>
|
||||||
|
|
||||||
<h3>Notes for Large Changes</h3>
|
<h3>Notes for Large Changes</h3>
|
||||||
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
|
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
|
||||||
|
|
||||||
|
|||||||
@@ -181,7 +181,6 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
@@ -196,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# CUTLASS 3.5.1
|
GIT_TAG v3.5.1
|
||||||
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
|
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
|
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
|
||||||
|
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
|
||||||
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(cutlass)
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
@@ -232,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"-gencode arch=compute_90a,code=sm_90a")
|
"-gencode arch=compute_90a,code=sm_90a")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
|
|
||||||
@@ -290,6 +294,12 @@ define_gpu_extension_target(
|
|||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
|
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
|
||||||
|
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
|
||||||
|
# driver API. This causes problems when linking with earlier versions of CUDA.
|
||||||
|
# Setting this variable sidesteps the issue by calling the driver directly.
|
||||||
|
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||||
|
|
||||||
#
|
#
|
||||||
# _moe_C extension
|
# _moe_C extension
|
||||||
#
|
#
|
||||||
|
|||||||
128
CODE_OF_CONDUCT.md
Normal file
128
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
|
||||||
|
# vLLM Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socioeconomic status,
|
||||||
|
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||||
|
identity and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the overall
|
||||||
|
community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or advances of
|
||||||
|
any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email address,
|
||||||
|
without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official email address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline/IRL event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement in the #code-of-conduct
|
||||||
|
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series of
|
||||||
|
actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or permanent
|
||||||
|
ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within the
|
||||||
|
community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
|
||||||
|
version 2.1, available at
|
||||||
|
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by
|
||||||
|
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the
|
||||||
|
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
|
||||||
|
|
||||||
11
Dockerfile
11
Dockerfile
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
|
|||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.12
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
# Install Python and other dependencies
|
# Install Python and other dependencies
|
||||||
@@ -37,7 +37,6 @@ WORKDIR /workspace
|
|||||||
|
|
||||||
# install build and runtime dependencies
|
# install build and runtime dependencies
|
||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-adag.txt requirements-adag.txt
|
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install -r requirements-cuda.txt
|
python3 -m pip install -r requirements-cuda.txt
|
||||||
@@ -66,7 +65,6 @@ COPY setup.py setup.py
|
|||||||
COPY cmake cmake
|
COPY cmake cmake
|
||||||
COPY CMakeLists.txt CMakeLists.txt
|
COPY CMakeLists.txt CMakeLists.txt
|
||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-adag.txt requirements-adag.txt
|
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
COPY pyproject.toml pyproject.toml
|
COPY pyproject.toml pyproject.toml
|
||||||
COPY vllm vllm
|
COPY vllm vllm
|
||||||
@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.4.1
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.12
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
|||||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
|
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
&& apt-get update -y \
|
&& apt-get update -y \
|
||||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||||
@@ -181,6 +180,10 @@ FROM vllm-base AS test
|
|||||||
ADD . /vllm-workspace/
|
ADD . /vllm-workspace/
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
|
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
|
||||||
|
# This installation must complete before the test dependencies are collected and installed.
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install "setuptools>=74.1.1"
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install -r requirements-dev.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
|||||||
@@ -2,9 +2,14 @@
|
|||||||
|
|
||||||
FROM ubuntu:22.04 AS cpu-test-1
|
FROM ubuntu:22.04 AS cpu-test-1
|
||||||
|
|
||||||
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
|
|
||||||
|
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/var/cache/apt \
|
RUN --mount=type=cache,target=/var/cache/apt \
|
||||||
apt-get update -y \
|
apt-get update -y \
|
||||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
pip install --upgrade pip && \
|
pip install --upgrade pip && \
|
||||||
pip install -r requirements-build.txt
|
pip install -r requirements-build.txt
|
||||||
|
|
||||||
|
# install oneDNN
|
||||||
|
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
|
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
|
||||||
|
-DONEDNN_BUILD_DOC=OFF \
|
||||||
|
-DONEDNN_BUILD_EXAMPLES=OFF \
|
||||||
|
-DONEDNN_BUILD_TESTS=OFF \
|
||||||
|
-DONEDNN_BUILD_GRAPH=OFF \
|
||||||
|
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
|
||||||
|
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
|
||||||
|
cmake --build ./oneDNN/build --target install --config Release
|
||||||
|
|
||||||
FROM cpu-test-1 AS build
|
FROM cpu-test-1 AS build
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
@@ -40,7 +58,6 @@ COPY ./ ./
|
|||||||
ARG VLLM_CPU_DISABLE_AVX512
|
ARG VLLM_CPU_DISABLE_AVX512
|
||||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||||
|
|||||||
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
|
|||||||
RUN echo "Base image is $BASE_IMAGE"
|
RUN echo "Base image is $BASE_IMAGE"
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update \
|
||||||
|
&& apt-get install python3 python3-pip -y \
|
||||||
|
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
### Mount Point ###
|
### Mount Point ###
|
||||||
# When launching the container, mount the code directory to /app
|
# When launching the container, mount the code directory to /app
|
||||||
|
|||||||
@@ -4,7 +4,8 @@
|
|||||||
FROM ubuntu:22.04 AS dev
|
FROM ubuntu:22.04 AS dev
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
RUN apt-get update -y && \
|
||||||
apt-get install -y python3-pip git
|
apt-get install -y python3-pip git && \
|
||||||
|
apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
# copy requirements
|
# copy requirements
|
||||||
|
|||||||
@@ -2,21 +2,26 @@ FROM mambaorg/micromamba
|
|||||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
||||||
|
|
||||||
|
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
# Some packages in requirements-cpu are installed here
|
# Some packages in requirements-cpu are installed here
|
||||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||||
# Currently these may not be available for venv or pip directly
|
# Currently these may not be available for venv or pip directly
|
||||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
|
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
# These packages will be in rocketce eventually
|
# These packages will be in rocketce eventually
|
||||||
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /workspace/
|
||||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
|
|||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
# Install some basic utilities
|
||||||
|
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
# Install the TPU and Pallas dependencies.
|
# Install the TPU and Pallas dependencies.
|
||||||
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||||
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
|
|||||||
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
|
|||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
|
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
include LICENSE
|
include LICENSE
|
||||||
include requirements-adag.txt
|
|
||||||
include requirements-common.txt
|
include requirements-common.txt
|
||||||
include requirements-cuda.txt
|
include requirements-cuda.txt
|
||||||
include requirements-rocm.txt
|
include requirements-rocm.txt
|
||||||
|
|||||||
16
README.md
16
README.md
@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
|
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
|
||||||
|
|
||||||
We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
|
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
|
||||||
Join us to hear the vLLM's recent update about performance.
|
Join us to learn more about recent advancements of vLLM on MI300X.
|
||||||
Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
|
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||||
@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
year={2023}
|
year={2023}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Contact Us
|
||||||
|
|
||||||
|
* For technical questions and feature requests, please use Github issues or discussions.
|
||||||
|
* For discussing with fellow users, please use Discord.
|
||||||
|
* For security disclosures, please use Github's security advisory feature.
|
||||||
|
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||||
@@ -24,6 +24,7 @@ class RequestFuncInput:
|
|||||||
model: str
|
model: str
|
||||||
best_of: int = 1
|
best_of: int = 1
|
||||||
use_beam_search: bool = False
|
use_beam_search: bool = False
|
||||||
|
logprobs: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
|
|||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"best_of": request_func_input.best_of,
|
"best_of": request_func_input.best_of,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
}
|
}
|
||||||
headers = {
|
headers = {
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import torch
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
|
||||||
from vllm.inputs import PromptInputs
|
from vllm.inputs import PromptInputs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@@ -205,13 +205,11 @@ if __name__ == '__main__':
|
|||||||
default=None,
|
default=None,
|
||||||
help=('path to save the pytorch profiler output. Can be visualized '
|
help=('path to save the pytorch profiler output. Can be visualized '
|
||||||
'with ui.perfetto.dev or Tensorboard.'))
|
'with ui.perfetto.dev or Tensorboard.'))
|
||||||
parser.add_argument(
|
parser.add_argument("--device",
|
||||||
"--device",
|
type=str,
|
||||||
type=str,
|
default="auto",
|
||||||
default="auto",
|
choices=DEVICE_OPTIONS,
|
||||||
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
help='device type for vLLM execution')
|
||||||
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
|
||||||
'CPU.')
|
|
||||||
parser.add_argument('--block-size',
|
parser.add_argument('--block-size',
|
||||||
type=int,
|
type=int,
|
||||||
default=16,
|
default=16,
|
||||||
|
|||||||
@@ -195,8 +195,16 @@ def sample_sonnet_requests(
|
|||||||
|
|
||||||
|
|
||||||
def sample_random_requests(
|
def sample_random_requests(
|
||||||
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
|
prefix_len: int,
|
||||||
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
|
input_len: int,
|
||||||
|
output_len: int,
|
||||||
|
num_prompts: int,
|
||||||
|
range_ratio: float,
|
||||||
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
) -> List[Tuple[str, int, int]]:
|
||||||
|
prefix_token_ids = np.random.randint(0,
|
||||||
|
tokenizer.vocab_size,
|
||||||
|
size=prefix_len).tolist()
|
||||||
|
|
||||||
input_lens = np.random.randint(
|
input_lens = np.random.randint(
|
||||||
int(input_len * range_ratio),
|
int(input_len * range_ratio),
|
||||||
@@ -211,10 +219,12 @@ def sample_random_requests(
|
|||||||
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
||||||
input_requests = []
|
input_requests = []
|
||||||
for i in range(num_prompts):
|
for i in range(num_prompts):
|
||||||
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
|
prompt = tokenizer.decode(prefix_token_ids +
|
||||||
|
[(offsets[i] + i + j) % tokenizer.vocab_size
|
||||||
for j in range(input_lens[i])])
|
for j in range(input_lens[i])])
|
||||||
|
|
||||||
input_requests.append(
|
input_requests.append(
|
||||||
(prompt, int(input_lens[i]), int(output_lens[i])))
|
(prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
|
||||||
|
|
||||||
return input_requests
|
return input_requests
|
||||||
|
|
||||||
@@ -318,6 +328,7 @@ async def benchmark(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_requests: List[Tuple[str, int, int]],
|
input_requests: List[Tuple[str, int, int]],
|
||||||
|
logprobs: Optional[int],
|
||||||
best_of: int,
|
best_of: int,
|
||||||
use_beam_search: bool,
|
use_beam_search: bool,
|
||||||
request_rate: float,
|
request_rate: float,
|
||||||
@@ -339,6 +350,7 @@ async def benchmark(
|
|||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -358,6 +370,7 @@ async def benchmark(
|
|||||||
api_url=base_url + "/start_profile",
|
api_url=base_url + "/start_profile",
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -379,6 +392,7 @@ async def benchmark(
|
|||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
output_len=output_len,
|
output_len=output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -396,6 +410,7 @@ async def benchmark(
|
|||||||
api_url=base_url + "/stop_profile",
|
api_url=base_url + "/stop_profile",
|
||||||
prompt_len=test_prompt_len,
|
prompt_len=test_prompt_len,
|
||||||
output_len=test_output_len,
|
output_len=test_output_len,
|
||||||
|
logprobs=logprobs,
|
||||||
best_of=best_of,
|
best_of=best_of,
|
||||||
use_beam_search=use_beam_search,
|
use_beam_search=use_beam_search,
|
||||||
)
|
)
|
||||||
@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
elif args.dataset_name == "random":
|
elif args.dataset_name == "random":
|
||||||
input_requests = sample_random_requests(
|
input_requests = sample_random_requests(
|
||||||
|
prefix_len=args.random_prefix_len,
|
||||||
input_len=args.random_input_len,
|
input_len=args.random_input_len,
|
||||||
output_len=args.random_output_len,
|
output_len=args.random_output_len,
|
||||||
num_prompts=args.num_prompts,
|
num_prompts=args.num_prompts,
|
||||||
@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
|
|||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
input_requests=input_requests,
|
input_requests=input_requests,
|
||||||
|
logprobs=args.logprobs,
|
||||||
best_of=args.best_of,
|
best_of=args.best_of,
|
||||||
use_beam_search=args.use_beam_search,
|
use_beam_search=args.use_beam_search,
|
||||||
request_rate=args.request_rate,
|
request_rate=args.request_rate,
|
||||||
@@ -721,6 +738,16 @@ if __name__ == "__main__":
|
|||||||
help=
|
help=
|
||||||
"Number of output tokens per request, used only for sonnet dataset.",
|
"Number of output tokens per request, used only for sonnet dataset.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--logprobs",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help=("Number of logprobs-per-token to compute & return as part of "
|
||||||
|
"the request. If unspecified, then either (1) if beam search "
|
||||||
|
"is disabled, no logprobs are computed & a single dummy "
|
||||||
|
"logprob is returned for each token; or (2) if beam search "
|
||||||
|
"is enabled 1 logprob per token is computed"),
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--sonnet-prefix-len",
|
"--sonnet-prefix-len",
|
||||||
type=int,
|
type=int,
|
||||||
@@ -749,6 +776,14 @@ if __name__ == "__main__":
|
|||||||
help="Range of sampled ratio of input/output length, "
|
help="Range of sampled ratio of input/output length, "
|
||||||
"used only for random sampling.",
|
"used only for random sampling.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--random-prefix-len",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Number of fixed prefix tokens before random "
|
||||||
|
" context. The length range of context in a random "
|
||||||
|
" request is [random-prefix-len, "
|
||||||
|
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--request-rate",
|
"--request-rate",
|
||||||
type=float,
|
type=float,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from tqdm import tqdm
|
|||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
PreTrainedTokenizerBase)
|
PreTrainedTokenizerBase)
|
||||||
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
|
||||||
from vllm.entrypoints.openai.api_server import (
|
from vllm.entrypoints.openai.api_server import (
|
||||||
build_async_engine_client_from_engine_args)
|
build_async_engine_client_from_engine_args)
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
@@ -451,13 +451,11 @@ if __name__ == "__main__":
|
|||||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||||
'instead supported for common inference criteria.')
|
'instead supported for common inference criteria.')
|
||||||
parser.add_argument(
|
parser.add_argument("--device",
|
||||||
"--device",
|
type=str,
|
||||||
type=str,
|
default="auto",
|
||||||
default="auto",
|
choices=DEVICE_OPTIONS,
|
||||||
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
help='device type for vLLM execution')
|
||||||
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
|
||||||
'CPU.')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-scheduler-steps",
|
"--num-scheduler-steps",
|
||||||
type=int,
|
type=int,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Define environment variables for special configurations
|
# Define environment variables for special configurations
|
||||||
@@ -83,12 +84,7 @@ endif()
|
|||||||
|
|
||||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||||
|
|
||||||
list(APPEND LIBS "numa")
|
list(APPEND LIBS dnnl numa)
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# Define extension targets
|
|
||||||
#
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# _C extension
|
# _C extension
|
||||||
@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/cpu/torch_bindings.cpp")
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
|
|
||||||
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
|
set(VLLM_EXT_SRC
|
||||||
|
"csrc/cpu/quant.cpp"
|
||||||
|
${VLLM_EXT_SRC})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Define extension targets
|
||||||
|
#
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_C
|
_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
|
|||||||
@@ -350,6 +350,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
|||||||
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
|
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
|
||||||
${GPU_INCLUDE_DIRECTORIES})
|
${GPU_INCLUDE_DIRECTORIES})
|
||||||
|
|
||||||
|
# TODO: is torch_python_LIBRARY needed?
|
||||||
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
|
||||||
${GPU_LIBRARIES})
|
${GPU_LIBRARIES})
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,8 @@ namespace vec_op {
|
|||||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
#else
|
#else
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||||
std::cout << #NAME << " invoked." << std::endl;
|
RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
|||||||
explicit BF16Vec16(const FP32Vec16 &);
|
explicit BF16Vec16(const FP32Vec16 &);
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||||
|
|
||||||
|
void save(void* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm256_mask_storeu_epi16(ptr, mask, reg);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
|
||||||
|
return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 max(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(_mm512_max_ps(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 abs() const {
|
||||||
|
return FP32Vec16(_mm512_abs_ps(reg));
|
||||||
|
}
|
||||||
|
|
||||||
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
||||||
|
|
||||||
|
float reduce_max() const { return _mm512_reduce_max_ps(reg); }
|
||||||
|
|
||||||
template <int group_size> float reduce_sub_sum(int idx) {
|
template <int group_size> float reduce_sub_sum(int idx) {
|
||||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||||
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
||||||
@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
||||||
|
|
||||||
|
void save(float* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm512_mask_storeu_ps(ptr, mask, reg);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
#else
|
#else
|
||||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||||
@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
struct INT8Vec16: public Vec<INT8Vec16> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 16;
|
||||||
|
union AliasReg {
|
||||||
|
__m128i reg;
|
||||||
|
int8_t values[VEC_ELEM_NUM];
|
||||||
|
};
|
||||||
|
|
||||||
|
__m128i reg;
|
||||||
|
|
||||||
|
explicit INT8Vec16(const FP32Vec16& vec) : reg(
|
||||||
|
_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
|
||||||
|
) {}
|
||||||
|
|
||||||
|
void save(int8_t* ptr) const {
|
||||||
|
_mm_storeu_epi8(ptr, reg);
|
||||||
|
}
|
||||||
|
|
||||||
|
void save(int8_t* ptr, const int elem_num) const {
|
||||||
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
|
_mm_mask_storeu_epi8(ptr, mask, reg);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T> struct VecType { using vec_type = void; };
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|||||||
168
csrc/cpu/dnnl_helper.hpp
Normal file
168
csrc/cpu/dnnl_helper.hpp
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
#ifndef DNNL_HELPER_HPP
|
||||||
|
#define DNNL_HELPER_HPP
|
||||||
|
|
||||||
|
#include <c10/util/BFloat16.h>
|
||||||
|
|
||||||
|
#include "oneapi/dnnl/dnnl.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename T>
|
||||||
|
struct DNNLType {
|
||||||
|
static constexpr dnnl::memory::data_type type =
|
||||||
|
dnnl::memory::data_type::undef;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<int8_t> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<int32_t> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<float> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<c10::BFloat16> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
constexpr inline dnnl::memory::data_type get_dnnl_type() {
|
||||||
|
return DNNLType<std::decay_t<T>>::type;
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
template <bool InputNoScale>
|
||||||
|
class DNNLPrimitiveHelper {
|
||||||
|
public:
|
||||||
|
// I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
|
||||||
|
// A: [M, K], row-major
|
||||||
|
// B: [K, N], column-major
|
||||||
|
// C: [M, N], row-major
|
||||||
|
// bias: [N], row-major, optional
|
||||||
|
// a_scales: [MS]
|
||||||
|
// b_scales: [NS]
|
||||||
|
// Note: Due to the limitation of oneDNN
|
||||||
|
// (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
|
||||||
|
// not supported.
|
||||||
|
template <typename OutputT, typename BiasT>
|
||||||
|
static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
|
||||||
|
const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
|
||||||
|
dnnl_dim_t K, const float* a_scales,
|
||||||
|
const float* b_scales, dnnl_dim_t MS,
|
||||||
|
dnnl_dim_t NS) {
|
||||||
|
auto&& OutputType = get_dnnl_type<OutputT>();
|
||||||
|
auto&& BiasType = get_dnnl_type<BiasT>();
|
||||||
|
|
||||||
|
dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
|
||||||
|
dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
|
||||||
|
dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
|
||||||
|
|
||||||
|
dnnl::primitive_attr attr;
|
||||||
|
if constexpr (!InputNoScale) {
|
||||||
|
if (MS == 1) {
|
||||||
|
// per-tensor
|
||||||
|
attr.set_scales_mask(DNNL_ARG_SRC, 0);
|
||||||
|
} else {
|
||||||
|
// per-token
|
||||||
|
TORCH_CHECK(false, "per-token quantization is unsupported.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NS == 1) {
|
||||||
|
// per-tensor
|
||||||
|
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
|
||||||
|
} else {
|
||||||
|
// per-channel
|
||||||
|
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
dnnl::matmul::primitive_desc matmul_pd;
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
|
||||||
|
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
|
||||||
|
bias_md, c_md, attr);
|
||||||
|
} else {
|
||||||
|
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
|
||||||
|
c_md, attr);
|
||||||
|
}
|
||||||
|
dnnl::matmul matmul(matmul_pd);
|
||||||
|
|
||||||
|
auto& engine = default_engine();
|
||||||
|
|
||||||
|
dnnl::memory a_m(a_md, engine, (void*)a);
|
||||||
|
dnnl::memory b_m(b_md, engine, (void*)b);
|
||||||
|
dnnl::memory c_m(c_md, engine, (void*)c);
|
||||||
|
dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
|
||||||
|
(void*)a_scales);
|
||||||
|
dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
|
||||||
|
(void*)b_scales);
|
||||||
|
|
||||||
|
auto& stream = default_stream();
|
||||||
|
if constexpr (InputNoScale) {
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({N}, BiasType, {1});
|
||||||
|
dnnl::memory bias_m(bias_md, engine, (void*)bias);
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_BIAS, bias_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (bias) {
|
||||||
|
dnnl::memory::desc bias_md({N}, BiasType, {1});
|
||||||
|
dnnl::memory bias_m(bias_md, engine, (void*)bias);
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_BIAS, bias_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
matmul.execute(
|
||||||
|
stream, {
|
||||||
|
{DNNL_ARG_SRC, a_m},
|
||||||
|
{DNNL_ARG_WEIGHTS, b_m},
|
||||||
|
{DNNL_ARG_DST, c_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
|
||||||
|
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stream.wait();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
static dnnl::engine& default_engine() {
|
||||||
|
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
|
||||||
|
return engine;
|
||||||
|
}
|
||||||
|
|
||||||
|
static dnnl::stream& default_stream() {
|
||||||
|
static dnnl::stream stream(default_engine());
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
294
csrc/cpu/quant.cpp
Normal file
294
csrc/cpu/quant.cpp
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
#include "cpu_types.hpp"
|
||||||
|
#include "dnnl_helper.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename scalar_t>
|
||||||
|
struct KernelVecType {
|
||||||
|
using load_vec_type = void;
|
||||||
|
using cvt_vec_type = void;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<float> {
|
||||||
|
using load_vec_type = vec_op::FP32Vec16;
|
||||||
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::BFloat16> {
|
||||||
|
using load_vec_type = vec_op::BF16Vec16;
|
||||||
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
template <typename scalar_t>
|
||||||
|
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
const float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
|
||||||
|
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
|
||||||
|
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
|
||||||
|
|
||||||
|
constexpr float i8_min =
|
||||||
|
static_cast<float>(std::numeric_limits<int8_t>::min());
|
||||||
|
constexpr float i8_max =
|
||||||
|
static_cast<float>(std::numeric_limits<int8_t>::max());
|
||||||
|
const cvt_vec_t inv_scale(1.0 / *scale);
|
||||||
|
const cvt_vec_t i8_min_vec(i8_min);
|
||||||
|
const cvt_vec_t i8_max_vec(i8_max);
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < num_tokens; ++i) {
|
||||||
|
int j = 0;
|
||||||
|
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
}
|
||||||
|
|
||||||
|
load_vec_t elems(input + i * hidden_size + j);
|
||||||
|
cvt_vec_t elems_fp32(elems);
|
||||||
|
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
|
||||||
|
vec_op::INT8Vec16 elems_int8(elems_fp32);
|
||||||
|
|
||||||
|
if (j + vec_elem_num == hidden_size) {
|
||||||
|
elems_int8.save(output + i * hidden_size + j);
|
||||||
|
} else {
|
||||||
|
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic per-token int8 quantization (AVX512 path).
// For each of the `num_tokens` rows of `input` (row length `hidden_size`):
//   1. compute the row's max absolute value,
//   2. derive scale[i] = max_abs / 127,
//   3. write output = input / scale[i] as int8.
// Rows are processed independently, so the outer loop is OpenMP-parallel.
// NOTE(review): an all-zero row yields scale_val == 0 and a division by
// zero in `inv_scale` — presumably callers never pass such rows; confirm.
template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, const int num_tokens,
                                    const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Pass 1: per-row running max of |x|, kept lane-wise in a vector and
    // reduced to a scalar afterwards.
    cvt_vec_t max_abs(0.0);
    {
      int j = 0;
      // Strict `<` leaves at least one (possibly partial) vector for the
      // tail so the masked variant below can handle hidden_size not being
      // a multiple of vec_elem_num.
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        max_abs = max_abs.max(elems_fp32.abs());
      }

      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);

      if (j + vec_elem_num == hidden_size) {
        // Tail is exactly one full vector.
        max_abs = max_abs.max(elems_fp32.abs());
      } else {
        // Partial tail: only the first (hidden_size - j) lanes are valid.
        max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
      }
    }

    float scale_val = max_abs.reduce_max() / 127.0f;
    scale[i] = scale_val;
    const cvt_vec_t inv_scale(1.0 / scale_val);

    // Pass 2: scale and store as int8. No clamp is applied here (unlike the
    // static path) — the scale is derived from this row's own max, so
    // values stay within int8 range by construction.
    {
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        elems_fp32 = (elems_fp32 * inv_scale);
        vec_op::INT8Vec16 elems_int8(elems_fp32);
        elems_int8.save(output + i * hidden_size + j);
      }

      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale);
      vec_op::INT8Vec16 elems_int8(elems_fp32);

      if (j + vec_elem_num == hidden_size) {
        elems_int8.save(output + i * hidden_size + j);
      } else {
        // Masked store for the partial tail.
        elems_int8.save(output + i * hidden_size + j, hidden_size - j);
      }
    }
  }
}
|
||||||
|
|
||||||
|
// Rescale an fp32 GEMM result by per-token activation scales and convert it
// to `scalar_t`, optionally adding a (pre-dequantized) bias row.
// `input` is [num_tokens, hidden_size] fp32; `scale` holds one factor per
// token; `bias`, when Bias == true, is a single row of length hidden_size
// broadcast over all tokens (indexed by `j` only).
template <bool Bias, typename scalar_t>
void dynamic_output_scale_impl(const float* input, scalar_t* output,
                               const float* scale, const scalar_t* bias,
                               const int num_tokens, const int hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    int j = 0;
    // Broadcast this token's scale across all vector lanes once per row.
    cvt_vec_t token_scale_vec(scale[i]);
    // Strict `<` reserves a (possibly partial) tail vector; see masked
    // store below.
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input + i * hidden_size + j);
      elems_fp32 = elems_fp32 * token_scale_vec;

      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }

      load_vec_t elems_out(elems_fp32);
      elems_out.save(output + i * hidden_size + j);
    }

    // Tail: same computation, but store may be masked when hidden_size is
    // not a multiple of vec_elem_num.
    cvt_vec_t elems_fp32(input + i * hidden_size + j);
    elems_fp32 = elems_fp32 * token_scale_vec;

    if constexpr (Bias) {
      load_vec_t bias_vec(bias + j);
      cvt_vec_t bias_vec_fp32(bias_vec);
      elems_fp32 = elems_fp32 + bias_vec_fp32;
    }

    load_vec_t elems_out(elems_fp32);

    if (j + vec_elem_num == hidden_size) {
      elems_out.save(output + i * hidden_size + j);
    } else {
      elems_out.save(output + i * hidden_size + j, hidden_size - j);
    }
  }
}
|
||||||
|
#else
|
||||||
|
template <typename scalar_t>
|
||||||
|
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
const float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
float* scale, const int num_tokens,
|
||||||
|
const int hidden_size) {
|
||||||
|
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void dynamic_output_scale_impl() {
|
||||||
|
TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// W8A8 int8 GEMM with symmetric quantization: c = (a * a_scales) @
// (b * b_scales) [+ bias], dispatched to oneDNN.
// Per-tensor activation scales are fused into the oneDNN GEMM directly;
// per-token activation scales are not supported by oneDNN, so that path
// runs the GEMM into an fp32 scratch tensor and applies the token scales
// (and optional bias) in a second pass via dynamic_output_scale_impl.
void int8_scaled_mm(torch::Tensor& c,        // [M, OC], row-major
                    const torch::Tensor& a,  // [M, IC], row-major
                    const torch::Tensor& b,  // [IC, OC], column-major
                    const torch::Tensor& a_scales,  // [1] or [M]
                    const torch::Tensor& b_scales,  // [1] or [OC]
                    const c10::optional<torch::Tensor>& bias  // [OC]
) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
  // Checks for conformality
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm only supports INT8 inputs.")
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));

  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());

  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }

  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
    if (a_scales.numel() != 1) {
      // per-token
      // Note: oneDNN doesn't support per-token activation quantization.
      // GEMM into fp32 scratch with b_scales only (a_scale ptr is null,
      // count 0), then rescale per token below.
      torch::Tensor tmp_fp32_out =
          torch::empty_like(c, ::at::ScalarType::Float);
      DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
          tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
          a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
          b_scales.numel());
      if (bias.has_value()) {
        // Bias is applied in the second pass, after per-token rescale.
        dynamic_output_scale_impl<true>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
            c.size(1));
      } else {
        dynamic_output_scale_impl<false>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
      }
    } else {
      // per-tensor: both scales (and optional bias) are fused into the
      // oneDNN GEMM directly.
      if (bias.has_value()) {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      } else {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            (void*)(0), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      }
    }
  });
}
|
||||||
|
|
||||||
|
// static-per-tensor quantization.
|
||||||
|
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||||
|
const torch::Tensor& input, // [..., hidden_size]
|
||||||
|
const torch::Tensor& scale) {
|
||||||
|
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
TORCH_CHECK(scale.numel() == 1);
|
||||||
|
|
||||||
|
const int hidden_size = input.size(-1);
|
||||||
|
const int num_tokens = input.numel() / hidden_size;
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
|
||||||
|
static_scaled_int8_quant_impl(
|
||||||
|
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
|
||||||
|
scale.data_ptr<float>(), num_tokens, hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// dynamic-per-token quantization.
|
||||||
|
void dynamic_scaled_int8_quant(
|
||||||
|
torch::Tensor& out, // [..., hidden_size]
|
||||||
|
const torch::Tensor& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& scale // [..., 1]
|
||||||
|
) {
|
||||||
|
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
|
||||||
|
int const hidden_size = input.size(-1);
|
||||||
|
int const num_tokens = input.numel() / hidden_size;
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
|
||||||
|
dynamic_scaled_int8_quant_impl(
|
||||||
|
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
|
||||||
|
scale.data_ptr<float>(), num_tokens, hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
@@ -4,7 +4,12 @@
|
|||||||
|
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
void init_cpu_threads_env(const std::string& cpu_ids);
|
std::string init_cpu_threads_env(const std::string& cpu_ids);
|
||||||
|
|
||||||
|
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||||
|
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||||
|
const torch::Tensor& b_scales,
|
||||||
|
const c10::optional<torch::Tensor>& bias);
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||||
// vLLM custom ops
|
// vLLM custom ops
|
||||||
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// PagedAttention V2.
|
// PagedAttention V2.
|
||||||
ops.def(
|
ops.def(
|
||||||
"paged_attention_v2("
|
"paged_attention_v2("
|
||||||
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
|
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
|
||||||
" Tensor tmp_out, Tensor query, Tensor key_cache,"
|
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
|
||||||
" Tensor value_cache, int num_kv_heads, float scale,"
|
" Tensor value_cache, int num_kv_heads, float scale,"
|
||||||
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
||||||
" int max_seq_len, Tensor? alibi_slopes,"
|
" int max_seq_len, Tensor? alibi_slopes,"
|
||||||
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
" Tensor! key, int head_size,"
|
" Tensor! key, int head_size,"
|
||||||
" Tensor cos_sin_cache, bool is_neox) -> ()");
|
" Tensor cos_sin_cache, bool is_neox) -> ()");
|
||||||
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
|
||||||
|
|
||||||
|
// Quantization
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Compute int8 quantized tensor for given scaling factor.
|
||||||
|
ops.def(
|
||||||
|
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
|
||||||
|
// Compute int8 quantized tensor and scaling factor
|
||||||
|
ops.def(
|
||||||
|
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
|
||||||
|
"()");
|
||||||
|
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
|
||||||
|
&dynamic_scaled_int8_quant);
|
||||||
|
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
|
// quantization.
|
||||||
|
ops.def(
|
||||||
|
"cutlass_scaled_mm(Tensor! out, Tensor a,"
|
||||||
|
" Tensor b, Tensor a_scales,"
|
||||||
|
" Tensor b_scales, Tensor? bias) -> ()");
|
||||||
|
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||||
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Copy the cache blocks from src to dst.
|
// Copy the cache blocks from src to dst.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
|
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||||
"block_mapping) -> ()");
|
"Tensor block_mapping) -> ()");
|
||||||
cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks);
|
cache_ops.impl("copy_blocks", torch::kCPU, ©_blocks);
|
||||||
|
|
||||||
// Reshape the key and value tensors and cache them.
|
// Reshape the key and value tensors and cache them.
|
||||||
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||||
// CPU utils
|
// CPU utils
|
||||||
utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
|
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
#include "cpu_types.hpp"
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
void init_cpu_threads_env(const std::string& cpu_ids) {
|
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||||
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
|
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
|
||||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||||
std::vector<int> omp_cpu_ids;
|
std::vector<int> omp_cpu_ids;
|
||||||
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
|
|||||||
torch::set_num_threads((int)omp_cpu_ids.size());
|
torch::set_num_threads((int)omp_cpu_ids.size());
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
|
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
|
||||||
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
|
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> thread_core_mapping;
|
||||||
|
thread_core_mapping.reserve(omp_cpu_ids.size());
|
||||||
|
omp_lock_t writelock;
|
||||||
|
omp_init_lock(&writelock);
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static, 1)
|
#pragma omp parallel for schedule(static, 1)
|
||||||
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
|
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
|
||||||
cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
|
cpu_set_t mask;
|
||||||
size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
|
CPU_ZERO(&mask);
|
||||||
CPU_ZERO_S(size, mask);
|
CPU_SET(omp_cpu_ids[i], &mask);
|
||||||
CPU_SET_S(omp_cpu_ids[i], size, mask);
|
int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
|
||||||
sched_setaffinity(0, sizeof(cpu_set_t), mask);
|
if (ret == -1) {
|
||||||
CPU_FREE(mask);
|
TORCH_CHECK(false,
|
||||||
|
"sched_setaffinity failed. errno: " + std::to_string(errno));
|
||||||
|
}
|
||||||
|
|
||||||
|
omp_set_lock(&writelock);
|
||||||
|
thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
|
||||||
|
omp_unset_lock(&writelock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
omp_destroy_lock(&writelock);
|
||||||
|
|
||||||
numa_free_nodemask(omp_cpu_mask);
|
numa_free_nodemask(omp_cpu_mask);
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "OMP threads binding of Process " << getpid() << ":\n";
|
||||||
|
std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
|
||||||
|
[](auto&& a, auto&& b) { return a.second < b.second; });
|
||||||
|
for (auto&& item : thread_core_mapping) {
|
||||||
|
ss << "\t"
|
||||||
|
<< "OMP tid: " << item.first << ", core " << item.second << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe(
|
|||||||
moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
|
moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
|
||||||
thread_n, sms, max_par, replicate_input, apply_weights);
|
thread_n, sms, max_par, replicate_input, apply_weights);
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe(
|
|||||||
const torch::Tensor& g_idx, const torch::Tensor& perm,
|
const torch::Tensor& g_idx, const torch::Tensor& perm,
|
||||||
torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
|
torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
|
||||||
bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
|
bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
|
||||||
bool replicate_input, bool apply_weights);
|
bool replicate_input, bool apply_weights);
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
"g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
|
"g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
|
||||||
"size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
|
"size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
|
||||||
"bool replicate_input, bool apply_weights) -> Tensor");
|
"bool replicate_input, bool apply_weights) -> Tensor");
|
||||||
|
|
||||||
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
|
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
11
csrc/ops.h
11
csrc/ops.h
@@ -123,9 +123,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
int64_t size_k, int64_t size_n,
|
int64_t size_k, int64_t size_n,
|
||||||
int64_t num_bits);
|
int64_t num_bits);
|
||||||
|
|
||||||
|
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
torch::Tensor& perm, c10::SymInt size_k,
|
||||||
|
c10::SymInt size_n, int64_t num_bits);
|
||||||
|
|
||||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
||||||
int64_t size_n, int64_t num_bits);
|
int64_t size_n, int64_t num_bits);
|
||||||
|
|
||||||
|
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
c10::SymInt size_k, c10::SymInt size_n,
|
||||||
|
int64_t num_bits);
|
||||||
|
|
||||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||||
int64_t n);
|
int64_t n);
|
||||||
|
|
||||||
@@ -170,9 +178,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
|||||||
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
torch::Tensor& scales);
|
torch::Tensor& scales);
|
||||||
|
|
||||||
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
|
||||||
torch::Tensor lookup_table);
|
|
||||||
|
|
||||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||||
torch::Tensor b_gptq_qzeros,
|
torch::Tensor b_gptq_qzeros,
|
||||||
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
||||||
|
|||||||
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
c10::SymInt size_k, c10::SymInt size_n,
|
||||||
|
int64_t num_bits) {
|
||||||
|
int const pack_factor = 32 / num_bits;
|
||||||
|
auto options = torch::TensorOptions()
|
||||||
|
.dtype(b_q_weight.dtype())
|
||||||
|
.device(b_q_weight.device());
|
||||||
|
return torch::empty_symint(
|
||||||
|
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
|||||||
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||||
|
torch::Tensor& perm, c10::SymInt size_k,
|
||||||
|
c10::SymInt size_n, int64_t num_bits) {
|
||||||
|
int const pack_factor = 32 / num_bits;
|
||||||
|
auto options = torch::TensorOptions()
|
||||||
|
.dtype(b_q_weight.dtype())
|
||||||
|
.device(b_q_weight.device());
|
||||||
|
return torch::empty_symint(
|
||||||
|
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,216 +0,0 @@
|
|||||||
#include <torch/all.h>
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <cuda_fp16.h>
|
|
||||||
|
|
||||||
// half-tensor
|
|
||||||
#include <c10/cuda/CUDAStream.h>
|
|
||||||
#include <ATen/cuda/CUDATensorMethods.cuh>
|
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
|
||||||
|
|
||||||
#define BLOCKWIDTH 128
|
|
||||||
#define BLOCKHEIGHT4 16
|
|
||||||
|
|
||||||
namespace vllm {
|
|
||||||
namespace squeezellm {
|
|
||||||
|
|
||||||
// Reinterpret the bit pattern of a signed int as unsigned (no value
// conversion) so the packed 4-bit weights can be nibble-shifted safely.
__device__ inline unsigned int as_unsigned(int i) {
  return *reinterpret_cast<unsigned int*>(&i);
}
|
|
||||||
|
|
||||||
// 4-bit matvec kernel (LUT-based).
// Computes mul += mat_dequant @ vec per batch, where `mat` holds 8 packed
// 4-bit codes per int32 and each code indexes a per-column 16-entry
// lookup table of half values (`lookup_table`). Each thread owns one
// output column; each block covers BLOCKHEIGHT4 packed rows x BLOCKWIDTH
// columns. Partial results are combined across blocks with atomicAdd.
__global__ void NUQ4MatMulKernel(
#ifndef USE_ROCM
    const half2* __restrict__ vec,
#else
    const __half2* __restrict__ vec,
#endif
    const int* __restrict__ mat,
#ifndef USE_ROCM
    half2* __restrict__ mul,
#else
    // ROCm accumulates into float2 instead of half2 (see atomicAdd below).
    float2* __restrict__ mul,
#endif
    const __half* __restrict__ lookup_table, int height, int width, int batch,
    int vec_height) {

  const int blockwidth2 = BLOCKWIDTH / 2;

  int row = BLOCKHEIGHT4 * blockIdx.x;
  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;

#ifndef USE_ROCM
  __shared__ half2 blockvec[blockwidth2];
#else
  __shared__ __half2 blockvec[blockwidth2];
#endif

  // Stage this thread's 16-entry dequant LUT into shared memory,
  // one column of deq2 per thread.
  __shared__ __half deq2[16][BLOCKWIDTH];
  int off = threadIdx.x;
  int column_offset = col * 16;
  for (int val = 0; val < 16; val += 1) {
    int lut_index = column_offset + val;
    deq2[val][off] = lookup_table[lut_index];
  }

  __half res;
#ifndef USE_ROCM
  half2 res2;
  half2 tmp2;
#else
  __half2 res2;
  __half2 tmp2;
#endif

  int i;
  int k;

  unsigned int tmp1;
  unsigned int lut_index1, lut_index2;

  for (int b = 0; b < batch; ++b) {
    i = width * row + col;
    res = __int2half_rd(0);
    k = 0;

    // Stage the slice of the input vector used by this block.
    __syncthreads();
    if (threadIdx.x < blockwidth2)
      blockvec[threadIdx.x] =
          vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
              threadIdx.x];
    __syncthreads();

    while (k < blockwidth2) {
      // One int32 packs 8 4-bit codes = 4 half2 multiplicands.
      tmp1 = as_unsigned(mat[i]);

#ifndef USE_ROCM
      res2 = {};
      tmp2 = {};
#else
      res2.x = __half_as_ushort(__float2half(0));
      res2.y = __half_as_ushort(__float2half(0));
      tmp2.x = __half_as_ushort(__float2half(0));
      tmp2.y = __half_as_ushort(__float2half(0));
#endif

      // Nibbles 0-1 -> blockvec[k + 0]
      lut_index1 = tmp1 & 0xF;
      lut_index2 = (tmp1 >> 4) & 0xF;
#ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
#else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
      res2 = __hfma2(tmp2, blockvec[k + 0], res2);

      // Nibbles 2-3 -> blockvec[k + 1]
      lut_index1 = (tmp1 >> 8) & 0xF;
      lut_index2 = (tmp1 >> 12) & 0xF;
#ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
#else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
      res2 = __hfma2(tmp2, blockvec[k + 1], res2);

      // Nibbles 4-5 -> blockvec[k + 2]
      lut_index1 = (tmp1 >> 16) & 0xF;
      lut_index2 = (tmp1 >> 20) & 0xF;
#ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
#else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
      res2 = __hfma2(tmp2, blockvec[k + 2], res2);

      // Nibbles 6-7 -> blockvec[k + 3]
      lut_index1 = (tmp1 >> 24) & 0xF;
      lut_index2 = (tmp1 >> 28) & 0xF;
#ifndef USE_ROCM
      tmp2.x = deq2[lut_index1][off];
      tmp2.y = deq2[lut_index2][off];
#else
      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
      res2 = __hfma2(tmp2, blockvec[k + 3], res2);

      // Fold the half2 pair into the scalar accumulator.
#ifndef USE_ROCM
      res = __hadd(__hadd(res2.x, res2.y), res);
#else
      res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
                   res);
#endif

      i += width;
      k += 4;
    }

    // col%2 -> only set one of the two values
#ifndef USE_ROCM
    half2 res3 = {};
    if (col % 2 == 0) {
      res3.x = res;
    } else {
      res3.y = res;
    }
#else
    __half2 res3;
    res3.x = __half_as_ushort(__float2half(0));
    res3.y = __half_as_ushort(__float2half(0));
    if (col % 2 == 0) {
      res3.x = __half_as_ushort(res);
    } else {
      res3.y = __half_as_ushort(res);
    }
#endif

#ifndef USE_ROCM
    atomicAdd(&mul[b * width / 2 + col / 2], res3);
#else
    // ROCm lacks half2 atomicAdd; accumulate into float2 lanes instead.
    int tmp_addr = b * width / 2 + col / 2;
    atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
    atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
#endif
  }
}
|
|
||||||
|
|
||||||
} // namespace squeezellm
|
|
||||||
} // namespace vllm
|
|
||||||
|
|
||||||
// 4-bit matvec kernel (LUT-based) — host-side launcher.
// mat: [height, width] int32, 8 packed 4-bit codes per element;
// vec: [batch, vec_height] half; mul: output accumulator (half on CUDA,
// float on ROCm); lookup_table: 16 dequant values per output column.
// NOTE(review): `mul` is accumulated into with atomicAdd — presumably the
// caller zero-initializes it; confirm at the call site.
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table) {
  int height = mat.size(0);
  int width = mat.size(1);

  int batch = vec.size(0);
  int vec_height = vec.size(1);

  // One block per BLOCKHEIGHT4 x BLOCKWIDTH tile; ceil-divide both dims.
  dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
              (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
  dim3 threads(BLOCKWIDTH);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
#ifndef USE_ROCM
      (half2*)vec.data_ptr<at::Half>(),
#else
      (__half2*)vec.data_ptr<at::Half>(),
#endif
      mat.data_ptr<int>(),
#ifndef USE_ROCM
      (half2*)mul.data_ptr<at::Half>(),
      (__half*)lookup_table.data_ptr<at::Half>(),
#else
      // ROCm path accumulates in fp32 (see kernel's atomicAdd).
      (float2*)mul.data_ptr<float>(),
      (__half*)lookup_table.data_ptr<at::Half>(),
#endif
      height, width, batch, vec_height);
}
|
|
||||||
|
|
||||||
#undef BLOCKWIDTH
|
|
||||||
#undef BLOCKHEIGHT4
|
|
||||||
@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// PagedAttention V2.
|
// PagedAttention V2.
|
||||||
ops.def(
|
ops.def(
|
||||||
"paged_attention_v2("
|
"paged_attention_v2("
|
||||||
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
|
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
|
||||||
" Tensor tmp_out, Tensor query, Tensor key_cache,"
|
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
|
||||||
" Tensor value_cache, int num_kv_heads, float scale,"
|
" Tensor value_cache, int num_kv_heads, float scale,"
|
||||||
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
" Tensor block_tables, Tensor seq_lens, int block_size,"
|
||||||
" int max_seq_len, Tensor? alibi_slopes,"
|
" int max_seq_len, Tensor? alibi_slopes,"
|
||||||
@@ -73,7 +73,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
|
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
|
||||||
|
|
||||||
// prepare_inputs advance_step
|
// prepare_inputs advance_step
|
||||||
ops.def("advance_step", &advance_step);
|
ops.def(
|
||||||
|
"advance_step(int num_seqs, int num_queries, int block_size, "
|
||||||
|
"Tensor! input_tokens, Tensor sampled_token_ids, "
|
||||||
|
"Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
|
||||||
|
"Tensor block_tables) -> ()");
|
||||||
ops.impl("advance_step", torch::kCUDA, &advance_step);
|
ops.impl("advance_step", torch::kCUDA, &advance_step);
|
||||||
|
|
||||||
// Layernorm
|
// Layernorm
|
||||||
@@ -110,27 +114,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
// Quantization ops
|
// Quantization ops
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
// Quantized GEMM for AQLM.
|
// Quantized GEMM for AQLM.
|
||||||
ops.def("aqlm_gemm", &aqlm_gemm);
|
ops.def(
|
||||||
|
"aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
|
||||||
|
"Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
|
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
|
||||||
|
|
||||||
// Decompression method for AQLM.
|
// Decompression method for AQLM.
|
||||||
ops.def("aqlm_dequant", &aqlm_dequant);
|
ops.def(
|
||||||
|
"aqlm_dequant(Tensor codes, Tensor codebooks, "
|
||||||
|
"int[] codebook_partition_sizes) -> Tensor");
|
||||||
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
|
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
|
||||||
|
|
||||||
// Quantized GEMM for AWQ.
|
// Quantized GEMM for AWQ.
|
||||||
ops.def("awq_gemm", &awq_gemm);
|
ops.def(
|
||||||
|
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
|
||||||
|
"Tensor _zeros, int split_k_iters) -> Tensor");
|
||||||
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
|
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
|
||||||
|
|
||||||
// Dequantization for AWQ.
|
// Dequantization for AWQ.
|
||||||
ops.def("awq_dequantize", &awq_dequantize);
|
ops.def(
|
||||||
|
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
|
||||||
|
"Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
|
||||||
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
|
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
|
||||||
|
|
||||||
|
// Note about marlin kernel 'workspace' arguments:
|
||||||
|
// Technically these should be mutable since they are modified by the kernel.
|
||||||
|
// But since they are set back to zero once the kernel is finished we can
|
||||||
|
// hand wave and say that they have no net effect.
|
||||||
|
//
|
||||||
|
// The reason to mark 'workspace' as immutable is so that they don't interfere
|
||||||
|
// with using ScalarType arguments in the ops. If they are marked as mutable,
|
||||||
|
// pytorch throws an assert in
|
||||||
|
// 'torch._higher_order_ops._register_effectful_op' that prevents these
|
||||||
|
// kernels from being torch.compile'd.
|
||||||
|
// See the following document for more info on custom types and ops that use
|
||||||
|
// custom types:
|
||||||
|
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
|
||||||
|
|
||||||
// Marlin (Dense) Optimized Quantized GEMM for GPTQ.
|
// Marlin (Dense) Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("marlin_gemm", &marlin_gemm);
|
ops.def(
|
||||||
|
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
|
||||||
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
|
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
|
||||||
|
|
||||||
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
|
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
|
ops.def(
|
||||||
|
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
|
||||||
|
"Tensor b_scales, Tensor workspace, "
|
||||||
|
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
|
||||||
|
"int size_m, int size_n, int size_k) -> Tensor");
|
||||||
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
|
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
|
||||||
|
|
||||||
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
|
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
|
||||||
@@ -149,35 +182,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
|
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
|
||||||
|
|
||||||
// gptq_marlin Optimized Quantized GEMM for GPTQ.
|
// gptq_marlin Optimized Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
|
ops.def(
|
||||||
|
"gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
|
||||||
|
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
|
||||||
|
"int size_m, int size_n, int size_k, bool is_k_full, "
|
||||||
|
"bool has_zp, bool use_fp32_reduce) -> Tensor");
|
||||||
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
|
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
|
||||||
|
|
||||||
// gptq_marlin repack from GPTQ.
|
// gptq_marlin repack from GPTQ.
|
||||||
ops.def("gptq_marlin_repack", &gptq_marlin_repack);
|
ops.def(
|
||||||
|
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
|
||||||
|
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
|
||||||
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
||||||
|
ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
|
||||||
|
|
||||||
// awq_marlin repack from AWQ.
|
// awq_marlin repack from AWQ.
|
||||||
ops.def("awq_marlin_repack", &awq_marlin_repack);
|
ops.def(
|
||||||
|
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
|
||||||
|
"SymInt size_n, int num_bits) -> Tensor");
|
||||||
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
|
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
|
||||||
|
ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
|
||||||
|
|
||||||
// Dequantization for GGML.
|
// Dequantization for GGML.
|
||||||
ops.def("ggml_dequantize", &ggml_dequantize);
|
ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
|
||||||
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
||||||
|
|
||||||
// mmvq kernel for GGML.
|
// mmvq kernel for GGML.
|
||||||
ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
|
ops.def(
|
||||||
|
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
|
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
|
||||||
|
|
||||||
// mmq kernel for GGML.
|
// mmq kernel for GGML.
|
||||||
ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
|
ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
|
||||||
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
|
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
|
||||||
|
|
||||||
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
|
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
|
||||||
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
|
ops.def(
|
||||||
|
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
|
||||||
|
"Tensor! workspace, int num_bits, int size_m, int size_n, "
|
||||||
|
"int size_k) -> Tensor");
|
||||||
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
|
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
|
||||||
|
|
||||||
// marlin_qqq_gemm for QQQ.
|
// marlin_qqq_gemm for QQQ.
|
||||||
ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
|
ops.def(
|
||||||
|
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
|
||||||
|
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
|
||||||
|
"Tensor! workspace, int size_m, int size_n, "
|
||||||
|
"int size_k) -> Tensor");
|
||||||
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
|
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
|
||||||
|
|
||||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
@@ -199,16 +252,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
|
|
||||||
// Check if cutlass scaled_mm is supported for CUDA devices of the given
|
// Check if cutlass scaled_mm is supported for CUDA devices of the given
|
||||||
// capability
|
// capability
|
||||||
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
|
ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
|
||||||
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
|
ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
|
||||||
&cutlass_scaled_mm_supports_fp8);
|
|
||||||
// Mamba selective scan kernel
|
// Mamba selective scan kernel
|
||||||
ops.def(
|
ops.def(
|
||||||
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
"selective_scan_fwd(Tensor! u, Tensor! delta,"
|
||||||
"Tensor! A, Tensor! B, Tensor! C,"
|
"Tensor! A, Tensor! B, Tensor! C,"
|
||||||
"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
|
"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
|
||||||
"bool delta_softplus,"
|
"bool delta_softplus,"
|
||||||
"Tensor? index_, Tensor? x) -> Tensor[]");
|
"Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
|
||||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||||
|
|
||||||
ops.def(
|
ops.def(
|
||||||
@@ -230,19 +283,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Quantized GEMM for GPTQ.
|
// Quantized GEMM for GPTQ.
|
||||||
ops.def("gptq_gemm", &gptq_gemm);
|
// Note: even though the C++ inferred schema is correct for this op, it seems
|
||||||
|
// to prevent the meta function registry.
|
||||||
|
ops.def(
|
||||||
|
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
|
||||||
|
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
|
||||||
|
"-> Tensor");
|
||||||
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
|
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
|
||||||
|
|
||||||
// Post processing for GPTQ.
|
// Post processing for GPTQ.
|
||||||
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
|
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
|
||||||
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
|
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
|
||||||
|
|
||||||
// Quantized GEMM for SqueezeLLM.
|
|
||||||
ops.def(
|
|
||||||
"squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
|
|
||||||
"lookup_table) -> ()");
|
|
||||||
ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
|
|
||||||
|
|
||||||
// Compute FP8 quantized tensor for given scaling factor.
|
// Compute FP8 quantized tensor for given scaling factor.
|
||||||
ops.def(
|
ops.def(
|
||||||
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
|
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
|
||||||
@@ -256,8 +308,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
|
|
||||||
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
|
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
|
||||||
ops.def(
|
ops.def(
|
||||||
"dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
|
"dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
|
||||||
"scale, Tensor? scale_ub) -> "
|
"Tensor! scale, Tensor? scale_ub) -> "
|
||||||
"()");
|
"()");
|
||||||
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
|
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
|
||||||
&dynamic_per_token_scaled_fp8_quant);
|
&dynamic_per_token_scaled_fp8_quant);
|
||||||
@@ -294,8 +346,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Copy the cache blocks from src to dst.
|
// Copy the cache blocks from src to dst.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
|
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||||
"block_mapping) -> ()");
|
"Tensor block_mapping) -> ()");
|
||||||
cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks);
|
cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks);
|
||||||
|
|
||||||
// Reshape the key and value tensors and cache them.
|
// Reshape the key and value tensors and cache them.
|
||||||
@@ -320,8 +372,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
|
|
||||||
// Convert the key and value cache to fp8 data type.
|
// Convert the key and value cache to fp8 data type.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
|
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
|
||||||
"kv_cache_dtype) -> ()");
|
"str kv_cache_dtype) -> ()");
|
||||||
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
|
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -329,24 +381,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
|||||||
// Cuda utils
|
// Cuda utils
|
||||||
|
|
||||||
// Gets the specified device attribute.
|
// Gets the specified device attribute.
|
||||||
cuda_utils.def("get_device_attribute", &get_device_attribute);
|
cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
|
||||||
cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
|
cuda_utils.impl("get_device_attribute", &get_device_attribute);
|
||||||
|
|
||||||
// Gets the maximum shared memory per block device attribute.
|
// Gets the maximum shared memory per block device attribute.
|
||||||
cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
|
cuda_utils.def(
|
||||||
&get_max_shared_memory_per_block_device_attribute);
|
"get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
|
||||||
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
|
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
|
||||||
torch::kCUDA,
|
|
||||||
&get_max_shared_memory_per_block_device_attribute);
|
&get_max_shared_memory_per_block_device_attribute);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
||||||
// Custom all-reduce kernels
|
// Custom all-reduce kernels
|
||||||
custom_ar.def("init_custom_ar", &init_custom_ar);
|
custom_ar.def(
|
||||||
|
"init_custom_ar(Tensor meta, Tensor rank_data, "
|
||||||
|
"str[] handles, int[] offsets, int rank, "
|
||||||
|
"bool full_nvlink) -> int");
|
||||||
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
|
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
|
||||||
|
|
||||||
custom_ar.def("should_custom_ar", &should_custom_ar);
|
custom_ar.def(
|
||||||
|
"should_custom_ar(Tensor inp, int max_size, int world_size, "
|
||||||
|
"bool full_nvlink) -> bool");
|
||||||
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
|
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
|
||||||
|
|
||||||
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
|
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
|
||||||
@@ -358,21 +414,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
|||||||
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
|
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
|
||||||
|
|
||||||
custom_ar.def("dispose", &dispose);
|
custom_ar.def("dispose", &dispose);
|
||||||
custom_ar.impl("dispose", torch::kCPU, &dispose);
|
|
||||||
|
|
||||||
custom_ar.def("meta_size", &meta_size);
|
custom_ar.def("meta_size", &meta_size);
|
||||||
custom_ar.impl("meta_size", torch::kCPU, &meta_size);
|
|
||||||
|
|
||||||
custom_ar.def("register_buffer", ®ister_buffer);
|
custom_ar.def(
|
||||||
|
"register_buffer(int fa, Tensor t, str[] handles, "
|
||||||
|
"int[] offsets) -> ()");
|
||||||
custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer);
|
custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer);
|
||||||
|
|
||||||
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
|
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
|
||||||
custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
|
|
||||||
&get_graph_buffer_ipc_meta);
|
|
||||||
|
|
||||||
custom_ar.def("register_graph_buffers", ®ister_graph_buffers);
|
custom_ar.def("register_graph_buffers", ®ister_graph_buffers);
|
||||||
custom_ar.impl("register_graph_buffers", torch::kCPU,
|
|
||||||
®ister_graph_buffers);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,5 @@ pydantic >= 2.8
|
|||||||
torch
|
torch
|
||||||
py-cpuinfo
|
py-cpuinfo
|
||||||
transformers
|
transformers
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
|
||||||
mistral_common >= 1.3.4
|
mistral_common >= 1.3.4
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||||
@@ -5,6 +5,7 @@ vLLM Meetups
|
|||||||
|
|
||||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||||
|
|
||||||
|
- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
|
||||||
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
|
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
|
||||||
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
|
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
|
||||||
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
|
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ autodoc_mock_imports = [
|
|||||||
"aiohttp",
|
"aiohttp",
|
||||||
"compressed_tensors",
|
"compressed_tensors",
|
||||||
"cpuinfo",
|
"cpuinfo",
|
||||||
|
"cv2",
|
||||||
"torch",
|
"torch",
|
||||||
"transformers",
|
"transformers",
|
||||||
"psutil",
|
"psutil",
|
||||||
|
|||||||
@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
|
|||||||
.. tip::
|
.. tip::
|
||||||
|
|
||||||
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
|
||||||
|
|
||||||
Example commands:
|
.. tip::
|
||||||
|
|
||||||
|
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
|
||||||
|
Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
|
||||||
|
``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
|
||||||
|
|
||||||
|
Example commands and usage:
|
||||||
|
===========================
|
||||||
|
|
||||||
|
Offline Inference:
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
|
||||||
|
|
||||||
|
|
||||||
OpenAI Server:
|
OpenAI Server:
|
||||||
|
--------------
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
|
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
|
||||||
|
|
||||||
benchmark_serving.py:
|
benchmark_serving.py:
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still
|
|||||||
|
|
||||||
With more logging, hopefully you can find the root cause of the issue.
|
With more logging, hopefully you can find the root cause of the issue.
|
||||||
|
|
||||||
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
|
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
|
||||||
|
|
||||||
Here are some common issues that can cause hangs:
|
Here are some common issues that can cause hangs:
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,9 @@ Offline Batched Inference
|
|||||||
|
|
||||||
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
|
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
|
||||||
|
|
||||||
Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process.
|
Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
|
||||||
|
The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
|
||||||
|
The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The
|
|||||||
]
|
]
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
|
Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
|
|||||||
@@ -107,3 +107,55 @@ The following is an example request
|
|||||||
"max_tokens": 7,
|
"max_tokens": 7,
|
||||||
"temperature": 0
|
"temperature": 0
|
||||||
}' | jq
|
}' | jq
|
||||||
|
|
||||||
|
|
||||||
|
Dynamically serving LoRA Adapters
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
|
||||||
|
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
|
||||||
|
to change models on-the-fly is needed.
|
||||||
|
|
||||||
|
Note: Enabling this feature in production environments is risky as user may participate model adapter management.
|
||||||
|
|
||||||
|
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
|
||||||
|
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
|
||||||
|
|
||||||
|
|
||||||
|
Loading a LoRA Adapter:
|
||||||
|
|
||||||
|
To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
|
||||||
|
details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
|
||||||
|
|
||||||
|
Example request to load a LoRA adapter:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
curl -X POST http://localhost:8000/v1/load_lora_adapter \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"lora_name": "sql_adapter",
|
||||||
|
"lora_path": "/path/to/sql-lora-adapter"
|
||||||
|
}'
|
||||||
|
|
||||||
|
Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter
|
||||||
|
cannot be found or loaded, an appropriate error message will be returned.
|
||||||
|
|
||||||
|
Unloading a LoRA Adapter:
|
||||||
|
|
||||||
|
To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint
|
||||||
|
with the name or ID of the adapter to be unloaded.
|
||||||
|
|
||||||
|
Example request to unload a LoRA adapter:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"lora_name": "sql_adapter"
|
||||||
|
}'
|
||||||
|
|||||||
@@ -161,6 +161,46 @@ A variety of speculative models of this type are available on HF hub:
|
|||||||
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
|
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
|
||||||
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
|
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
|
||||||
|
|
||||||
|
Lossless guarantees of Speculative Decoding
|
||||||
|
-------------------------------------------
|
||||||
|
In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of
|
||||||
|
speculative decoding, breaking down the guarantees into three key areas:
|
||||||
|
|
||||||
|
1. **Theoretical Losslessness**
|
||||||
|
- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might
|
||||||
|
cause slight variations in output distributions, as discussed
|
||||||
|
in `Accelerating Large Language Model Decoding with Speculative Sampling <https://arxiv.org/pdf/2302.01318>`_
|
||||||
|
|
||||||
|
2. **Algorithmic Losslessness**
|
||||||
|
- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include:
|
||||||
|
|
||||||
|
- **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target
|
||||||
|
distribution. `View Test Code <https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252>`_
|
||||||
|
|
||||||
|
- **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
||||||
|
without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
||||||
|
provides a lossless guarantee. Almost all of the tests in `this directory <https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e>`_
|
||||||
|
verify this property using `this assertion implementation <https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291>`_
|
||||||
|
|
||||||
|
3. **vLLM Logprob Stability**
|
||||||
|
- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
|
||||||
|
same request across runs. For more details, see the FAQ section
|
||||||
|
titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
|
||||||
|
|
||||||
|
|
||||||
|
**Conclusion**
|
||||||
|
|
||||||
|
While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
|
||||||
|
can occur due to following factors:
|
||||||
|
|
||||||
|
- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution.
|
||||||
|
|
||||||
|
- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
|
||||||
|
due to non-deterministic behavior in batched operations or numerical instability.
|
||||||
|
|
||||||
|
**Mitigation Strategies**
|
||||||
|
|
||||||
|
For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_.
|
||||||
|
|
||||||
Resources for vLLM contributors
|
Resources for vLLM contributors
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|||||||
@@ -194,12 +194,12 @@ Multimodal Language Models
|
|||||||
|
|
||||||
* - Architecture
|
* - Architecture
|
||||||
- Models
|
- Models
|
||||||
- Supported Modalities
|
- Modalities
|
||||||
- Example HuggingFace Models
|
- Example HuggingFace Models
|
||||||
- :ref:`LoRA <lora>`
|
- :ref:`LoRA <lora>`
|
||||||
* - :code:`Blip2ForConditionalGeneration`
|
* - :code:`Blip2ForConditionalGeneration`
|
||||||
- BLIP-2
|
- BLIP-2
|
||||||
- Image
|
- Image\ :sup:`E`
|
||||||
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
|
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`ChameleonForConditionalGeneration`
|
* - :code:`ChameleonForConditionalGeneration`
|
||||||
@@ -214,44 +214,75 @@ Multimodal Language Models
|
|||||||
-
|
-
|
||||||
* - :code:`InternVLChatModel`
|
* - :code:`InternVLChatModel`
|
||||||
- InternVL2
|
- InternVL2
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
|
- :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`LlavaForConditionalGeneration`
|
* - :code:`LlavaForConditionalGeneration`
|
||||||
- LLaVA-1.5
|
- LLaVA-1.5
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
|
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`LlavaNextForConditionalGeneration`
|
* - :code:`LlavaNextForConditionalGeneration`
|
||||||
- LLaVA-NeXT
|
- LLaVA-NeXT
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
|
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
|
||||||
-
|
-
|
||||||
|
* - :code:`LlavaNextVideoForConditionalGeneration`
|
||||||
|
- LLaVA-NeXT-Video
|
||||||
|
- Video
|
||||||
|
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
|
||||||
|
-
|
||||||
|
* - :code:`MiniCPMV`
|
||||||
|
- MiniCPM-V
|
||||||
|
- Image\ :sup:`+`
|
||||||
|
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
|
||||||
|
-
|
||||||
* - :code:`PaliGemmaForConditionalGeneration`
|
* - :code:`PaliGemmaForConditionalGeneration`
|
||||||
- PaliGemma
|
- PaliGemma
|
||||||
- Image
|
- Image\ :sup:`E`
|
||||||
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
|
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`Phi3VForCausalLM`
|
* - :code:`Phi3VForCausalLM`
|
||||||
- Phi-3-Vision, Phi-3.5-Vision
|
- Phi-3-Vision, Phi-3.5-Vision
|
||||||
- Image
|
- Image\ :sup:`E+`
|
||||||
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
|
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
|
||||||
-
|
-
|
||||||
* - :code:`MiniCPMV`
|
* - :code:`PixtralForConditionalGeneration`
|
||||||
- MiniCPM-V
|
- Pixtral
|
||||||
- Image
|
- Image\ :sup:`+`
|
||||||
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
|
- :code:`mistralai/Pixtral-12B-2409`
|
||||||
|
-
|
||||||
|
* - :code:`QWenLMHeadModel`
|
||||||
|
- Qwen-VL
|
||||||
|
- Image\ :sup:`E`
|
||||||
|
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
|
||||||
|
-
|
||||||
|
* - :code:`Qwen2VLForConditionalGeneration`
|
||||||
|
- Qwen2-VL (see note)
|
||||||
|
- Image\ :sup:`+` / Video\ :sup:`+`
|
||||||
|
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
|
||||||
-
|
-
|
||||||
* - :code:`UltravoxModel`
|
* - :code:`UltravoxModel`
|
||||||
- Ultravox
|
- Ultravox
|
||||||
- Audio
|
- Audio\ :sup:`E+`
|
||||||
- :code:`fixie-ai/ultravox-v0_3`
|
- :code:`fixie-ai/ultravox-v0_3`
|
||||||
-
|
-
|
||||||
|
|
||||||
|
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
|
||||||
|
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
|
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
|
||||||
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
|
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
|
||||||
|
This can be installed by running the following command:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
||||||
|
|||||||
@@ -9,26 +9,23 @@ This document shows you how to run and serve these models using vLLM.
|
|||||||
.. important::
|
.. important::
|
||||||
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
|
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
|
||||||
|
|
||||||
Currently, the support for vision language models on vLLM has the following limitations:
|
|
||||||
|
|
||||||
* Only single image input is supported per text prompt.
|
|
||||||
|
|
||||||
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
|
We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_ if you have any feedback or feature requests.
|
||||||
|
|
||||||
Offline Batched Inference
|
Offline Inference
|
||||||
-------------------------
|
-----------------
|
||||||
|
|
||||||
To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
|
Single-image input
|
||||||
|
^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||||
|
|
||||||
.. important::
|
.. note::
|
||||||
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
||||||
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
|
the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.
|
||||||
internally for each model.
|
|
||||||
|
|
||||||
|
|
||||||
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
|
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
|
||||||
|
|
||||||
@@ -86,61 +83,117 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
|
|||||||
|
|
||||||
A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
|
A code example can be found in `examples/offline_inference_vision_language.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py>`_.
|
||||||
|
|
||||||
|
Multi-image input
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
Online OpenAI Vision API Compatible Inference
|
Multi-image input is only supported for a subset of VLMs, as shown :ref:`here <supported_vlms>`.
|
||||||
----------------------------------------------
|
|
||||||
|
To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
trust_remote_code=True, # Required to load Phi-3.5-vision
|
||||||
|
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
|
||||||
|
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
|
||||||
|
)
|
||||||
|
|
||||||
|
Instead of passing in a single image, you can pass in a list of images.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
|
prompt = "<|user|>\n<image_1>\n<image_2>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
|
||||||
|
|
||||||
|
# Load the images using PIL.Image
|
||||||
|
image1 = PIL.Image.open(...)
|
||||||
|
image2 = PIL.Image.open(...)
|
||||||
|
|
||||||
|
outputs = llm.generate({
|
||||||
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": {
|
||||||
|
"image": [image1, image2]
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
|
||||||
|
A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
|
||||||
|
|
||||||
|
Online Inference
|
||||||
|
----------------
|
||||||
|
|
||||||
|
OpenAI Vision API
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
|
You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
|
||||||
|
|
||||||
.. note::
|
Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server.
|
||||||
Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
|
|
||||||
added in the future.
|
|
||||||
|
|
||||||
Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server.
|
|
||||||
|
|
||||||
.. important::
|
|
||||||
Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template
|
|
||||||
is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the
|
|
||||||
HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
|
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
|
||||||
|
--trust-remote-code --limit-mm-per-prompt image=2
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
|
||||||
the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that
|
a chat template is **required** to launch the API server.
|
||||||
internally for each model.
|
|
||||||
|
Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
|
||||||
|
The chat template can be inferred based on the documentation on the model's HuggingFace repo.
|
||||||
|
For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
|
||||||
|
|
||||||
To consume the server, you can use the OpenAI client like in the example below:
|
To consume the server, you can use the OpenAI client like in the example below:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
openai_api_key = "EMPTY"
|
openai_api_key = "EMPTY"
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
api_key=openai_api_key,
|
api_key=openai_api_key,
|
||||||
base_url=openai_api_base,
|
base_url=openai_api_base,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Single-image input inference
|
||||||
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
|
|
||||||
chat_response = client.chat.completions.create(
|
chat_response = client.chat.completions.create(
|
||||||
model="llava-hf/llava-1.5-7b-hf",
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
messages=[{
|
messages=[{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||||
# since the prompt will be processed automatically by the API server.
|
# since the prompt will be processed automatically by the API server.
|
||||||
{"type": "text", "text": "What's in this image?"},
|
{"type": "text", "text": "What’s in this image?"},
|
||||||
{
|
{"type": "image_url", "image_url": {"url": image_url}},
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {
|
|
||||||
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
],
|
||||||
}],
|
}],
|
||||||
)
|
)
|
||||||
print("Chat response:", chat_response)
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
|
||||||
|
# Multi-image input inference
|
||||||
|
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||||
|
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||||
|
|
||||||
|
chat_response = client.chat.completions.create(
|
||||||
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
messages=[{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "What are the animals in these images?"},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url_duck}},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url_lion}},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
|
||||||
|
|
||||||
A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
|
A full code example can be found in `examples/openai_vision_api_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_vision_api_client.py>`_.
|
||||||
|
|
||||||
|
|||||||
@@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations
|
|||||||
- ✗
|
- ✗
|
||||||
- ✗
|
- ✗
|
||||||
- ✗
|
- ✗
|
||||||
* - SqueezeLLM
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✅︎
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
- ✗
|
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
^^^^^^
|
^^^^^^
|
||||||
|
|||||||
@@ -10,3 +10,22 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
|
|||||||
Q: Which model to use for offline inference embedding?
|
Q: Which model to use for offline inference embedding?
|
||||||
|
|
||||||
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
|
A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
|
||||||
|
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
Q: Can the output of a prompt vary across runs in vLLM?
|
||||||
|
|
||||||
|
A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to
|
||||||
|
numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details,
|
||||||
|
see the `Numerical Accuracy section <https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations>`_.
|
||||||
|
|
||||||
|
In vLLM, the same requests might be batched differently due to factors such as other concurrent requests,
|
||||||
|
changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations,
|
||||||
|
can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in
|
||||||
|
different tokens being sampled. Once a different token is sampled, further divergence is likely.
|
||||||
|
|
||||||
|
**Mitigation Strategies**
|
||||||
|
|
||||||
|
- For improved stability and reduced variance, use `float32`. Note that this will require more memory.
|
||||||
|
- If using `bfloat16`, switching to `float16` can also help.
|
||||||
|
- Using request seeds can aid in achieving more stable generation for temperature > 0, but discrepancies due to precision differences may still occur.
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various
|
|||||||
|
|
||||||
python3 benchmarks/benchmark_throughput.py --help
|
python3 benchmarks/benchmark_throughput.py --help
|
||||||
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
|
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
|
||||||
[--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
|
[--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
|
||||||
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
|
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
|
||||||
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
|
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
|
||||||
[--quantization-param-path KV_CACHE_quantization_param_path]
|
[--quantization-param-path KV_CACHE_quantization_param_path]
|
||||||
@@ -76,7 +76,7 @@ optional arguments:
|
|||||||
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
|
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
|
||||||
--model MODEL
|
--model MODEL
|
||||||
--tokenizer TOKENIZER
|
--tokenizer TOKENIZER
|
||||||
--quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
|
--quantization {awq,gptq,None}, -q {awq,gptq,None}
|
||||||
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
|
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
|
||||||
--n N Number of generated sequences per prompt.
|
--n N Number of generated sequences per prompt.
|
||||||
--use-beam-search
|
--use-beam-search
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
### Quantizer Utilities
|
### Quantizer Utilities
|
||||||
`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
|
`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported
|
||||||
`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
|
from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py)
|
||||||
|
|
||||||
### Prerequisite
|
### Prerequisite
|
||||||
|
|
||||||
|
|||||||
164
examples/offline_inference_pixtral.py
Normal file
164
examples/offline_inference_pixtral.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
# ruff: noqa
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from vllm import LLM
|
||||||
|
from vllm.sampling_params import SamplingParams
|
||||||
|
|
||||||
|
# This script is an offline demo for running Pixtral.
|
||||||
|
#
|
||||||
|
# If you want to run a server/client setup, please follow this code:
|
||||||
|
#
|
||||||
|
# - Server:
|
||||||
|
#
|
||||||
|
# ```bash
|
||||||
|
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
|
||||||
|
# ```
|
||||||
|
#
|
||||||
|
# - Client:
|
||||||
|
#
|
||||||
|
# ```bash
|
||||||
|
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
|
||||||
|
# --header 'Content-Type: application/json' \
|
||||||
|
# --header 'Authorization: Bearer token' \
|
||||||
|
# --data '{
|
||||||
|
# "model": "mistralai/Pixtral-12B-2409",
|
||||||
|
# "messages": [
|
||||||
|
# {
|
||||||
|
# "role": "user",
|
||||||
|
# "content": [
|
||||||
|
# {"type" : "text", "text": "Describe this image in detail please."},
|
||||||
|
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
|
||||||
|
# {"type" : "text", "text": "and this one as well. Answer in French."},
|
||||||
|
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
|
||||||
|
# ]
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
# }'
|
||||||
|
# ```
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python demo.py simple
|
||||||
|
# python demo.py advanced
|
||||||
|
|
||||||
|
|
||||||
|
def run_simple_demo():
|
||||||
|
model_name = "mistralai/Pixtral-12B-2409"
|
||||||
|
sampling_params = SamplingParams(max_tokens=8192)
|
||||||
|
|
||||||
|
llm = LLM(model=model_name, tokenizer_mode="mistral")
|
||||||
|
|
||||||
|
prompt = "Describe this image in one sentence."
|
||||||
|
image_url = "https://picsum.photos/id/237/200/300"
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": prompt
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": image_url
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
outputs = llm.chat(messages, sampling_params=sampling_params)
|
||||||
|
|
||||||
|
print(outputs[0].outputs[0].text)
|
||||||
|
|
||||||
|
|
||||||
|
def run_advanced_demo():
|
||||||
|
model_name = "mistralai/Pixtral-12B-2409"
|
||||||
|
max_img_per_msg = 5
|
||||||
|
max_tokens_per_img = 4096
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
|
||||||
|
llm = LLM(
|
||||||
|
model=model_name,
|
||||||
|
tokenizer_mode="mistral",
|
||||||
|
limit_mm_per_prompt={"image": max_img_per_msg},
|
||||||
|
max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = "Describe the following image."
|
||||||
|
|
||||||
|
url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
|
||||||
|
url_2 = "https://picsum.photos/seed/picsum/200/300"
|
||||||
|
url_3 = "https://picsum.photos/id/32/512/512"
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": prompt
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": url_1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": url_2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "The images show nature.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "More details please and answer only in French!.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": url_3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
|
||||||
|
print(outputs[0].outputs[0].text)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Run a demo in simple or advanced mode.")
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"mode",
|
||||||
|
choices=["simple", "advanced"],
|
||||||
|
help="Specify the demo mode: 'simple' or 'advanced'",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.mode == "simple":
|
||||||
|
print("Running simple demo...")
|
||||||
|
run_simple_demo()
|
||||||
|
elif args.mode == "advanced":
|
||||||
|
print("Running advanced demo...")
|
||||||
|
run_advanced_demo()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -9,12 +9,9 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# Input image and question
|
|
||||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
|
||||||
question = "What is the content of this image?"
|
|
||||||
|
|
||||||
|
|
||||||
# LLaVA-1.5
|
# LLaVA-1.5
|
||||||
def run_llava(question):
|
def run_llava(question):
|
||||||
@@ -30,7 +27,16 @@ def run_llava(question):
|
|||||||
def run_llava_next(question):
|
def run_llava_next(question):
|
||||||
|
|
||||||
prompt = f"[INST] <image>\n{question} [/INST]"
|
prompt = f"[INST] <image>\n{question} [/INST]"
|
||||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
|
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
|
||||||
|
stop_token_ids = None
|
||||||
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
|
# LlaVA-NeXT-Video
|
||||||
|
# Currently only support for video input
|
||||||
|
def run_llava_next_video(question):
|
||||||
|
prompt = f"USER: <video>\n{question} ASSISTANT:"
|
||||||
|
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
|
||||||
stop_token_ids = None
|
stop_token_ids = None
|
||||||
return llm, prompt, stop_token_ids
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
@@ -159,9 +165,41 @@ def run_blip2(question):
|
|||||||
return llm, prompt, stop_token_ids
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
|
# Qwen
|
||||||
|
def run_qwen_vl(question):
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="Qwen/Qwen-VL",
|
||||||
|
trust_remote_code=True,
|
||||||
|
max_num_seqs=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = f"{question}Picture 1: <img></img>\n"
|
||||||
|
stop_token_ids = None
|
||||||
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
|
# Qwen2-VL
|
||||||
|
def run_qwen2_vl(question):
|
||||||
|
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model=model_name,
|
||||||
|
max_num_seqs=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||||
|
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
|
||||||
|
f"{question}<|im_end|>\n"
|
||||||
|
"<|im_start|>assistant\n")
|
||||||
|
stop_token_ids = None
|
||||||
|
return llm, prompt, stop_token_ids
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
"llava": run_llava,
|
"llava": run_llava,
|
||||||
"llava-next": run_llava_next,
|
"llava-next": run_llava_next,
|
||||||
|
"llava-next-video": run_llava_next_video,
|
||||||
"fuyu": run_fuyu,
|
"fuyu": run_fuyu,
|
||||||
"phi3_v": run_phi3v,
|
"phi3_v": run_phi3v,
|
||||||
"paligemma": run_paligemma,
|
"paligemma": run_paligemma,
|
||||||
@@ -169,14 +207,54 @@ model_example_map = {
|
|||||||
"minicpmv": run_minicpmv,
|
"minicpmv": run_minicpmv,
|
||||||
"blip-2": run_blip2,
|
"blip-2": run_blip2,
|
||||||
"internvl_chat": run_internvl,
|
"internvl_chat": run_internvl,
|
||||||
|
"qwen_vl": run_qwen_vl,
|
||||||
|
"qwen2_vl": run_qwen2_vl,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_multi_modal_input(args):
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"data": image or video,
|
||||||
|
"question": question,
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
if args.modality == "image":
|
||||||
|
# Input image and question
|
||||||
|
image = ImageAsset("cherry_blossom") \
|
||||||
|
.pil_image.convert("RGB")
|
||||||
|
img_question = "What is the content of this image?"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"data": image,
|
||||||
|
"question": img_question,
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.modality == "video":
|
||||||
|
# Input video and question
|
||||||
|
video = VideoAsset(name="sample_demo_1.mp4",
|
||||||
|
num_frames=args.num_frames).np_ndarrays
|
||||||
|
vid_question = "Why is this video funny?"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"data": video,
|
||||||
|
"question": vid_question,
|
||||||
|
}
|
||||||
|
|
||||||
|
msg = f"Modality {args.modality} is not supported."
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
model = args.model_type
|
model = args.model_type
|
||||||
if model not in model_example_map:
|
if model not in model_example_map:
|
||||||
raise ValueError(f"Model type {model} is not supported.")
|
raise ValueError(f"Model type {model} is not supported.")
|
||||||
|
|
||||||
|
modality = args.modality
|
||||||
|
mm_input = get_multi_modal_input(args)
|
||||||
|
data = mm_input["data"]
|
||||||
|
question = mm_input["question"]
|
||||||
|
|
||||||
llm, prompt, stop_token_ids = model_example_map[model](question)
|
llm, prompt, stop_token_ids = model_example_map[model](question)
|
||||||
|
|
||||||
# We set temperature to 0.2 so that outputs can be different
|
# We set temperature to 0.2 so that outputs can be different
|
||||||
@@ -191,7 +269,7 @@ def main(args):
|
|||||||
inputs = {
|
inputs = {
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"multi_modal_data": {
|
"multi_modal_data": {
|
||||||
"image": image
|
modality: data
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,7 +278,7 @@ def main(args):
|
|||||||
inputs = [{
|
inputs = [{
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"multi_modal_data": {
|
"multi_modal_data": {
|
||||||
"image": image
|
modality: data
|
||||||
},
|
},
|
||||||
} for _ in range(args.num_prompts)]
|
} for _ in range(args.num_prompts)]
|
||||||
|
|
||||||
@@ -223,8 +301,15 @@ if __name__ == "__main__":
|
|||||||
help='Huggingface "model_type".')
|
help='Huggingface "model_type".')
|
||||||
parser.add_argument('--num-prompts',
|
parser.add_argument('--num-prompts',
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=4,
|
||||||
help='Number of prompts to run.')
|
help='Number of prompts to run.')
|
||||||
|
parser.add_argument('--modality',
|
||||||
|
type=str,
|
||||||
|
default="image",
|
||||||
|
help='Modality of the input.')
|
||||||
|
parser.add_argument('--num-frames',
|
||||||
|
type=int,
|
||||||
|
default=16,
|
||||||
|
help='Number of frames to extract from the video.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
207
examples/offline_inference_vision_language_multi_image.py
Normal file
207
examples/offline_inference_vision_language_multi_image.py
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
"""
|
||||||
|
This example shows how to use vLLM for running offline inference with
|
||||||
|
multi-image input on vision language models, using the chat template defined
|
||||||
|
by the model.
|
||||||
|
"""
|
||||||
|
from argparse import Namespace
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from transformers import AutoProcessor, AutoTokenizer
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.multimodal.utils import fetch_image
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
QUESTION = "What is the content of each image?"
|
||||||
|
IMAGE_URLS = [
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_phi3v(question, image_urls: List[str]):
|
||||||
|
llm = LLM(
|
||||||
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
trust_remote_code=True,
|
||||||
|
max_model_len=4096,
|
||||||
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
)
|
||||||
|
placeholders = "\n".join(f"<|image_{i}|>"
|
||||||
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
|
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||||
|
stop_token_ids = None
|
||||||
|
return llm, prompt, stop_token_ids, None
|
||||||
|
|
||||||
|
|
||||||
|
def load_internvl(question, image_urls: List[str]):
|
||||||
|
model_name = "OpenGVLab/InternVL2-2B"
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model=model_name,
|
||||||
|
trust_remote_code=True,
|
||||||
|
max_num_seqs=5,
|
||||||
|
max_model_len=4096,
|
||||||
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
)
|
||||||
|
|
||||||
|
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||||
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
|
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||||
|
trust_remote_code=True)
|
||||||
|
prompt = tokenizer.apply_chat_template(messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=True)
|
||||||
|
|
||||||
|
# Stop tokens for InternVL
|
||||||
|
# models variants may have different stop tokens
|
||||||
|
# please refer to the model card for the correct "stop words":
|
||||||
|
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
|
||||||
|
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||||
|
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||||
|
|
||||||
|
return llm, prompt, stop_token_ids, None
|
||||||
|
|
||||||
|
|
||||||
|
def load_qwen2_vl(question, image_urls: List[str]):
|
||||||
|
try:
|
||||||
|
from qwen_vl_utils import process_vision_info
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
print('WARNING: `qwen-vl-utils` not installed, input images will not '
|
||||||
|
'be automatically resized. You can enable this functionality by '
|
||||||
|
'`pip install qwen-vl-utils`.')
|
||||||
|
process_vision_info = None
|
||||||
|
|
||||||
|
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model=model_name,
|
||||||
|
max_num_seqs=5,
|
||||||
|
max_model_len=32768 if process_vision_info is None else 4096,
|
||||||
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
)
|
||||||
|
|
||||||
|
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||||
|
messages = [{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful assistant."
|
||||||
|
}, {
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
*placeholders,
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": question
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}]
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(model_name)
|
||||||
|
|
||||||
|
prompt = processor.apply_chat_template(messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=True)
|
||||||
|
|
||||||
|
stop_token_ids = None
|
||||||
|
|
||||||
|
if process_vision_info is None:
|
||||||
|
image_data = [fetch_image(url) for url in image_urls]
|
||||||
|
else:
|
||||||
|
image_data, _ = process_vision_info(messages)
|
||||||
|
|
||||||
|
return llm, prompt, stop_token_ids, image_data
|
||||||
|
|
||||||
|
|
||||||
|
model_example_map = {
|
||||||
|
"phi3_v": load_phi3v,
|
||||||
|
"internvl_chat": load_internvl,
|
||||||
|
"qwen2_vl": load_qwen2_vl,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_generate(model, question: str, image_urls: List[str]):
|
||||||
|
llm, prompt, stop_token_ids, image_data = model_example_map[model](
|
||||||
|
question, image_urls)
|
||||||
|
if image_data is None:
|
||||||
|
image_data = [fetch_image(url) for url in image_urls]
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(temperature=0.0,
|
||||||
|
max_tokens=128,
|
||||||
|
stop_token_ids=stop_token_ids)
|
||||||
|
|
||||||
|
outputs = llm.generate(
|
||||||
|
{
|
||||||
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": {
|
||||||
|
"image": image_data
|
||||||
|
},
|
||||||
|
},
|
||||||
|
sampling_params=sampling_params)
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
|
||||||
|
|
||||||
|
def run_chat(model: str, question: str, image_urls: List[str]):
|
||||||
|
llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(temperature=0.0,
|
||||||
|
max_tokens=128,
|
||||||
|
stop_token_ids=stop_token_ids)
|
||||||
|
|
||||||
|
outputs = llm.chat([{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": question,
|
||||||
|
},
|
||||||
|
*({
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": image_url
|
||||||
|
},
|
||||||
|
} for image_url in image_urls),
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
sampling_params=sampling_params)
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Namespace):
|
||||||
|
model = args.model_type
|
||||||
|
method = args.method
|
||||||
|
|
||||||
|
if method == "generate":
|
||||||
|
run_generate(model, QUESTION, IMAGE_URLS)
|
||||||
|
elif method == "chat":
|
||||||
|
run_chat(model, QUESTION, IMAGE_URLS)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid method: {method}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = FlexibleArgumentParser(
|
||||||
|
description='Demo on using vLLM for offline inference with '
|
||||||
|
'vision language models that support multi-image input')
|
||||||
|
parser.add_argument('--model-type',
|
||||||
|
'-m',
|
||||||
|
type=str,
|
||||||
|
default="phi3_v",
|
||||||
|
choices=model_example_map.keys(),
|
||||||
|
help='Huggingface "model_type".')
|
||||||
|
parser.add_argument("--method",
|
||||||
|
type=str,
|
||||||
|
default="generate",
|
||||||
|
choices=["generate", "chat"],
|
||||||
|
help="The method to run in `vllm.LLM`.")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
33
examples/offline_inference_with_profiler.py
Normal file
33
examples/offline_inference_with_profiler.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
# enable torch profiler, can also be set on cmd line
|
||||||
|
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
|
||||||
|
|
||||||
|
# Sample prompts.
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
# Create a sampling params object.
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
|
# Create an LLM.
|
||||||
|
llm = LLM(model="facebook/opt-125m")
|
||||||
|
|
||||||
|
llm.start_profile()
|
||||||
|
|
||||||
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||||
|
# that contain the prompt, generated text, and other information.
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
llm.stop_profile()
|
||||||
|
|
||||||
|
# Print the outputs.
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
@@ -27,9 +27,10 @@ client = OpenAI(
|
|||||||
models = client.models.list()
|
models = client.models.list()
|
||||||
model = models.data[0].id
|
model = models.data[0].id
|
||||||
|
|
||||||
|
# Single-image input inference
|
||||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
|
|
||||||
# Use image url in the payload
|
## Use image url in the payload
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
messages=[{
|
messages=[{
|
||||||
"role":
|
"role":
|
||||||
@@ -52,10 +53,10 @@ chat_completion_from_url = client.chat.completions.create(
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print(f"Chat completion output:{result}")
|
print("Chat completion output:", result)
|
||||||
|
|
||||||
|
|
||||||
# Use base64 encoded image in the payload
|
## Use base64 encoded image in the payload
|
||||||
def encode_image_base64_from_url(image_url: str) -> str:
|
def encode_image_base64_from_url(image_url: str) -> str:
|
||||||
"""Encode an image retrieved from a remote url to base64 format."""
|
"""Encode an image retrieved from a remote url to base64 format."""
|
||||||
|
|
||||||
@@ -122,4 +123,4 @@ chat_completion_from_url = client.chat.completions.create(
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print(f"Chat completion output:{result}")
|
print("Chat completion output:", result)
|
||||||
|
|||||||
@@ -89,22 +89,23 @@
|
|||||||
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||||
{%- elif message.role == "assistant" and message.tool_calls is defined %}
|
{%- elif message.role == "assistant" and message.tool_calls is defined %}
|
||||||
{{- '<|im_start|>' + message.role }}
|
{{- '<|im_start|>' + message.role }}
|
||||||
{%- for tool_call in message.tool_calls %}
|
{%- for tool_call in message.tool_calls %}
|
||||||
{{- '\n<tool_call>\n' }}
|
{{- '\n<tool_call>\n' }}
|
||||||
{%- if tool_call.function is defined %}
|
{%- if tool_call.function is defined %}
|
||||||
{%- set tool_call = tool_call.function %}
|
{%- set tool_call = tool_call.function %}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
{{- '{' }}
|
{{- '{' }}
|
||||||
{{- '"name": "' }}
|
{{- '"name": "' }}
|
||||||
{{- tool_call.name }}
|
{{- tool_call.name }}
|
||||||
{{- '"}' }}
|
{{- '"' }}
|
||||||
|
{%- if tool_call.arguments is defined %}
|
||||||
{{- ', ' }}
|
{{- ', ' }}
|
||||||
{%- if tool_call.arguments is defined %}
|
{{- '"arguments": ' }}
|
||||||
{{- '"arguments": ' }}
|
{{- tool_call.arguments|tojson }}
|
||||||
{{- tool_call.arguments|tojson }}
|
{%- endif %}
|
||||||
{%- endif %}
|
{{- '}' }}
|
||||||
{{- '\n</tool_call>' }}
|
{{- '\n</tool_call>' }}
|
||||||
{%- endfor %}
|
{%- endfor %}
|
||||||
{{- '<|im_end|>\n' }}
|
{{- '<|im_end|>\n' }}
|
||||||
{%- elif message.role == "tool" %}
|
{%- elif message.role == "tool" %}
|
||||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
# Dependencies for Ray accelerated DAG
|
|
||||||
cupy-cuda12x
|
|
||||||
ray >= 2.32
|
|
||||||
@@ -9,7 +9,7 @@ tokenizers >= 0.19.1 # Required for Llama 3.
|
|||||||
protobuf # Required by LlamaTokenizer.
|
protobuf # Required by LlamaTokenizer.
|
||||||
fastapi
|
fastapi
|
||||||
aiohttp
|
aiohttp
|
||||||
openai >= 1.0 # Ensure modern openai package (ensure types module present)
|
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
|
||||||
uvicorn[standard]
|
uvicorn[standard]
|
||||||
pydantic >= 2.8 # Required for OpenAI server.
|
pydantic >= 2.8 # Required for OpenAI server.
|
||||||
pillow # Required for image processing
|
pillow # Required for image processing
|
||||||
@@ -25,5 +25,7 @@ pyzmq
|
|||||||
msgspec
|
msgspec
|
||||||
gguf == 0.9.1
|
gguf == 0.9.1
|
||||||
importlib_metadata
|
importlib_metadata
|
||||||
mistral_common >= 1.3.4
|
mistral_common >= 1.4.0
|
||||||
pyyaml
|
pyyaml
|
||||||
|
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||||
|
einops # Required for Qwen2-VL.
|
||||||
|
|||||||
@@ -1,6 +1,3 @@
|
|||||||
# Needed for Ray accelerated DAG tests
|
|
||||||
-r requirements-adag.txt
|
|
||||||
|
|
||||||
# testing
|
# testing
|
||||||
pytest
|
pytest
|
||||||
tensorizer>=2.9.0
|
tensorizer>=2.9.0
|
||||||
@@ -14,9 +11,10 @@ awscli
|
|||||||
einops # required for MPT, qwen-vl and Mamba
|
einops # required for MPT, qwen-vl and Mamba
|
||||||
httpx
|
httpx
|
||||||
librosa # required for audio test
|
librosa # required for audio test
|
||||||
|
opencv-python # required for video test
|
||||||
peft
|
peft
|
||||||
requests
|
requests
|
||||||
ray
|
ray[adag]>=2.35
|
||||||
sentence-transformers # required for embedding
|
sentence-transformers # required for embedding
|
||||||
soundfile # required for audio test
|
soundfile # required for audio test
|
||||||
compressed-tensors==0.4.0 # required for compressed-tensors
|
compressed-tensors==0.4.0 # required for compressed-tensors
|
||||||
|
|||||||
6
setup.py
6
setup.py
@@ -170,14 +170,17 @@ class cmake_build_ext(build_ext):
|
|||||||
|
|
||||||
if is_sccache_available():
|
if is_sccache_available():
|
||||||
cmake_args += [
|
cmake_args += [
|
||||||
|
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
|
||||||
'-DCMAKE_C_COMPILER_LAUNCHER=sccache',
|
'-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
|
||||||
]
|
]
|
||||||
elif is_ccache_available():
|
elif is_ccache_available():
|
||||||
cmake_args += [
|
cmake_args += [
|
||||||
|
'-DCMAKE_C_COMPILER_LAUNCHER=ccache',
|
||||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
|
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
|
||||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
|
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
|
||||||
|
'-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
|
||||||
]
|
]
|
||||||
|
|
||||||
# Pass the python executable to cmake so it can find an exact
|
# Pass the python executable to cmake so it can find an exact
|
||||||
@@ -502,6 +505,7 @@ setup(
|
|||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
extras_require={
|
extras_require={
|
||||||
"tensorizer": ["tensorizer>=2.9.0"],
|
"tensorizer": ["tensorizer>=2.9.0"],
|
||||||
|
"video": ["opencv-python"], # Required for video processing
|
||||||
"audio": ["librosa", "soundfile"] # Required for audio processing
|
"audio": ["librosa", "soundfile"] # Required for audio processing
|
||||||
},
|
},
|
||||||
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
|
from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
|
||||||
|
load_chat_template)
|
||||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
@@ -87,7 +88,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
|
|||||||
add_generation_prompt=add_generation_prompt)
|
add_generation_prompt=add_generation_prompt)
|
||||||
|
|
||||||
# Call the function and get the result
|
# Call the function and get the result
|
||||||
result = apply_chat_template(
|
result = apply_hf_chat_template(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
conversation=mock_request.messages,
|
conversation=mock_request.messages,
|
||||||
chat_template=mock_request.chat_template or template_content,
|
chat_template=mock_request.chat_template or template_content,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from typing import Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispacther
|
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||||
|
|
||||||
|
|
||||||
class MyMod(torch.nn.Module):
|
class MyMod(torch.nn.Module):
|
||||||
@@ -13,7 +13,7 @@ class MyMod(torch.nn.Module):
|
|||||||
return x * 2
|
return x * 2
|
||||||
|
|
||||||
|
|
||||||
class MyWrapper(TorchCompileWrapperWithCustomDispacther):
|
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||||
|
|
||||||
def __init__(self, model):
|
def __init__(self, model):
|
||||||
self.model = model
|
self.model = model
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import TokenizerPoolConfig
|
from vllm.config import TokenizerPoolConfig
|
||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
from vllm.distributed import (destroy_distributed_environment,
|
from vllm.distributed import (destroy_distributed_environment,
|
||||||
@@ -44,6 +45,7 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
|
|||||||
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
|
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
|
||||||
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
|
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
|
||||||
List[List[Tuple[np.ndarray, int]]]]
|
List[List[Tuple[np.ndarray, int]]]]
|
||||||
|
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
|
||||||
|
|
||||||
|
|
||||||
def _read_prompts(filename: str) -> List[str]:
|
def _read_prompts(filename: str) -> List[str]:
|
||||||
@@ -85,8 +87,35 @@ class _ImageAssets(_ImageAssetsBase):
|
|||||||
return [prompts["stop_sign"], prompts["cherry_blossom"]]
|
return [prompts["stop_sign"], prompts["cherry_blossom"]]
|
||||||
|
|
||||||
|
|
||||||
|
class _VideoAssetPrompts(TypedDict):
|
||||||
|
sample_demo_1: str
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info < (3, 9):
|
||||||
|
# UserList cannot be subscripted
|
||||||
|
class _VideoAssetsBase(UserList):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
|
||||||
|
class _VideoAssetsBase(UserList[VideoAsset]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class _VideoAssets(_VideoAssetsBase):
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__([
|
||||||
|
VideoAsset("sample_demo_1.mp4"),
|
||||||
|
])
|
||||||
|
|
||||||
|
def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
|
||||||
|
return [prompts["sample_demo_1"]]
|
||||||
|
|
||||||
|
|
||||||
IMAGE_ASSETS = _ImageAssets()
|
IMAGE_ASSETS = _ImageAssets()
|
||||||
"""Singleton instance of :class:`_ImageAssets`."""
|
"""Singleton instance of :class:`_ImageAssets`."""
|
||||||
|
VIDEO_ASSETS = _VideoAssets()
|
||||||
|
"""Singleton instance of :class:`_VideoAssets`."""
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -202,6 +231,11 @@ def image_assets() -> _ImageAssets:
|
|||||||
return IMAGE_ASSETS
|
return IMAGE_ASSETS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def video_assets() -> _VideoAssets:
|
||||||
|
return VIDEO_ASSETS
|
||||||
|
|
||||||
|
|
||||||
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
|
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
|
||||||
|
|
||||||
|
|
||||||
@@ -278,7 +312,8 @@ class HfRunner:
|
|||||||
def generate(
|
def generate(
|
||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[List[int]], List[str]]]:
|
) -> List[Tuple[List[List[int]], List[str]]]:
|
||||||
if images:
|
if images:
|
||||||
@@ -292,6 +327,8 @@ class HfRunner:
|
|||||||
}
|
}
|
||||||
if images is not None and images[i] is not None:
|
if images is not None and images[i] is not None:
|
||||||
processor_kwargs["images"] = images[i]
|
processor_kwargs["images"] = images[i]
|
||||||
|
if videos is not None and videos[i] is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
@@ -314,7 +351,7 @@ class HfRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[int], str]]:
|
) -> List[Tuple[List[int], str]]:
|
||||||
outputs = self.generate(prompts,
|
outputs = self.generate(prompts,
|
||||||
@@ -351,7 +388,8 @@ class HfRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[List[torch.Tensor]]:
|
) -> List[List[torch.Tensor]]:
|
||||||
all_logprobs: List[List[torch.Tensor]] = []
|
all_logprobs: List[List[torch.Tensor]] = []
|
||||||
@@ -362,6 +400,8 @@ class HfRunner:
|
|||||||
}
|
}
|
||||||
if images is not None and images[i] is not None:
|
if images is not None and images[i] is not None:
|
||||||
processor_kwargs["images"] = images[i]
|
processor_kwargs["images"] = images[i]
|
||||||
|
if videos is not None and videos[i] is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
@@ -433,8 +473,9 @@ class HfRunner:
|
|||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[List[Tuple[np.ndarray, int]]] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[List[np.ndarray]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
|
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
|
||||||
all_logprobs: List[List[Dict[int, float]]] = []
|
all_logprobs: List[List[Dict[int, float]]] = []
|
||||||
@@ -454,6 +495,8 @@ class HfRunner:
|
|||||||
processor_kwargs["audio"] = audio
|
processor_kwargs["audio"] = audio
|
||||||
processor_kwargs["sampling_rate"] = sr
|
processor_kwargs["sampling_rate"] = sr
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
processor_kwargs["videos"] = videos[i]
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs)
|
||||||
|
|
||||||
@@ -634,12 +677,16 @@ class VllmRunner:
|
|||||||
sampling_params: SamplingParams,
|
sampling_params: SamplingParams,
|
||||||
images: Optional[PromptImageInput] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[PromptAudioInput] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[PromptVideoInput] = None,
|
||||||
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
||||||
assert sampling_params.logprobs is not None
|
assert sampling_params.logprobs is not None
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
assert len(prompts) == len(images)
|
assert len(prompts) == len(images)
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
assert len(prompts) == len(videos)
|
||||||
|
|
||||||
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
|
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
|
||||||
if images is not None:
|
if images is not None:
|
||||||
for i, image in enumerate(images):
|
for i, image in enumerate(images):
|
||||||
@@ -649,6 +696,11 @@ class VllmRunner:
|
|||||||
for i, audio in enumerate(audios):
|
for i, audio in enumerate(audios):
|
||||||
inputs[i]["multi_modal_data"] = {"audio": audio}
|
inputs[i]["multi_modal_data"] = {"audio": audio}
|
||||||
|
|
||||||
|
if videos is not None:
|
||||||
|
for i, video in enumerate(videos):
|
||||||
|
inputs[i]["multi_modal_data"] = {"video": video}
|
||||||
|
print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
|
||||||
|
|
||||||
req_outputs = self.model.generate(inputs,
|
req_outputs = self.model.generate(inputs,
|
||||||
sampling_params=sampling_params)
|
sampling_params=sampling_params)
|
||||||
return self._final_steps_generate_w_logprobs(req_outputs)
|
return self._final_steps_generate_w_logprobs(req_outputs)
|
||||||
@@ -671,7 +723,7 @@ class VllmRunner:
|
|||||||
self,
|
self,
|
||||||
prompts: List[str],
|
prompts: List[str],
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
images: Optional[List[Image.Image]] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
) -> List[Tuple[List[int], str]]:
|
) -> List[Tuple[List[int], str]]:
|
||||||
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
||||||
outputs = self.generate(prompts, greedy_params, images=images)
|
outputs = self.generate(prompts, greedy_params, images=images)
|
||||||
@@ -685,6 +737,7 @@ class VllmRunner:
|
|||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
images: Optional[PromptImageInput] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
audios: Optional[PromptAudioInput] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
|
videos: Optional[PromptVideoInput] = None,
|
||||||
stop_token_ids: Optional[List[int]] = None,
|
stop_token_ids: Optional[List[int]] = None,
|
||||||
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
|
||||||
greedy_logprobs_params = SamplingParams(temperature=0.0,
|
greedy_logprobs_params = SamplingParams(temperature=0.0,
|
||||||
@@ -694,7 +747,8 @@ class VllmRunner:
|
|||||||
outputs = self.generate_w_logprobs(prompts,
|
outputs = self.generate_w_logprobs(prompts,
|
||||||
greedy_logprobs_params,
|
greedy_logprobs_params,
|
||||||
images=images,
|
images=images,
|
||||||
audios=audios)
|
audios=audios,
|
||||||
|
videos=videos)
|
||||||
|
|
||||||
return [(output_ids, output_str, output_logprobs)
|
return [(output_ids, output_str, output_logprobs)
|
||||||
for output_ids, output_str, output_logprobs in outputs]
|
for output_ids, output_str, output_logprobs in outputs]
|
||||||
|
|||||||
@@ -35,9 +35,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
|
|||||||
if model.startswith("llava-hf/llava-1.5"):
|
if model.startswith("llava-hf/llava-1.5"):
|
||||||
from ..models.test_llava import models, run_test
|
from ..models.test_llava import models, run_test
|
||||||
elif model.startswith("llava-hf/llava-v1.6"):
|
elif model.startswith("llava-hf/llava-v1.6"):
|
||||||
from ..models.test_llava_next import models, run_test
|
from ..models.test_llava_next import run_test # type: ignore[no-redef]
|
||||||
|
from ..models.test_llava_next import models
|
||||||
elif model.startswith("facebook/chameleon"):
|
elif model.startswith("facebook/chameleon"):
|
||||||
from ..models.test_chameleon import models, run_test
|
from ..models.test_chameleon import run_test # type: ignore[no-redef]
|
||||||
|
from ..models.test_chameleon import models
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Unsupported model: {model}")
|
raise NotImplementedError(f"Unsupported model: {model}")
|
||||||
|
|
||||||
|
|||||||
@@ -18,23 +18,28 @@ logger = init_logger("test_pipeline_parallel")
|
|||||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
|
@pytest.mark.parametrize(
|
||||||
"MODEL_NAME, DIST_BACKEND"),
|
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
|
||||||
[
|
"MODEL_NAME, DIST_BACKEND"),
|
||||||
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
|
[
|
||||||
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
|
||||||
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
|
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
])
|
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
|
||||||
|
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
|
||||||
|
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
|
||||||
|
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
|
||||||
|
],
|
||||||
|
)
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
|
||||||
DIST_BACKEND):
|
TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
|
||||||
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
|
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
|
||||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||||
"multiprocessing distributed backend")
|
"multiprocessing distributed backend")
|
||||||
@@ -43,6 +48,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
"float16",
|
"float16",
|
||||||
|
"--max-model-len",
|
||||||
|
"8192",
|
||||||
"--pipeline-parallel-size",
|
"--pipeline-parallel-size",
|
||||||
str(PP_SIZE),
|
str(PP_SIZE),
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -59,7 +66,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
tp_args = [
|
tp_args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
"bfloat16",
|
"float16",
|
||||||
|
"--max-model-len",
|
||||||
|
"8192",
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
|
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
|
||||||
"--distributed-executor-backend",
|
"--distributed-executor-backend",
|
||||||
@@ -71,6 +80,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
|
|||||||
if EAGER_MODE:
|
if EAGER_MODE:
|
||||||
pp_args.append("--enforce-eager")
|
pp_args.append("--enforce-eager")
|
||||||
tp_args.append("--enforce-eager")
|
tp_args.append("--enforce-eager")
|
||||||
|
if TRUST_REMOTE_CODE:
|
||||||
|
pp_args.append("--trust-remote-code")
|
||||||
|
tp_args.append("--trust-remote-code")
|
||||||
pp_env = None
|
pp_env = None
|
||||||
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
|
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
|
||||||
and CHUNKED_PREFILL):
|
and CHUNKED_PREFILL):
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ def test_local_workers() -> None:
|
|||||||
workers[3].process.kill()
|
workers[3].process.kill()
|
||||||
|
|
||||||
# Other workers should get shut down here
|
# Other workers should get shut down here
|
||||||
worker_monitor.join(2)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
|
|||||||
# Clean shutdown
|
# Clean shutdown
|
||||||
worker_monitor.close()
|
worker_monitor.close()
|
||||||
|
|
||||||
worker_monitor.join(5)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
|
|||||||
workers[3].process.kill()
|
workers[3].process.kill()
|
||||||
|
|
||||||
# Other workers should get shut down here
|
# Other workers should get shut down here
|
||||||
worker_monitor.join(2)
|
worker_monitor.join(20)
|
||||||
|
|
||||||
# Ensure everything is stopped
|
# Ensure everything is stopped
|
||||||
assert not worker_monitor.is_alive()
|
assert not worker_monitor.is_alive()
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ def zephyr_lora_files():
|
|||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
|
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
|
||||||
lora_request = [
|
lora_request = [
|
||||||
LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
|
LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
|
||||||
for idx in range(len(PROMPTS))
|
for idx in range(len(PROMPTS))
|
||||||
]
|
]
|
||||||
# Multiple SamplingParams should be matched with each prompt
|
# Multiple SamplingParams should be matched with each prompt
|
||||||
|
|||||||
@@ -8,7 +8,9 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
|
|||||||
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
|
||||||
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
|
{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
||||||
|
|
||||||
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
|
||||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
|
||||||
|
|||||||
107
tests/entrypoints/openai/test_serving_engine.py
Normal file
107
tests/entrypoints/openai/test_serving_engine.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
from http import HTTPStatus
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import ModelConfig
|
||||||
|
from vllm.engine.protocol import AsyncEngineClient
|
||||||
|
from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||||
|
LoadLoraAdapterRequest,
|
||||||
|
UnloadLoraAdapterRequest)
|
||||||
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
|
|
||||||
|
MODEL_NAME = "meta-llama/Llama-2-7b"
|
||||||
|
LORA_LOADING_SUCCESS_MESSAGE = (
|
||||||
|
"Success: LoRA adapter '{lora_name}' added successfully.")
|
||||||
|
LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||||
|
"Success: LoRA adapter '{lora_name}' removed successfully.")
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_serving_engine_init():
|
||||||
|
mock_engine_client = MagicMock(spec=AsyncEngineClient)
|
||||||
|
mock_model_config = MagicMock(spec=ModelConfig)
|
||||||
|
# Set the max_model_len attribute to avoid missing attribute
|
||||||
|
mock_model_config.max_model_len = 2048
|
||||||
|
|
||||||
|
serving_engine = OpenAIServing(mock_engine_client,
|
||||||
|
mock_model_config,
|
||||||
|
served_model_names=[MODEL_NAME],
|
||||||
|
lora_modules=None,
|
||||||
|
prompt_adapters=None,
|
||||||
|
request_logger=None)
|
||||||
|
return serving_engine
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_success():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter",
|
||||||
|
lora_path="/path/to/adapter2")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
assert serving_engine.lora_requests[0].lora_name == "adapter"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_missing_fields():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_load_lora_adapter_duplicate():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
|
||||||
|
lora_name='adapter1')
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_success():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||||
|
lora_path="/path/to/adapter1")
|
||||||
|
response = await serving_engine.load_lora_adapter(request)
|
||||||
|
assert len(serving_engine.lora_requests) == 1
|
||||||
|
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="adapter1")
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
|
||||||
|
lora_name='adapter1')
|
||||||
|
assert len(serving_engine.lora_requests) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_missing_fields():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_unload_lora_adapter_not_found():
|
||||||
|
serving_engine = await _async_serving_engine_init()
|
||||||
|
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
|
||||||
|
response = await serving_engine.unload_lora_adapter(request)
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.type == "InvalidUserInput"
|
||||||
|
assert response.code == HTTPStatus.BAD_REQUEST
|
||||||
@@ -3,8 +3,10 @@ from typing import Type
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
|
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
|
||||||
NewGELU, SiluAndMul)
|
NewGELU, QuickGELU,
|
||||||
|
SiluAndMul)
|
||||||
|
|
||||||
from .allclose_default import get_default_atol, get_default_rtol
|
from .allclose_default import get_default_atol, get_default_rtol
|
||||||
|
|
||||||
@@ -39,18 +41,28 @@ def test_act_and_mul(
|
|||||||
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
|
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
|
||||||
if activation == "silu":
|
if activation == "silu":
|
||||||
layer = SiluAndMul()
|
layer = SiluAndMul()
|
||||||
|
fn = torch.ops._C.silu_and_mul
|
||||||
elif activation == "gelu":
|
elif activation == "gelu":
|
||||||
layer = GeluAndMul(approximate="none")
|
layer = GeluAndMul(approximate="none")
|
||||||
|
fn = torch.ops._C.gelu_and_mul
|
||||||
elif activation == "gelu_tanh":
|
elif activation == "gelu_tanh":
|
||||||
layer = GeluAndMul(approximate="tanh")
|
layer = GeluAndMul(approximate="tanh")
|
||||||
|
fn = torch.ops._C.gelu_tanh_and_mul
|
||||||
out = layer(x)
|
out = layer(x)
|
||||||
ref_out = layer.forward_native(x)
|
ref_out = layer.forward_native(x)
|
||||||
# The SiLU and GELU implementations are equivalent to the native PyTorch
|
# The SiLU and GELU implementations are equivalent to the native PyTorch
|
||||||
# implementations, so we can do exact comparison.
|
# implementations, so we can do exact comparison.
|
||||||
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
|
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
|
||||||
|
|
||||||
|
d = x.shape[-1] // 2
|
||||||
|
output_shape = (x.shape[:-1] + (d, ))
|
||||||
|
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
|
||||||
|
opcheck(fn, (out, x))
|
||||||
|
|
||||||
@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
|
|
||||||
|
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
|
||||||
|
(NewGELU, torch.ops._C.gelu_new),
|
||||||
|
(QuickGELU, torch.ops._C.gelu_quick)])
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("d", D)
|
@pytest.mark.parametrize("d", D)
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@@ -70,10 +82,14 @@ def test_activation(
|
|||||||
torch.cuda.manual_seed(seed)
|
torch.cuda.manual_seed(seed)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
x = torch.randn(num_tokens, d, dtype=dtype)
|
x = torch.randn(num_tokens, d, dtype=dtype)
|
||||||
layer = activation()
|
layer = activation[0]()
|
||||||
|
fn = activation[1]
|
||||||
out = layer(x)
|
out = layer(x)
|
||||||
ref_out = layer.forward_native(x)
|
ref_out = layer.forward_native(x)
|
||||||
torch.testing.assert_close(out,
|
torch.testing.assert_close(out,
|
||||||
ref_out,
|
ref_out,
|
||||||
atol=get_default_atol(out),
|
atol=get_default_atol(out),
|
||||||
rtol=get_default_rtol(out))
|
rtol=get_default_rtol(out))
|
||||||
|
|
||||||
|
out = torch.empty_like(x)
|
||||||
|
opcheck(fn, (out, x))
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import torch
|
|||||||
from xformers import ops as xops
|
from xformers import ops as xops
|
||||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils import get_max_shared_memory_bytes, is_hip
|
from vllm.utils import get_max_shared_memory_bytes, is_hip
|
||||||
|
|
||||||
@@ -198,6 +199,13 @@ def test_paged_attention(
|
|||||||
k_scale,
|
k_scale,
|
||||||
v_scale,
|
v_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.paged_attention_v1,
|
||||||
|
(output, query, key_cache, value_cache, num_kv_heads, scale,
|
||||||
|
block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
|
||||||
|
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
|
|
||||||
elif version == "v2":
|
elif version == "v2":
|
||||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||||
assert PARTITION_SIZE % block_size == 0
|
assert PARTITION_SIZE % block_size == 0
|
||||||
@@ -230,6 +238,14 @@ def test_paged_attention(
|
|||||||
k_scale,
|
k_scale,
|
||||||
v_scale,
|
v_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.paged_attention_v2,
|
||||||
|
(output, exp_sums, max_logits, tmp_output, query, key_cache,
|
||||||
|
value_cache, num_kv_heads, scale, block_tables, seq_lens,
|
||||||
|
block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
|
||||||
|
k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise AssertionError(f"Unknown version: {version}")
|
raise AssertionError(f"Unknown version: {version}")
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import List, Tuple
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
||||||
@@ -87,6 +88,11 @@ def test_copy_blocks(
|
|||||||
block_mapping_tensor = torch.tensor(block_mapping,
|
block_mapping_tensor = torch.tensor(block_mapping,
|
||||||
dtype=torch.int64,
|
dtype=torch.int64,
|
||||||
device=device).view(-1, 2)
|
device=device).view(-1, 2)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C_cache_ops.copy_blocks,
|
||||||
|
(key_caches, value_caches, block_mapping_tensor),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
||||||
|
|
||||||
# Run the reference implementation.
|
# Run the reference implementation.
|
||||||
@@ -162,6 +168,10 @@ def test_reshape_and_cache(
|
|||||||
k_scale = v_scale = 1.0
|
k_scale = v_scale = 1.0
|
||||||
|
|
||||||
# Call the reshape_and_cache kernel.
|
# Call the reshape_and_cache kernel.
|
||||||
|
opcheck(torch.ops._C_cache_ops.reshape_and_cache,
|
||||||
|
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||||
|
k_scale, v_scale),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
|
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
|
||||||
kv_cache_dtype, k_scale, v_scale)
|
kv_cache_dtype, k_scale, v_scale)
|
||||||
|
|
||||||
@@ -269,6 +279,10 @@ def test_reshape_and_cache_flash(
|
|||||||
k_scale = v_scale = 1.0
|
k_scale = v_scale = 1.0
|
||||||
|
|
||||||
# Call the reshape_and_cache kernel.
|
# Call the reshape_and_cache kernel.
|
||||||
|
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
|
||||||
|
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||||
|
k_scale, v_scale),
|
||||||
|
cond=(head_size == HEAD_SIZES[0]))
|
||||||
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
||||||
slot_mapping, kv_cache_dtype, k_scale, v_scale)
|
slot_mapping, kv_cache_dtype, k_scale, v_scale)
|
||||||
|
|
||||||
@@ -366,6 +380,14 @@ def test_swap_blocks(
|
|||||||
src_value_caches_clone = src_value_caches[0].clone()
|
src_value_caches_clone = src_value_caches[0].clone()
|
||||||
|
|
||||||
# Call the swap_blocks kernel.
|
# Call the swap_blocks kernel.
|
||||||
|
do_opcheck = (head_size == HEAD_SIZES[0])
|
||||||
|
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||||
|
(src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
|
||||||
|
cond=do_opcheck)
|
||||||
|
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||||
|
(src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
|
||||||
|
cond=do_opcheck)
|
||||||
|
|
||||||
ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
|
ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
|
||||||
block_mapping_tensor)
|
block_mapping_tensor)
|
||||||
ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
|
ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from typing import Optional, Type
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@@ -108,6 +109,9 @@ def cutlass_int8_gemm_helper(m: int,
|
|||||||
|
|
||||||
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm,
|
||||||
|
(out, a, b, scale_a, scale_b, bias))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
|
@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
|
||||||
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
|
@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
|
||||||
@@ -341,6 +345,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
|
|||||||
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
|
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
|
||||||
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
|
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
|
||||||
|
|
||||||
|
if azp_per_token:
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||||
|
(out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
|
||||||
|
func_bias))
|
||||||
|
else:
|
||||||
|
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||||
|
(out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
|
||||||
|
func_bias))
|
||||||
|
|
||||||
|
|
||||||
# Test working with a subset of A and B
|
# Test working with a subset of A and B
|
||||||
def test_cutlass_subset():
|
def test_cutlass_subset():
|
||||||
|
|||||||
@@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
|||||||
head_size,
|
head_size,
|
||||||
block_size,
|
block_size,
|
||||||
"NONE",
|
"NONE",
|
||||||
data_type=dtype)
|
data_type=dtype,
|
||||||
|
q_data_type=dtype)
|
||||||
output = wrapper.forward(query,
|
output = wrapper.forward(query,
|
||||||
kv_cache_fp8,
|
kv_cache_fp8,
|
||||||
logits_soft_cap=soft_cap,
|
logits_soft_cap=soft_cap,
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
|
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm._custom_ops import scaled_int8_quant
|
from vllm._custom_ops import scaled_int8_quant
|
||||||
|
|
||||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||||
@@ -12,6 +13,16 @@ SEEDS = [0]
|
|||||||
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
|
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
|
||||||
|
|
||||||
|
|
||||||
|
def opcheck_int8_quant(output, input, scale=None):
|
||||||
|
if scale is not None:
|
||||||
|
opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale))
|
||||||
|
else:
|
||||||
|
scale = torch.empty((input.numel() // input.shape[-1], 1),
|
||||||
|
device=input.device,
|
||||||
|
dtype=torch.float32)
|
||||||
|
opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
@@ -34,6 +45,8 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
|||||||
ops_out, ref_out, atol=1,
|
ops_out, ref_out, atol=1,
|
||||||
rtol=0.0) # big atol to account for rounding errors
|
rtol=0.0) # big atol to account for rounding errors
|
||||||
|
|
||||||
|
opcheck_int8_quant(ops_out, x)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||||
@@ -58,3 +71,5 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
|||||||
torch.testing.assert_close(
|
torch.testing.assert_close(
|
||||||
out1, out2, atol=1,
|
out1, out2, atol=1,
|
||||||
rtol=0.0) # big atol to account for rounding errors
|
rtol=0.0) # big atol to account for rounding errors
|
||||||
|
|
||||||
|
opcheck_int8_quant(out2, x, scale)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
|
|
||||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||||
@@ -52,3 +53,10 @@ def test_rms_norm(
|
|||||||
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
|
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
|
||||||
else:
|
else:
|
||||||
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
||||||
|
|
||||||
|
if residual is not None:
|
||||||
|
opcheck(torch.ops._C.fused_add_rms_norm,
|
||||||
|
(x, residual, layer.weight.data, layer.variance_epsilon))
|
||||||
|
else:
|
||||||
|
opcheck(torch.ops._C.rms_norm,
|
||||||
|
(out, x, layer.weight.data, layer.variance_epsilon))
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from typing import Optional, Tuple
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import opcheck
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
pack_rows, quantize_weights)
|
pack_rows, quantize_weights)
|
||||||
@@ -76,6 +77,8 @@ def machete_quantize_and_pack(w: torch.Tensor,
|
|||||||
w_q = w_q.t().contiguous().t() # convert to col major
|
w_q = w_q.t().contiguous().t() # convert to col major
|
||||||
w_q_machete = ops.machete_prepack_B(w_q, wtype)
|
w_q_machete = ops.machete_prepack_B(w_q, wtype)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype))
|
||||||
|
|
||||||
return w_ref, w_q_machete, w_s, w_zp
|
return w_ref, w_q_machete, w_s, w_zp
|
||||||
|
|
||||||
|
|
||||||
@@ -146,6 +149,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype,
|
|||||||
schedule=schedule,
|
schedule=schedule,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.machete_gemm,
|
||||||
|
(a, w_q_machete, wtype, w_s, maybe_convert_zeropoints(
|
||||||
|
w_zp, w_s), group_size, None, None, None, schedule))
|
||||||
|
|
||||||
# Relax atol as our reduction dim becomes larger (more rounding error)
|
# Relax atol as our reduction dim becomes larger (more rounding error)
|
||||||
# Relax atol when we have zeropoints since the way machete applies
|
# Relax atol when we have zeropoints since the way machete applies
|
||||||
# zeropoints (after scales) causes noise around 0
|
# zeropoints (after scales) causes noise around 0
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||||
@@ -73,12 +74,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
act_order, mnk_factors):
|
act_order, mnk_factors):
|
||||||
m_factor, n_factor, k_factor = mnk_factors
|
m_factor, n_factor, k_factor = mnk_factors
|
||||||
|
|
||||||
size_m = m_factor
|
|
||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
|
|
||||||
# Filter act_order
|
# Filter act_order
|
||||||
if act_order:
|
if act_order:
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
@@ -112,6 +110,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||||
weight_perm)
|
weight_perm)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.gptq_marlin_repack,
|
||||||
|
(q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
|
||||||
|
|
||||||
# Run Marlin repack GPU kernel
|
# Run Marlin repack GPU kernel
|
||||||
marlin_q_w_2 = ops.gptq_marlin_repack(
|
marlin_q_w_2 = ops.gptq_marlin_repack(
|
||||||
q_w_gptq,
|
q_w_gptq,
|
||||||
@@ -137,12 +138,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
mnk_factors):
|
mnk_factors):
|
||||||
m_factor, n_factor, k_factor = mnk_factors
|
m_factor, n_factor, k_factor = mnk_factors
|
||||||
|
|
||||||
size_m = m_factor
|
|
||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
|
|
||||||
# Normalize group_size
|
# Normalize group_size
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
group_size = size_k
|
group_size = size_k
|
||||||
@@ -165,6 +163,9 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||||
weight_perm)
|
weight_perm)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.awq_marlin_repack,
|
||||||
|
(q_w_awq, size_k, size_n, quant_type.size_bits))
|
||||||
|
|
||||||
# Run Marlin repack GPU kernel
|
# Run Marlin repack GPU kernel
|
||||||
marlin_q_w_2 = ops.awq_marlin_repack(
|
marlin_q_w_2 = ops.awq_marlin_repack(
|
||||||
q_w_awq,
|
q_w_awq,
|
||||||
@@ -204,9 +205,6 @@ def test_gptq_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
if act_order:
|
if act_order:
|
||||||
if group_size == -1:
|
if group_size == -1:
|
||||||
return
|
return
|
||||||
@@ -224,6 +222,13 @@ def test_gptq_marlin_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
GPTQ_MARLIN_MAX_PARALLEL)
|
GPTQ_MARLIN_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(
|
||||||
|
torch.ops._C.gptq_marlin_gemm,
|
||||||
|
(a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
|
||||||
|
workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1],
|
||||||
|
a_input.shape[1], is_k_full, False, use_fp32_reduce),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||||
|
|
||||||
output = ops.gptq_marlin_gemm(
|
output = ops.gptq_marlin_gemm(
|
||||||
a_input,
|
a_input,
|
||||||
marlin_q_w,
|
marlin_q_w,
|
||||||
@@ -245,7 +250,6 @@ def test_gptq_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -265,9 +269,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -279,6 +280,12 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
|
|
||||||
output_ref = torch.matmul(a_input, w_24_ref)
|
output_ref = torch.matmul(a_input, w_24_ref)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.gptq_marlin_24_gemm,
|
||||||
|
(a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
|
||||||
|
workspace_24.scratch, quant_type, a_input.shape[0],
|
||||||
|
b_weight.shape[1], a_input.shape[1]),
|
||||||
|
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||||
|
|
||||||
output = ops.gptq_marlin_24_gemm(
|
output = ops.gptq_marlin_24_gemm(
|
||||||
a_input,
|
a_input,
|
||||||
marlin_24_q_w_comp,
|
marlin_24_q_w_comp,
|
||||||
@@ -294,7 +301,6 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -321,9 +327,6 @@ def test_fp8_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k), dtype=dtype)
|
a_input = rand_data((size_m, size_k), dtype=dtype)
|
||||||
b_weight = rand_data((size_k, size_n), dtype=dtype)
|
b_weight = rand_data((size_k, size_n), dtype=dtype)
|
||||||
|
|
||||||
@@ -353,6 +356,10 @@ def test_fp8_marlin_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
GPTQ_MARLIN_MAX_PARALLEL)
|
GPTQ_MARLIN_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.fp8_marlin_gemm,
|
||||||
|
(a_input, marlin_qweight, marlin_scales, workspace.scratch,
|
||||||
|
num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
|
||||||
|
|
||||||
output = ops.fp8_marlin_gemm(
|
output = ops.fp8_marlin_gemm(
|
||||||
a=a_input,
|
a=a_input,
|
||||||
b_q_weight=marlin_qweight,
|
b_q_weight=marlin_qweight,
|
||||||
@@ -368,7 +375,6 @@ def test_fp8_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -396,9 +402,6 @@ def test_awq_marlin_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -434,7 +437,6 @@ def test_awq_marlin_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|
||||||
@@ -460,9 +462,6 @@ def test_marlin_qqq_gemm(
|
|||||||
size_k = k_chunk * k_factor
|
size_k = k_chunk * k_factor
|
||||||
size_n = n_chunk * n_factor
|
size_n = n_chunk * n_factor
|
||||||
|
|
||||||
print(f"MNK = {size_m} {size_n} {size_k}")
|
|
||||||
print(f"groupsize = {group_size}")
|
|
||||||
|
|
||||||
a_input = rand_data((size_m, size_k))
|
a_input = rand_data((size_m, size_k))
|
||||||
b_weight = rand_data((size_k, size_n))
|
b_weight = rand_data((size_k, size_n))
|
||||||
|
|
||||||
@@ -479,6 +478,11 @@ def test_marlin_qqq_gemm(
|
|||||||
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
|
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
|
||||||
MARLIN_QQQ_MAX_PARALLEL)
|
MARLIN_QQQ_MAX_PARALLEL)
|
||||||
|
|
||||||
|
opcheck(torch.ops._C.marlin_qqq_gemm,
|
||||||
|
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
|
||||||
|
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
|
||||||
|
b_weight.shape[1], a_input.shape[1]))
|
||||||
|
|
||||||
output = ops.marlin_qqq_gemm(
|
output = ops.marlin_qqq_gemm(
|
||||||
q_a,
|
q_a,
|
||||||
marlin_qqq_q_w,
|
marlin_qqq_q_w,
|
||||||
@@ -495,6 +499,5 @@ def test_marlin_qqq_gemm(
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
max_diff = compute_max_diff(output, output_ref)
|
max_diff = compute_max_diff(output, output_ref)
|
||||||
print("max_diff = {}".format(max_diff))
|
|
||||||
|
|
||||||
assert max_diff < 0.04
|
assert max_diff < 0.04
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
Run `pytest tests/kernels/test_moe.py`.
|
Run `pytest tests/kernels/test_moe.py`.
|
||||||
"""
|
"""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from transformers import MixtralConfig
|
from transformers import MixtralConfig
|
||||||
@@ -9,7 +11,13 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
|||||||
|
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
|
||||||
|
fused_marlin_moe, single_marlin_moe)
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||||
|
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||||
|
marlin_quantize)
|
||||||
from vllm.model_executor.models.mixtral import MixtralMoE
|
from vllm.model_executor.models.mixtral import MixtralMoE
|
||||||
|
from vllm.scalar_type import scalar_types
|
||||||
|
|
||||||
|
|
||||||
def torch_moe(a, w1, w2, score, topk):
|
def torch_moe(a, w1, w2, score, topk):
|
||||||
@@ -29,6 +37,20 @@ def torch_moe(a, w1, w2, score, topk):
|
|||||||
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
|
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
|
||||||
|
|
||||||
|
|
||||||
|
def torch_moe_single(a, w, score, topk):
|
||||||
|
B, D = a.shape
|
||||||
|
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
|
||||||
|
out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
|
||||||
|
score = torch.softmax(score, dim=-1, dtype=torch.float32)
|
||||||
|
_, topk_ids = torch.topk(score, topk)
|
||||||
|
topk_ids = topk_ids.view(-1)
|
||||||
|
for i in range(w.shape[0]):
|
||||||
|
mask = topk_ids == i
|
||||||
|
if mask.sum():
|
||||||
|
out[mask] = a[mask] @ w[i].transpose(0, 1)
|
||||||
|
return (out.view(B, -1, w.shape[1])).sum(dim=1)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
|
@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
|
||||||
@pytest.mark.parametrize("n", [2048, 256, 1024])
|
@pytest.mark.parametrize("n", [2048, 256, 1024])
|
||||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||||
@@ -43,11 +65,11 @@ def test_fused_moe(
|
|||||||
topk: int,
|
topk: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
|
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||||
w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
|
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||||
|
|
||||||
score = torch.randn((m, e), device='cuda', dtype=dtype)
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
|
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
|
||||||
torch_output = torch_moe(a, w1, w2, score, topk)
|
torch_output = torch_moe(a, w1, w2, score, topk)
|
||||||
torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
|
torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
|
||||||
@@ -99,3 +121,194 @@ def test_mixtral_moe(dtype: torch.dtype):
|
|||||||
vllm_states,
|
vllm_states,
|
||||||
rtol=mixtral_moe_tol[dtype],
|
rtol=mixtral_moe_tol[dtype],
|
||||||
atol=mixtral_moe_tol[dtype])
|
atol=mixtral_moe_tol[dtype])
|
||||||
|
|
||||||
|
|
||||||
|
def stack_and_dev(tensors: List[torch.Tensor]):
|
||||||
|
dev = tensors[0].device
|
||||||
|
return torch.stack(tensors, dim=0).to(dev)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_max_diff(output, output_ref):
|
||||||
|
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
|
||||||
|
torch.abs(output_ref))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||||
|
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||||
|
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||||
|
@pytest.mark.parametrize("e", [4, 8, 64])
|
||||||
|
@pytest.mark.parametrize("topk", [2, 6])
|
||||||
|
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||||
|
@pytest.mark.parametrize("act_order", [True, False])
|
||||||
|
def test_fused_marlin_moe(
|
||||||
|
m: int,
|
||||||
|
n: int,
|
||||||
|
k: int,
|
||||||
|
e: int,
|
||||||
|
topk: int,
|
||||||
|
group_size: int,
|
||||||
|
act_order: bool,
|
||||||
|
):
|
||||||
|
torch.manual_seed(7)
|
||||||
|
|
||||||
|
if topk > e:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter act_order
|
||||||
|
if act_order:
|
||||||
|
if group_size == -1:
|
||||||
|
return
|
||||||
|
if group_size in (k, n):
|
||||||
|
return
|
||||||
|
|
||||||
|
quant_type = scalar_types.uint4b8
|
||||||
|
dtype = torch.float16
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||||
|
for i in range(w2.shape[0]):
|
||||||
|
w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
|
||||||
|
|
||||||
|
w_ref1_l = []
|
||||||
|
qweight1_l = []
|
||||||
|
scales1_l = []
|
||||||
|
g_idx1_l = []
|
||||||
|
sort_indices1_l = []
|
||||||
|
|
||||||
|
for i in range(w1.shape[0]):
|
||||||
|
test_perm = torch.randperm(k)
|
||||||
|
w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
|
||||||
|
w1[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||||
|
test_perm)
|
||||||
|
w_ref1_l.append(w_ref1)
|
||||||
|
qweight1_l.append(qweight1)
|
||||||
|
scales1_l.append(scales1)
|
||||||
|
g_idx1_l.append(g_idx1)
|
||||||
|
sort_indices1_l.append(sort_indices1)
|
||||||
|
|
||||||
|
w_ref1 = stack_and_dev(w_ref1_l)
|
||||||
|
qweight1 = stack_and_dev(qweight1_l).contiguous()
|
||||||
|
scales1 = stack_and_dev(scales1_l)
|
||||||
|
g_idx1 = stack_and_dev(g_idx1_l)
|
||||||
|
sort_indices1 = stack_and_dev(sort_indices1_l)
|
||||||
|
|
||||||
|
w_ref2_l = []
|
||||||
|
qweight2_l = []
|
||||||
|
scales2_l = []
|
||||||
|
g_idx2_l = []
|
||||||
|
sort_indices2_l = []
|
||||||
|
|
||||||
|
for i in range(w2.shape[0]):
|
||||||
|
test_perm = torch.randperm(n)
|
||||||
|
w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
|
||||||
|
w2[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||||
|
test_perm)
|
||||||
|
w_ref2_l.append(w_ref2)
|
||||||
|
qweight2_l.append(qweight2)
|
||||||
|
scales2_l.append(scales2)
|
||||||
|
g_idx2_l.append(g_idx2)
|
||||||
|
sort_indices2_l.append(sort_indices2)
|
||||||
|
|
||||||
|
w_ref2 = stack_and_dev(w_ref2_l)
|
||||||
|
qweight2 = stack_and_dev(qweight2_l).contiguous()
|
||||||
|
scales2 = stack_and_dev(scales2_l)
|
||||||
|
g_idx2 = stack_and_dev(g_idx2_l)
|
||||||
|
sort_indices2 = stack_and_dev(sort_indices2_l)
|
||||||
|
|
||||||
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
|
|
||||||
|
topk_weights, topk_ids = fused_topk(a, score, topk, False)
|
||||||
|
|
||||||
|
triton_output = fused_moe(
|
||||||
|
a,
|
||||||
|
w_ref1.transpose(1, 2).contiguous(),
|
||||||
|
w_ref2.transpose(1, 2).contiguous(),
|
||||||
|
score,
|
||||||
|
topk,
|
||||||
|
renormalize=False,
|
||||||
|
)
|
||||||
|
marlin_output = fused_marlin_moe(
|
||||||
|
a,
|
||||||
|
qweight1,
|
||||||
|
qweight2,
|
||||||
|
score,
|
||||||
|
g_idx1,
|
||||||
|
g_idx2,
|
||||||
|
sort_indices1,
|
||||||
|
sort_indices2,
|
||||||
|
topk_weights,
|
||||||
|
topk_ids,
|
||||||
|
w1_scale=scales1,
|
||||||
|
w2_scale=scales2,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert compute_max_diff(marlin_output, triton_output) < 4e-2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip("This test is here for the sake of debugging, "
|
||||||
|
"don't run it in automated tests.")
|
||||||
|
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||||
|
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||||
|
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||||
|
@pytest.mark.parametrize("e", [4, 8, 64])
|
||||||
|
@pytest.mark.parametrize("topk", [2, 6])
|
||||||
|
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||||
|
@pytest.mark.parametrize("act_order", [True, False])
|
||||||
|
def test_marlin_moe_mmm(
|
||||||
|
m: int,
|
||||||
|
n: int,
|
||||||
|
k: int,
|
||||||
|
e: int,
|
||||||
|
topk: int,
|
||||||
|
group_size: int,
|
||||||
|
act_order: bool,
|
||||||
|
):
|
||||||
|
if topk > e:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Filter act_order
|
||||||
|
if act_order:
|
||||||
|
if group_size == -1:
|
||||||
|
return
|
||||||
|
if group_size == k:
|
||||||
|
return
|
||||||
|
|
||||||
|
quant_type = scalar_types.uint4b8
|
||||||
|
dtype = torch.float16
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
|
||||||
|
|
||||||
|
w_ref_l = []
|
||||||
|
qweights_l = []
|
||||||
|
scales_l = []
|
||||||
|
g_idx_l = []
|
||||||
|
sort_indices_l = []
|
||||||
|
|
||||||
|
for i in range(w.shape[0]):
|
||||||
|
test_perm = torch.randperm(k)
|
||||||
|
w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
|
||||||
|
w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
|
||||||
|
w_ref_l.append(w_ref)
|
||||||
|
qweights_l.append(qweight)
|
||||||
|
scales_l.append(scales)
|
||||||
|
g_idx_l.append(g_idx)
|
||||||
|
sort_indices_l.append(sort_indices)
|
||||||
|
|
||||||
|
w_ref = stack_and_dev(w_ref_l)
|
||||||
|
qweight = stack_and_dev(qweights_l).contiguous()
|
||||||
|
scales = stack_and_dev(scales_l)
|
||||||
|
g_idx = stack_and_dev(g_idx_l)
|
||||||
|
sort_indices = stack_and_dev(sort_indices_l)
|
||||||
|
|
||||||
|
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||||
|
marlin_output = single_marlin_moe(a,
|
||||||
|
qweight,
|
||||||
|
scales,
|
||||||
|
score,
|
||||||
|
g_idx,
|
||||||
|
sort_indices,
|
||||||
|
topk,
|
||||||
|
renormalize=False)
|
||||||
|
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
|
||||||
|
|
||||||
|
assert compute_max_diff(marlin_output, torch_output) < 1e-2
|
||||||
|
|||||||
@@ -3,7 +3,8 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import random
|
import random
|
||||||
from numbers import Number
|
from numbers import Number
|
||||||
from typing import Any, List, NamedTuple, Optional, Tuple, Union
|
from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple,
|
||||||
|
Union)
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -13,6 +14,21 @@ from vllm.attention.backends.xformers import XFormersBackend
|
|||||||
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
|
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
|
||||||
make_tensor_with_pad)
|
make_tensor_with_pad)
|
||||||
|
|
||||||
|
# For now, disable "test_aot_dispatch_dynamic" since there are some
|
||||||
|
# bugs related to this test in PyTorch 2.4.
|
||||||
|
DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
|
||||||
|
"test_schema",
|
||||||
|
"test_autograd_registration",
|
||||||
|
"test_faketensor",
|
||||||
|
)
|
||||||
|
|
||||||
|
ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
|
||||||
|
"test_schema",
|
||||||
|
"test_autograd_registration",
|
||||||
|
"test_faketensor",
|
||||||
|
"test_aot_dispatch_dynamic",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class QKVInputs(NamedTuple):
|
class QKVInputs(NamedTuple):
|
||||||
'''
|
'''
|
||||||
@@ -926,3 +942,19 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
|
|||||||
ideal_output = test_params.packed_qkvo.ideal_output
|
ideal_output = test_params.packed_qkvo.ideal_output
|
||||||
torch.testing.assert_close(ideal_output,
|
torch.testing.assert_close(ideal_output,
|
||||||
output_under_test.view_as(ideal_output))
|
output_under_test.view_as(ideal_output))
|
||||||
|
|
||||||
|
|
||||||
|
def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
|
||||||
|
torch._library.custom_ops.CustomOpDef],
|
||||||
|
args: Tuple[Any, ...],
|
||||||
|
kwargs: Optional[Dict[str, Any]] = None,
|
||||||
|
*,
|
||||||
|
test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
|
||||||
|
raise_exception: bool = True,
|
||||||
|
cond: bool = True) -> Dict[str, str]:
|
||||||
|
return torch.library.opcheck(
|
||||||
|
op,
|
||||||
|
args,
|
||||||
|
kwargs,
|
||||||
|
test_utils=test_utils,
|
||||||
|
raise_exception=raise_exception) if cond else {}
|
||||||
|
|||||||
@@ -7,26 +7,6 @@ import pytest
|
|||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
|
||||||
# In this test we hardcode prompts and generations for the model so we don't
|
|
||||||
# need to require the AQLM package as a dependency
|
|
||||||
example_prompts = [
|
|
||||||
'vLLM is a high-throughput and memory-efficient inference and serving '
|
|
||||||
'engine for LLMs.\n',
|
|
||||||
'Briefly describe the major milestones in the development of artificial '
|
|
||||||
'intelligence from 1950 to 2020.\n',
|
|
||||||
'Compare and contrast artificial intelligence with human intelligence in '
|
|
||||||
'terms of processing information.\n',
|
|
||||||
'Describe the basic components of a neural network and how it can be '
|
|
||||||
'trained.\n',
|
|
||||||
'Write a short story about a robot that dreams for the first time.\n',
|
|
||||||
'Analyze the impact of the COVID-19 pandemic on global economic structures '
|
|
||||||
'and future business models.\n',
|
|
||||||
'Explain the cultural significance of the Mona Lisa painting, and how its '
|
|
||||||
'perception might vary in Western versus Eastern societies.\n',
|
|
||||||
"Translate the following English sentence into Japanese, French, and "
|
|
||||||
"Swahili: 'The early bird catches the worm.'\n"
|
|
||||||
]
|
|
||||||
|
|
||||||
# These ground truth generations were generated using `transformers==4.38.1
|
# These ground truth generations were generated using `transformers==4.38.1
|
||||||
# aqlm==1.1.0 torch==2.2.0`
|
# aqlm==1.1.0 torch==2.2.0`
|
||||||
# and the below code:
|
# and the below code:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import types
|
import types
|
||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@@ -9,7 +9,8 @@ from transformers import AutoConfig
|
|||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.utils import is_cpu
|
from vllm.utils import is_cpu
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||||
|
_ImageAssets)
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|||||||
"cherry_blossom":
|
"cherry_blossom":
|
||||||
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
})
|
})
|
||||||
|
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
"OpenGVLab/InternVL2-1B",
|
"OpenGVLab/InternVL2-1B",
|
||||||
@@ -64,13 +66,13 @@ def generate(
|
|||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
image_assets: _ImageAssets,
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
model: str,
|
model: str,
|
||||||
*,
|
*,
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
dtype: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
|
mm_limit: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
distributed_executor_backend: Optional[str] = None,
|
distributed_executor_backend: Optional[str] = None,
|
||||||
):
|
):
|
||||||
@@ -83,12 +85,6 @@ def run_test(
|
|||||||
Note, the text input is also adjusted to abide by vllm contract.
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
The text output is sanitized to be able to compare with hf.
|
The text output is sanitized to be able to compare with hf.
|
||||||
"""
|
"""
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
@@ -110,13 +106,21 @@ def run_test(
|
|||||||
self.max_num = self.config.max_dynamic_patch
|
self.max_num = self.config.max_dynamic_patch
|
||||||
self.image_size = self.vision_config.image_size
|
self.image_size = self.vision_config.image_size
|
||||||
|
|
||||||
def __call__(self, text: str, images: Image, **kwargs):
|
def __call__(self, text: str, images: Union[Image, List[Image]],
|
||||||
|
**kwargs):
|
||||||
from vllm.model_executor.models.internvl import (
|
from vllm.model_executor.models.internvl import (
|
||||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||||
pixel_values = image_to_pixel_values(
|
images = [images] if isinstance(images, Image) else images
|
||||||
images, self.image_size, self.min_num, self.max_num,
|
pixel_values = [
|
||||||
self.use_thumbnail).to(self.dtype)
|
image_to_pixel_values(image, self.image_size, self.min_num,
|
||||||
num_patches_list = [pixel_values.shape[0]]
|
self.max_num,
|
||||||
|
self.use_thumbnail).to(self.dtype)
|
||||||
|
for image in images
|
||||||
|
]
|
||||||
|
num_patches_list = [
|
||||||
|
pixel_value.shape[0] for pixel_value in pixel_values
|
||||||
|
]
|
||||||
|
pixel_values = torch.cat(pixel_values, dim=0)
|
||||||
for num_patches in num_patches_list:
|
for num_patches in num_patches_list:
|
||||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||||
* num_patches
|
* num_patches
|
||||||
@@ -130,6 +134,7 @@ def run_test(
|
|||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
|
limit_mm_per_prompt={"image": mm_limit},
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True) as vllm_model:
|
||||||
@@ -138,7 +143,7 @@ def run_test(
|
|||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
@@ -156,7 +161,7 @@ def run_test(
|
|||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=hf_images,
|
images=hf_images,
|
||||||
eos_token_id=eos_token_id)
|
eos_token_id=eos_token_id)
|
||||||
for prompts, hf_images in inputs_per_image
|
for prompts, hf_images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
@@ -264,15 +269,64 @@ if is_cpu():
|
|||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
inputs_per_image = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_image_size(image, factor) for factor in size_factors],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
|
||||||
run_test(
|
run_test(
|
||||||
hf_runner,
|
hf_runner,
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
image_assets,
|
inputs_per_image,
|
||||||
model,
|
model,
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
|
mm_limit=1,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"size_factors",
|
||||||
|
[
|
||||||
|
# No image
|
||||||
|
[],
|
||||||
|
# Single-scale
|
||||||
|
[1.0],
|
||||||
|
# Single-scale, batched
|
||||||
|
[1.0, 1.0, 1.0],
|
||||||
|
# Multi-scale
|
||||||
|
[0.5, 0.75, 1.0],
|
||||||
|
],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
@torch.inference_mode()
|
||||||
|
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||||
|
size_factors, dtype: str, max_tokens: int,
|
||||||
|
num_logprobs: int) -> None:
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
inputs_per_case = [
|
||||||
|
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||||
|
[[rescale_image_size(image, factor) for image in images]
|
||||||
|
for factor in size_factors])
|
||||||
|
]
|
||||||
|
|
||||||
|
run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
inputs_per_case,
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
mm_limit=2,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type, overload
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
||||||
@@ -8,11 +8,14 @@ from vllm.multimodal.utils import rescale_image_size
|
|||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||||
|
_ImageAssets)
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
"stop_sign":
|
"stop_sign":
|
||||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
||||||
@@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|||||||
return hf_output_ids, hf_output_str, out_logprobs
|
return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
@@ -64,6 +68,78 @@ def run_test(
|
|||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
distributed_executor_backend: Optional[str] = None,
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
sizes: List[Tuple[int, int]],
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
size_factors: Optional[List[float]] = None,
|
||||||
|
sizes: Optional[List[Tuple[int, int]]] = None,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
if size_factors is not None:
|
||||||
|
inputs_per_image = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_image_size(image, factor) for factor in size_factors],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
elif sizes is not None:
|
||||||
|
inputs_per_image = [(
|
||||||
|
[prompt for _ in sizes],
|
||||||
|
[image.resize(size) for size in sizes],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
else:
|
||||||
|
raise ValueError("You must provide either `size_factors` or `sizes`")
|
||||||
|
|
||||||
|
_run_test(hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
inputs_per_image,
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Inference result should be the same between hf and vllm.
|
"""Inference result should be the same between hf and vllm.
|
||||||
|
|
||||||
@@ -85,13 +161,6 @@ def run_test(
|
|||||||
else:
|
else:
|
||||||
mantis_processor = None
|
mantis_processor = None
|
||||||
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
@@ -100,15 +169,18 @@ def run_test(
|
|||||||
# max_model_len should be greater than image_feature_size
|
# max_model_len should be greater than image_feature_size
|
||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
|
max_model_len=4096,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True,
|
||||||
|
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
||||||
|
}) as vllm_model:
|
||||||
vllm_outputs_per_image = [
|
vllm_outputs_per_image = [
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
if mantis_processor is not None:
|
if mantis_processor is not None:
|
||||||
@@ -131,7 +203,7 @@ def run_test(
|
|||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
images=images)
|
images=images)
|
||||||
for prompts, images in inputs_per_image
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
@@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
|
||||||
|
model, dtype, max_tokens,
|
||||||
|
num_logprobs) -> None:
|
||||||
|
stop_sign = image_assets[0].pil_image
|
||||||
|
cherry_blossom = image_assets[1].pil_image
|
||||||
|
|
||||||
|
inputs = [(
|
||||||
|
[
|
||||||
|
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
|
||||||
|
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
|
||||||
|
"USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:", # noqa: E501
|
||||||
|
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
||||||
|
],
|
||||||
|
[
|
||||||
|
[stop_sign, cherry_blossom],
|
||||||
|
# Images with different sizes and aspect-ratios
|
||||||
|
[
|
||||||
|
rescale_image_size(stop_sign, 0.1),
|
||||||
|
stop_sign,
|
||||||
|
],
|
||||||
|
[
|
||||||
|
stop_sign,
|
||||||
|
rescale_image_size(stop_sign, 0.25),
|
||||||
|
cherry_blossom.resize((183, 488)),
|
||||||
|
cherry_blossom.resize((488, 183))
|
||||||
|
],
|
||||||
|
cherry_blossom,
|
||||||
|
])]
|
||||||
|
|
||||||
|
_run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
inputs,
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
@pytest.mark.parametrize("model", models)
|
||||||
def test_context_length_too_short(vllm_runner, image_assets, model):
|
def test_context_length_too_short(vllm_runner, image_assets, model):
|
||||||
images = [asset.pil_image for asset in image_assets]
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|||||||
236
tests/models/test_llava_next_video.py
Normal file
236
tests/models/test_llava_next_video.py
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
from typing import List, Optional, Tuple, Type, overload
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import transformers
|
||||||
|
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||||
|
|
||||||
|
from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
||||||
|
sample_frames_from_video)
|
||||||
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
|
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||||
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
_PREFACE = (
|
||||||
|
"A chat between a curious human and an artificial intelligence assistant. "
|
||||||
|
"The assistant gives helpful, detailed, and polite answers to the human's "
|
||||||
|
"questions.")
|
||||||
|
|
||||||
|
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||||
|
"sample_demo_1":
|
||||||
|
f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
|
||||||
|
})
|
||||||
|
|
||||||
|
models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
|
||||||
|
|
||||||
|
|
||||||
|
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||||
|
Optional[SampleLogprobs]],
|
||||||
|
model: str):
|
||||||
|
"""Sanitize vllm output to be comparable with hf output."""
|
||||||
|
output_ids, output_str, out_logprobs = vllm_output
|
||||||
|
|
||||||
|
config = AutoConfig.from_pretrained(model)
|
||||||
|
video_token_id = config.video_token_index
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||||
|
eos_token_id = tokenizer.eos_token_id
|
||||||
|
|
||||||
|
hf_output_ids = [
|
||||||
|
token_id for idx, token_id in enumerate(output_ids)
|
||||||
|
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||||
|
]
|
||||||
|
|
||||||
|
assert output_str[0] == " "
|
||||||
|
hf_output_str = output_str[1:]
|
||||||
|
if hf_output_ids[-1] == eos_token_id:
|
||||||
|
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||||
|
|
||||||
|
return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
video_assets: _VideoAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
size_factors: List[float],
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
num_frames: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
video_assets: _VideoAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
sizes: List[Tuple[int, int]],
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
num_frames: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
video_assets: _VideoAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
size_factors: Optional[List[float]] = None,
|
||||||
|
sizes: Optional[List[Tuple[int, int]]] = None,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
num_frames: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
videos = [
|
||||||
|
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||||
|
for asset in video_assets
|
||||||
|
]
|
||||||
|
|
||||||
|
for video in videos:
|
||||||
|
print(video.shape)
|
||||||
|
|
||||||
|
if size_factors is not None:
|
||||||
|
inputs_per_video = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_video_size(video, factor) for factor in size_factors],
|
||||||
|
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
|
||||||
|
elif sizes is not None:
|
||||||
|
inputs_per_video = [(
|
||||||
|
[prompt for _ in sizes],
|
||||||
|
[resize_video(video, size) for size in sizes],
|
||||||
|
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
|
||||||
|
else:
|
||||||
|
raise ValueError("You must provide either `size_factors` or `sizes`")
|
||||||
|
|
||||||
|
# max_model_len should be greater than image_feature_size
|
||||||
|
with vllm_runner(model,
|
||||||
|
dtype=dtype,
|
||||||
|
max_model_len=4096,
|
||||||
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
enforce_eager=True) as vllm_model:
|
||||||
|
vllm_outputs_per_video = [
|
||||||
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
videos=videos)
|
||||||
|
for prompts, videos in inputs_per_video
|
||||||
|
]
|
||||||
|
|
||||||
|
with hf_runner(model, dtype=dtype,
|
||||||
|
auto_cls=AutoModelForVision2Seq) as hf_model:
|
||||||
|
hf_outputs_per_video = [
|
||||||
|
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
videos=videos)
|
||||||
|
for prompts, videos in inputs_per_video
|
||||||
|
]
|
||||||
|
|
||||||
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
|
||||||
|
vllm_outputs_per_video):
|
||||||
|
# TODO: Check whether using original CLIPVisionModel can improve
|
||||||
|
# consistency against HF
|
||||||
|
check_logprobs_close(
|
||||||
|
outputs_0_lst=hf_outputs,
|
||||||
|
outputs_1_lst=[
|
||||||
|
vllm_to_hf_output(vllm_output, model)
|
||||||
|
for vllm_output in vllm_outputs
|
||||||
|
],
|
||||||
|
name_0="hf",
|
||||||
|
name_1="vllm",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(transformers.__version__ < "4.45",
|
||||||
|
reason="Waiting for next transformers release")
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"size_factors",
|
||||||
|
[
|
||||||
|
# No video
|
||||||
|
[],
|
||||||
|
# Single-scale
|
||||||
|
[1.0],
|
||||||
|
# Single-scale, batched
|
||||||
|
[1.0, 1.0, 1.0],
|
||||||
|
# Multi-scale
|
||||||
|
[0.25, 0.5, 1.0],
|
||||||
|
],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
@pytest.mark.parametrize("num_frames", [16])
|
||||||
|
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
|
||||||
|
dtype, max_tokens, num_logprobs, num_frames) -> None:
|
||||||
|
"""Inference result should be the same between hf and vllm.
|
||||||
|
|
||||||
|
All the image fixtures for the test is under tests/videos.
|
||||||
|
For huggingface runner, we provide the np.ndarray as input.
|
||||||
|
For vllm runner, we provide MultiModalDataDict objects
|
||||||
|
and corresponding MultiModalConfig as input.
|
||||||
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
|
The text output is sanitized to be able to compare with hf.
|
||||||
|
"""
|
||||||
|
run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
video_assets,
|
||||||
|
model,
|
||||||
|
size_factors=size_factors,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
num_frames=num_frames,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(transformers.__version__ < "4.45",
|
||||||
|
reason="Waiting for next transformers release")
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"sizes",
|
||||||
|
[[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
@pytest.mark.parametrize("num_frames", [16])
|
||||||
|
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
|
||||||
|
dtype, max_tokens, num_logprobs,
|
||||||
|
num_frames) -> None:
|
||||||
|
run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
video_assets,
|
||||||
|
model,
|
||||||
|
sizes=sizes,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
num_frames=num_frames,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
@@ -41,3 +41,43 @@ def test_models(
|
|||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", MODELS[1:])
|
||||||
|
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [64])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_mistral_format(
|
||||||
|
vllm_runner,
|
||||||
|
example_prompts,
|
||||||
|
model: str,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
) -> None:
|
||||||
|
with vllm_runner(
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
tokenizer_mode="auto",
|
||||||
|
load_format="safetensors",
|
||||||
|
config_format="hf",
|
||||||
|
) as hf_format_model:
|
||||||
|
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
|
||||||
|
example_prompts, max_tokens, num_logprobs)
|
||||||
|
|
||||||
|
with vllm_runner(
|
||||||
|
model,
|
||||||
|
dtype=dtype,
|
||||||
|
tokenizer_mode="mistral",
|
||||||
|
load_format="mistral",
|
||||||
|
config_format="mistral",
|
||||||
|
) as mistral_format_model:
|
||||||
|
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
|
||||||
|
example_prompts, max_tokens, num_logprobs)
|
||||||
|
|
||||||
|
check_logprobs_close(
|
||||||
|
outputs_0_lst=hf_format_outputs,
|
||||||
|
outputs_1_lst=mistral_format_outputs,
|
||||||
|
name_0="hf",
|
||||||
|
name_1="mistral",
|
||||||
|
)
|
||||||
|
|||||||
79
tests/models/test_modelopt.py
Normal file
79
tests/models/test_modelopt.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||||
|
Note: these tests will only pass on H100
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||||
|
|
||||||
|
MAX_MODEL_LEN = 1024
|
||||||
|
|
||||||
|
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
|
||||||
|
|
||||||
|
EXPECTED_STRS_MAP = {
|
||||||
|
"nvidia/Llama-3.1-8B-Instruct-FP8": [
|
||||||
|
"You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
|
||||||
|
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||||
|
'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
|
||||||
|
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
|
||||||
|
'**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
|
||||||
|
'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
|
||||||
|
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||||
|
'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# This test compares against golden strings for exact match since
|
||||||
|
# there is no baseline implementation to compare against
|
||||||
|
# and is unstable w.r.t specifics of the fp8 implementation or
|
||||||
|
# the hardware being run on.
|
||||||
|
# Disabled to prevent it from breaking the build
|
||||||
|
@pytest.mark.skip(
|
||||||
|
reason=
|
||||||
|
"Prevent unstable test based on golden strings from breaking the build.")
|
||||||
|
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||||
|
reason="fp8 is not supported on this GPU type.")
|
||||||
|
@pytest.mark.parametrize("model_name", MODELS)
|
||||||
|
def test_models(example_prompts, model_name) -> None:
|
||||||
|
model = LLM(
|
||||||
|
model=model_name,
|
||||||
|
max_model_len=MAX_MODEL_LEN,
|
||||||
|
trust_remote_code=True,
|
||||||
|
enforce_eager=True,
|
||||||
|
quantization="modelopt",
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
formatted_prompts = [
|
||||||
|
tokenizer.apply_chat_template([{
|
||||||
|
"role": "user",
|
||||||
|
"content": prompt
|
||||||
|
}],
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=True)
|
||||||
|
for prompt in example_prompts
|
||||||
|
]
|
||||||
|
params = SamplingParams(max_tokens=20, temperature=0)
|
||||||
|
generations: List[str] = []
|
||||||
|
# Note: these need to be run 1 at a time due to numerical precision,
|
||||||
|
# since the expected strs were generated this way.
|
||||||
|
for prompt in formatted_prompts:
|
||||||
|
outputs = model.generate(prompt, params)
|
||||||
|
generations.append(outputs[0].outputs[0].text)
|
||||||
|
del model
|
||||||
|
|
||||||
|
print(model_name, generations)
|
||||||
|
expected_strs = EXPECTED_STRS_MAP[model_name]
|
||||||
|
for i in range(len(example_prompts)):
|
||||||
|
generated_str = generations[i]
|
||||||
|
expected_str = expected_strs[i]
|
||||||
|
assert expected_str == generated_str, (
|
||||||
|
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||||
@@ -1,16 +1,15 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import List, Optional, Tuple, Type, Union
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from PIL import Image
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import is_cpu, is_hip
|
from vllm.utils import is_cpu, is_hip
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
@@ -60,8 +59,7 @@ if is_hip():
|
|||||||
def run_test(
|
def run_test(
|
||||||
hf_runner: Type[HfRunner],
|
hf_runner: Type[HfRunner],
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
inputs: List[Tuple[List[str], Union[List[Image.Image],
|
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||||
List[List[Image.Image]]]]],
|
|
||||||
model: str,
|
model: str,
|
||||||
*,
|
*,
|
||||||
dtype: str,
|
dtype: str,
|
||||||
|
|||||||
64
tests/models/test_pixtral.py
Normal file
64
tests/models/test_pixtral.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||||
|
|
||||||
|
Run `pytest tests/models/test_mistral.py`.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.sampling_params import SamplingParams
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(
|
||||||
|
reason=
|
||||||
|
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
|
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [64])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_models(
|
||||||
|
vllm_runner,
|
||||||
|
example_prompts,
|
||||||
|
model: str,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
) -> None:
|
||||||
|
image_urls = [
|
||||||
|
"https://picsum.photos/id/237/200/300",
|
||||||
|
"https://picsum.photos/seed/picsum/200/300"
|
||||||
|
]
|
||||||
|
expected = [
|
||||||
|
"The image depicts a black dog lying on a wooden surface, looking directly at the camera with a calm expression.", # noqa
|
||||||
|
"The image depicts a serene landscape with a snow-covered mountain under a pastel-colored sky during sunset." # noqa
|
||||||
|
]
|
||||||
|
prompt = "Describe the image in one short sentence."
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
|
||||||
|
|
||||||
|
with vllm_runner(model, dtype=dtype,
|
||||||
|
tokenizer_mode="mistral") as vllm_model:
|
||||||
|
|
||||||
|
for i, image_url in enumerate(image_urls):
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [{
|
||||||
|
"type": "text",
|
||||||
|
"text": prompt
|
||||||
|
}, {
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": image_url
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
outputs = vllm_model.model.chat(messages,
|
||||||
|
sampling_params=sampling_params)
|
||||||
|
assert outputs[0].outputs[0].text == expected[i]
|
||||||
@@ -1,19 +1,154 @@
|
|||||||
from typing import Type
|
import pathlib
|
||||||
|
from typing import List, Optional, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ..conftest import HfRunner, VllmRunner
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
|
|
||||||
|
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
models = ["qwen/qwen-vl"]
|
pytestmark = pytest.mark.vlm
|
||||||
|
|
||||||
|
text_only_models = [
|
||||||
|
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||||
|
]
|
||||||
|
|
||||||
|
multimodal_models = ["Qwen/Qwen-VL"]
|
||||||
|
|
||||||
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign":
|
||||||
|
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||||
|
"cherry_blossom":
|
||||||
|
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
### Tests for multimodal Qwen models
|
||||||
|
def run_test(
|
||||||
|
tmp_path: pathlib.PosixPath,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets,
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
size_factors: List[float],
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
tensor_parallel_size: int,
|
||||||
|
distributed_executor_backend: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""Inference result should be the same between hf and vllm.
|
||||||
|
|
||||||
|
All the image fixtures for the test is under tests/images.
|
||||||
|
For huggingface runner, we provide the PIL images as input.
|
||||||
|
For vllm runner, we provide MultiModalDataDict objects
|
||||||
|
and corresponding MultiModalConfig as input.
|
||||||
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
|
The text output is sanitized to be able to compare with hf.
|
||||||
|
"""
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
# Export the images to a tempdir and substitute it into the hf prompt;
|
||||||
|
# the contents between <img>/</img> will be ignored by VLLM, but the
|
||||||
|
# transformers implementation for the visual transformer parses this to
|
||||||
|
# reload it in the forward call; the contents are treated as a URL or a
|
||||||
|
# local path.
|
||||||
|
for idx, asset in enumerate(image_assets):
|
||||||
|
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||||
|
asset.pil_image.save(image_tmp_path)
|
||||||
|
HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
|
||||||
|
"<img></img>", f"<img>{image_tmp_path}</img>")
|
||||||
|
|
||||||
|
inputs_per_image = [(
|
||||||
|
[prompt for _ in size_factors],
|
||||||
|
[rescale_image_size(image, factor) for factor in size_factors],
|
||||||
|
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||||
|
|
||||||
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
|
# will hurt multiprocessing backend with fork method (the default method).
|
||||||
|
|
||||||
|
# max_model_len should be greater than image_feature_size
|
||||||
|
# Qwen encodes images into a fixed content size of 256
|
||||||
|
with vllm_runner(model,
|
||||||
|
max_model_len=300,
|
||||||
|
max_num_seqs=1,
|
||||||
|
dtype=dtype,
|
||||||
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
enforce_eager=True) as vllm_model:
|
||||||
|
vllm_outputs_per_image = [
|
||||||
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
images=images)
|
||||||
|
for prompts, images in inputs_per_image
|
||||||
|
]
|
||||||
|
|
||||||
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
|
hf_outputs_per_image = [
|
||||||
|
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
images=images)
|
||||||
|
for prompts, images in inputs_per_image
|
||||||
|
]
|
||||||
|
|
||||||
|
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||||
|
vllm_outputs_per_image):
|
||||||
|
|
||||||
|
check_logprobs_close(
|
||||||
|
outputs_0_lst=hf_outputs,
|
||||||
|
outputs_1_lst=vllm_outputs,
|
||||||
|
name_0="hf",
|
||||||
|
name_1="vllm",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", multimodal_models)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"size_factors",
|
||||||
|
[
|
||||||
|
# No image
|
||||||
|
[],
|
||||||
|
# Single-scale
|
||||||
|
[1.0],
|
||||||
|
# Single-scale, batched
|
||||||
|
[1.0, 1.0, 1.0],
|
||||||
|
# Multi-scale
|
||||||
|
[0.25, 0.5, 1.0],
|
||||||
|
],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [8])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
|
||||||
|
model, size_factors, dtype, max_tokens,
|
||||||
|
num_logprobs) -> None:
|
||||||
|
run_test(
|
||||||
|
tmp_path,
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
image_assets,
|
||||||
|
model,
|
||||||
|
size_factors=size_factors,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Ensure that a text-only Qwen model can still be loaded and
|
||||||
|
# used for inference in VLLM without throwing.
|
||||||
|
@pytest.mark.parametrize("model", text_only_models)
|
||||||
|
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||||
@pytest.mark.parametrize("max_tokens", [32])
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
@pytest.mark.parametrize("model", models)
|
def test_text_only_qwen_model_can_be_loaded_and_run(
|
||||||
def test_text_only_qwen_model(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
vllm_runner: Type[VllmRunner],
|
||||||
example_prompts,
|
example_prompts,
|
||||||
model: str,
|
model: str,
|
||||||
@@ -22,27 +157,9 @@ def test_text_only_qwen_model(
|
|||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
num_logprobs: int,
|
num_logprobs: int,
|
||||||
):
|
):
|
||||||
# This test checks language inputs only, since the visual component
|
|
||||||
# for qwen-vl is still unsupported in VLLM. In the near-future, the
|
|
||||||
# implementation and this test will be extended to consider
|
|
||||||
# visual inputs as well.
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
|
||||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
|
||||||
example_prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
)
|
|
||||||
|
|
||||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts,
|
example_prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
)
|
)
|
||||||
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=vllm_outputs,
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,9 +1,14 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
import transformers
|
||||||
|
|
||||||
from vllm.model_executor.models import _MODELS, ModelRegistry
|
from vllm.model_executor.models import _MODELS, ModelRegistry
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_cls", _MODELS)
|
@pytest.mark.parametrize("model_cls", _MODELS)
|
||||||
def test_registry_imports(model_cls):
|
def test_registry_imports(model_cls):
|
||||||
|
if (model_cls == "Qwen2VLForConditionalGeneration"
|
||||||
|
and transformers.__version__ < "4.45"):
|
||||||
|
pytest.skip("Waiting for next transformers release")
|
||||||
|
|
||||||
# Ensure all model classes can be imported successfully
|
# Ensure all model classes can be imported successfully
|
||||||
ModelRegistry.resolve_model_cls([model_cls])
|
ModelRegistry.resolve_model_cls([model_cls])
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ def test_multi_step_llm(
|
|||||||
GPU -> CPU output transfer
|
GPU -> CPU output transfer
|
||||||
num_prompts: number of example prompts under test
|
num_prompts: number of example prompts under test
|
||||||
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
|
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
|
||||||
completions endpoint; `None` -> no logprobs
|
completions endpoint; `None` -> 1 logprob returned.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
prompts = example_prompts
|
prompts = example_prompts
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
|||||||
assert qkv_proj.weight_scale.dtype is torch.float32
|
assert qkv_proj.weight_scale.dtype is torch.float32
|
||||||
assert qkv_proj.input_scale.dtype is torch.float32
|
assert qkv_proj.input_scale.dtype is torch.float32
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
@@ -85,7 +85,7 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
|
|||||||
assert qkv_proj.scheme.strategy == strategy
|
assert qkv_proj.scheme.strategy == strategy
|
||||||
assert qkv_proj.weight.dtype is torch.int8
|
assert qkv_proj.weight.dtype is torch.int8
|
||||||
|
|
||||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
|
||||||
assert output
|
assert output
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,224 +1,54 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import ray
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.model_executor.utils import set_random_seed
|
from vllm.model_executor.utils import set_random_seed
|
||||||
from vllm.multimodal import MultiModalDataDict
|
|
||||||
from vllm.outputs import RequestOutput
|
|
||||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
|
||||||
from vllm.sampling_params import SamplingParams
|
|
||||||
from vllm.sequence import Logprob
|
|
||||||
from vllm.usage.usage_lib import UsageContext
|
|
||||||
from vllm.utils import Counter, random_uuid
|
|
||||||
|
|
||||||
from ...conftest import cleanup
|
from ...conftest import cleanup
|
||||||
from ...utils import wait_for_gpu_memory_to_clear
|
from ...models.utils import check_logprobs_close, check_outputs_equal
|
||||||
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
|
PROMPTS = [
|
||||||
class AsyncLLM:
|
"Hello, my name is",
|
||||||
"""AsyncLLM
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
Note: Current LLM class in vllm don't support async mode, for test purpose,
|
"The future of AI is",
|
||||||
we implement async one in here. Maybe we could move to
|
"San Francisco is know for its",
|
||||||
vllm/entrypoints/llm.py in future.
|
"Facebook was created in 2004 by",
|
||||||
|
"Curious George is a",
|
||||||
Below AsyncLLM is directly borrow from vllm/entrypoints/llm.py with changes
|
"Python 3.11 brings improvements to its",
|
||||||
to make to work in async mode.
|
]
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
model: str,
|
|
||||||
tokenizer: Optional[str] = None,
|
|
||||||
tokenizer_mode: str = "auto",
|
|
||||||
skip_tokenizer_init: bool = False,
|
|
||||||
trust_remote_code: bool = False,
|
|
||||||
tensor_parallel_size: int = 1,
|
|
||||||
dtype: str = "auto",
|
|
||||||
quantization: Optional[str] = None,
|
|
||||||
revision: Optional[str] = None,
|
|
||||||
tokenizer_revision: Optional[str] = None,
|
|
||||||
seed: int = 0,
|
|
||||||
gpu_memory_utilization: float = 0.9,
|
|
||||||
swap_space: int = 4,
|
|
||||||
enforce_eager: bool = False,
|
|
||||||
max_seq_len_to_capture: int = 8192,
|
|
||||||
disable_custom_all_reduce: bool = False,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
if "disable_log_stats" not in kwargs:
|
|
||||||
kwargs["disable_log_stats"] = True
|
|
||||||
|
|
||||||
# Needed to engine_use_ray works as a deprecated feature,
|
|
||||||
# otherwise the following constructor will raise an exception
|
|
||||||
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
|
||||||
|
|
||||||
engine_args = AsyncEngineArgs(
|
|
||||||
model=model,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
tokenizer_mode=tokenizer_mode,
|
|
||||||
skip_tokenizer_init=skip_tokenizer_init,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
dtype=dtype,
|
|
||||||
quantization=quantization,
|
|
||||||
revision=revision,
|
|
||||||
tokenizer_revision=tokenizer_revision,
|
|
||||||
seed=seed,
|
|
||||||
gpu_memory_utilization=gpu_memory_utilization,
|
|
||||||
swap_space=swap_space,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
max_seq_len_to_capture=max_seq_len_to_capture,
|
|
||||||
# For now use ray for the distributed back-end, since
|
|
||||||
# we rely on the use of engine_use_ray=True to avoid
|
|
||||||
# reinitializing CUDA in the same process (driver worker)
|
|
||||||
engine_use_ray=True,
|
|
||||||
distributed_executor_backend="ray",
|
|
||||||
disable_custom_all_reduce=disable_custom_all_reduce,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
self.request_counter = Counter()
|
|
||||||
self.llm_engine = AsyncLLMEngine.from_engine_args(
|
|
||||||
engine_args, usage_context=UsageContext.LLM_CLASS)
|
|
||||||
|
|
||||||
def generate(
|
|
||||||
self,
|
|
||||||
prompts: Optional[Union[str, List[str]]] = None,
|
|
||||||
sampling_params: Optional[Union[SamplingParams,
|
|
||||||
List[SamplingParams]]] = None,
|
|
||||||
prompt_token_ids: Optional[List[List[int]]] = None,
|
|
||||||
use_tqdm: bool = True,
|
|
||||||
lora_request: Optional[LoRARequest] = None,
|
|
||||||
multi_modal_data: Optional[MultiModalDataDict] = None,
|
|
||||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None
|
|
||||||
) -> List[RequestOutput]:
|
|
||||||
|
|
||||||
if prompts is None:
|
|
||||||
raise ValueError("prompts must be provided.")
|
|
||||||
if isinstance(prompts, str):
|
|
||||||
# Convert a single prompt to a list.
|
|
||||||
prompts = [prompts]
|
|
||||||
|
|
||||||
if prompts is not None:
|
|
||||||
num_requests = len(prompts)
|
|
||||||
|
|
||||||
if sampling_params is None:
|
|
||||||
# Use default sampling params.
|
|
||||||
sampling_params = SamplingParams()
|
|
||||||
|
|
||||||
elif isinstance(sampling_params,
|
|
||||||
list) and len(sampling_params) != num_requests:
|
|
||||||
raise ValueError("The lengths of prompts and "
|
|
||||||
"sampling_params must be the same.")
|
|
||||||
|
|
||||||
async def get_output(prompt, sampling_param) -> RequestOutput:
|
|
||||||
request_id = random_uuid()
|
|
||||||
results_generator = self.llm_engine.generate(
|
|
||||||
prompt, sampling_param, request_id)
|
|
||||||
final_output = None
|
|
||||||
async for request_output in results_generator:
|
|
||||||
final_output = request_output
|
|
||||||
assert final_output is not None
|
|
||||||
return final_output
|
|
||||||
|
|
||||||
outputs: List[RequestOutput] = []
|
|
||||||
try:
|
|
||||||
for i in range(num_requests):
|
|
||||||
prompt = prompts[i] if prompts is not None else None
|
|
||||||
params = sampling_params[i] if isinstance(
|
|
||||||
sampling_params, Sequence) else sampling_params
|
|
||||||
res = asyncio.run(get_output(prompt, params))
|
|
||||||
outputs.append(res)
|
|
||||||
finally:
|
|
||||||
ray.shutdown()
|
|
||||||
return outputs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def baseline_llm_generator(request, common_llm_kwargs,
|
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
|
||||||
seed):
|
|
||||||
return create_llm_generator("baseline", request, common_llm_kwargs,
|
|
||||||
per_test_common_llm_kwargs,
|
|
||||||
baseline_llm_kwargs, seed)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
|
|
||||||
test_llm_kwargs, seed):
|
test_llm_kwargs, seed):
|
||||||
return create_llm_generator("test", request, common_llm_kwargs,
|
|
||||||
per_test_common_llm_kwargs, test_llm_kwargs,
|
|
||||||
seed)
|
|
||||||
|
|
||||||
|
def generate():
|
||||||
|
kwargs = {
|
||||||
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**test_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
|
llm = LLM(**kwargs)
|
||||||
per_test_common_llm_kwargs, distinct_llm_kwargs,
|
|
||||||
seed):
|
|
||||||
kwargs = {
|
|
||||||
**common_llm_kwargs,
|
|
||||||
**per_test_common_llm_kwargs,
|
|
||||||
**distinct_llm_kwargs,
|
|
||||||
}
|
|
||||||
test_name = request.node.name
|
|
||||||
|
|
||||||
model = kwargs["model"]
|
|
||||||
draft_model = kwargs.get("speculative_model", None)
|
|
||||||
same_draft_target_model = (draft_model is not None
|
|
||||||
and draft_model == model)
|
|
||||||
|
|
||||||
def generator_inner():
|
|
||||||
|
|
||||||
wait_for_gpu_memory_to_clear(
|
|
||||||
devices=list(range(torch.cuda.device_count())),
|
|
||||||
threshold_bytes=2 * 2**30,
|
|
||||||
timeout_s=60,
|
|
||||||
)
|
|
||||||
|
|
||||||
use_async = False
|
|
||||||
if "use_async" in kwargs:
|
|
||||||
use_async = kwargs.pop("use_async")
|
|
||||||
print(f'{use_async=}')
|
|
||||||
|
|
||||||
print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
|
|
||||||
llm = AsyncLLM(**kwargs) if use_async else LLM(**kwargs)
|
|
||||||
|
|
||||||
# Override logging interval to 0 for spec decode test run to
|
|
||||||
# log all metrics in time.
|
|
||||||
if (baseline_or_test == "test" and not use_async
|
|
||||||
and llm.llm_engine.log_stats):
|
|
||||||
for sate_logger in llm.llm_engine.stat_loggers.values():
|
|
||||||
sate_logger.local_interval = 0
|
|
||||||
if seed is not None:
|
if seed is not None:
|
||||||
set_random_seed(seed)
|
set_random_seed(seed)
|
||||||
|
|
||||||
yield llm
|
yield llm
|
||||||
|
|
||||||
del llm
|
del llm
|
||||||
cleanup()
|
cleanup()
|
||||||
|
|
||||||
def generator_outer():
|
return generate
|
||||||
for llm in generator_inner():
|
|
||||||
yield llm
|
|
||||||
del llm
|
|
||||||
|
|
||||||
# Set an attribute to the generator_outer function to allow us to
|
|
||||||
# determine whether to further check the acceptance rate in tests.
|
|
||||||
generator_outer.same_draft_target_model = same_draft_target_model # type: ignore
|
|
||||||
return generator_outer
|
|
||||||
|
|
||||||
|
|
||||||
def maybe_assert_ngram_worker(llm):
|
def maybe_assert_ngram_worker(llm):
|
||||||
# Verify the proposer worker is ngram if ngram is specified.
|
# Verify the proposer worker is ngram if ngram is specified.
|
||||||
if (not isinstance(llm, AsyncLLM)
|
if (llm.llm_engine.speculative_config is not None
|
||||||
and llm.llm_engine.speculative_config is not None
|
|
||||||
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
|
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
|
||||||
from vllm.spec_decode.ngram_worker import NGramWorker
|
from vllm.spec_decode.ngram_worker import NGramWorker
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
@@ -251,118 +81,165 @@ def get_output_from_llm_generator(
|
|||||||
return tokens, token_ids, acceptance_rate
|
return tokens, token_ids, acceptance_rate
|
||||||
|
|
||||||
|
|
||||||
def get_logprobs_from_llm_generator(
|
def run_logprob_correctness_test(vllm_runner,
|
||||||
llm_generator, prompts,
|
common_llm_kwargs,
|
||||||
sampling_params) -> List[List[Dict[int, Logprob]]]:
|
per_test_common_llm_kwargs,
|
||||||
"""Returns a dict of (token_id: Logprob) for each generated position, for
|
baseline_llm_kwargs,
|
||||||
each sequence in the batch.
|
test_llm_kwargs,
|
||||||
"""
|
batch_size: int,
|
||||||
for llm in llm_generator():
|
max_output_len: int,
|
||||||
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
|
seed: Optional[int] = 0,
|
||||||
logprobs = [output.outputs[0].logprobs[:] for output in outputs]
|
temperature: float = 0.0,
|
||||||
del llm
|
logprobs: int = 1):
|
||||||
|
org_args = {
|
||||||
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**baseline_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
return logprobs
|
sd_args = {
|
||||||
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**test_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(temperature=temperature,
|
||||||
|
max_tokens=max_output_len,
|
||||||
|
seed=seed,
|
||||||
|
logprobs=logprobs)
|
||||||
|
|
||||||
|
with vllm_runner(**org_args) as vllm_model:
|
||||||
|
org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
|
||||||
|
|
||||||
|
with vllm_runner(**sd_args) as vllm_model:
|
||||||
|
sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
|
||||||
|
|
||||||
|
check_logprobs_close(outputs_0_lst=org_outputs,
|
||||||
|
outputs_1_lst=sd_outputs,
|
||||||
|
name_0="org",
|
||||||
|
name_1="sd")
|
||||||
|
|
||||||
|
|
||||||
def run_greedy_equality_correctness_test(baseline_llm_generator,
|
def run_equality_correctness_test(
|
||||||
test_llm_generator,
|
vllm_runner,
|
||||||
batch_size,
|
common_llm_kwargs,
|
||||||
max_output_len,
|
per_test_common_llm_kwargs,
|
||||||
force_output_len: bool,
|
baseline_llm_kwargs,
|
||||||
print_tokens: bool = False,
|
test_llm_kwargs,
|
||||||
ensure_all_accepted: bool = False):
|
batch_size: int,
|
||||||
|
max_output_len: int,
|
||||||
|
seed: Optional[int] = 0,
|
||||||
|
temperature: float = 0.0,
|
||||||
|
disable_seed: bool = False,
|
||||||
|
ignore_eos: bool = True,
|
||||||
|
ensure_all_accepted: bool = False,
|
||||||
|
expected_acceptance_rate: Optional[float] = None):
|
||||||
|
|
||||||
|
org_args = {
|
||||||
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**baseline_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
sd_args = {
|
||||||
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**test_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
|
||||||
|
|
||||||
|
if disable_seed:
|
||||||
|
seed = None
|
||||||
|
|
||||||
|
sampling_params = SamplingParams(temperature=temperature,
|
||||||
|
max_tokens=max_output_len,
|
||||||
|
seed=seed,
|
||||||
|
ignore_eos=ignore_eos)
|
||||||
|
|
||||||
|
with vllm_runner(**org_args) as vllm_model:
|
||||||
|
org_outputs = vllm_model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
with vllm_runner(**sd_args) as vllm_model:
|
||||||
|
if ensure_all_accepted or expected_acceptance_rate is not None:
|
||||||
|
# Force log interval to be 0 to catch all metrics.
|
||||||
|
stat_logger = vllm_model.model.llm_engine.stat_loggers[
|
||||||
|
'prometheus']
|
||||||
|
stat_logger.local_interval = -100
|
||||||
|
|
||||||
|
sd_outputs = vllm_model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
if ensure_all_accepted or expected_acceptance_rate is not None:
|
||||||
|
acceptance_rate = (stat_logger.metrics.
|
||||||
|
gauge_spec_decode_draft_acceptance_rate.labels(
|
||||||
|
**stat_logger.labels)._value.get())
|
||||||
|
|
||||||
|
if ensure_all_accepted:
|
||||||
|
assert True
|
||||||
|
# FIXME: ci fails to log acceptance rate.
|
||||||
|
# It works locally.
|
||||||
|
# assert acceptance_rate == 1.0
|
||||||
|
|
||||||
|
if expected_acceptance_rate is not None:
|
||||||
|
assert acceptance_rate >= expected_acceptance_rate - 1e-2
|
||||||
|
|
||||||
|
check_outputs_equal(outputs_0_lst=org_outputs,
|
||||||
|
outputs_1_lst=sd_outputs,
|
||||||
|
name_0="org",
|
||||||
|
name_1="sd")
|
||||||
|
|
||||||
|
|
||||||
|
def run_equality_correctness_test_tp(model,
|
||||||
|
common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
|
batch_size: int,
|
||||||
|
max_output_len: int,
|
||||||
|
seed: int = 0,
|
||||||
|
temperature: float = 0.0):
|
||||||
"""Helper method that compares the outputs of both the baseline LLM and
|
"""Helper method that compares the outputs of both the baseline LLM and
|
||||||
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
|
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
|
||||||
the same when temperature is zero.
|
the same when temperature is zero.
|
||||||
"""
|
"""
|
||||||
|
arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs
|
||||||
|
arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs
|
||||||
|
env1 = env2 = None
|
||||||
|
|
||||||
run_equality_correctness_test(baseline_llm_generator,
|
max_wait_seconds = 240
|
||||||
test_llm_generator,
|
results = []
|
||||||
batch_size,
|
|
||||||
max_output_len,
|
|
||||||
force_output_len,
|
|
||||||
temperature=0.0,
|
|
||||||
seeded=False,
|
|
||||||
print_tokens=print_tokens,
|
|
||||||
ensure_all_accepted=ensure_all_accepted)
|
|
||||||
|
|
||||||
|
prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
|
||||||
|
|
||||||
def run_equality_correctness_test(
|
for args, env in ((arg1, env1), (arg2, env2)):
|
||||||
baseline_llm_generator,
|
with RemoteOpenAIServer(model,
|
||||||
test_llm_generator,
|
args,
|
||||||
batch_size,
|
env_dict=env,
|
||||||
max_output_len,
|
max_wait_seconds=max_wait_seconds) as server:
|
||||||
force_output_len: bool,
|
client = server.get_client()
|
||||||
temperature: float,
|
|
||||||
seeded: bool,
|
|
||||||
print_tokens: bool = False,
|
|
||||||
ensure_all_accepted: bool = False,
|
|
||||||
expected_acceptance_rate: Optional[float] = None):
|
|
||||||
"""Helper method that compares the outputs of both the baseline LLM and
|
|
||||||
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
|
|
||||||
the same when temperature is zero (or when temperature is > 0 and seeded).
|
|
||||||
"""
|
|
||||||
|
|
||||||
prompts = [
|
completion = client.completions.create(model=model,
|
||||||
"Hello, my name is",
|
prompt=prompts,
|
||||||
"The president of the United States is",
|
max_tokens=max_output_len,
|
||||||
"The capital of France is",
|
seed=seed,
|
||||||
"The future of AI is",
|
temperature=temperature)
|
||||||
"San Francisco is know for its",
|
|
||||||
"Facebook was created in 2004 by",
|
|
||||||
"Curious George is a",
|
|
||||||
"Python 3.11 brings improvements to its",
|
|
||||||
]
|
|
||||||
|
|
||||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
results.append({
|
||||||
|
"test":
|
||||||
|
"seeded_sampling",
|
||||||
|
"text": [choice.text for choice in completion.choices],
|
||||||
|
"finish_reason":
|
||||||
|
[choice.finish_reason for choice in completion.choices],
|
||||||
|
"usage":
|
||||||
|
completion.usage,
|
||||||
|
})
|
||||||
|
|
||||||
# If the test requires that we generated max_output_len tokens, then set the
|
n = len(results) // 2
|
||||||
# sampling params to ignore eos token.
|
arg1_results = results[:n]
|
||||||
ignore_eos = force_output_len
|
arg2_results = results[n:]
|
||||||
|
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
|
||||||
if seeded:
|
assert arg1_result == arg2_result, (
|
||||||
sampling_params = [
|
f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
|
||||||
SamplingParams(
|
f"{arg1_result=} != {arg2_result=}")
|
||||||
max_tokens=max_output_len,
|
|
||||||
ignore_eos=ignore_eos,
|
|
||||||
temperature=temperature,
|
|
||||||
seed=i,
|
|
||||||
) for i in range(len(prompts))
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
sampling_params = SamplingParams(
|
|
||||||
max_tokens=max_output_len,
|
|
||||||
ignore_eos=ignore_eos,
|
|
||||||
temperature=temperature,
|
|
||||||
)
|
|
||||||
|
|
||||||
(spec_batch_tokens, spec_batch_token_ids,
|
|
||||||
acceptance_rate) = get_output_from_llm_generator(test_llm_generator,
|
|
||||||
prompts, sampling_params)
|
|
||||||
|
|
||||||
(baseline_batch_tokens, baseline_batch_token_ids,
|
|
||||||
_) = get_output_from_llm_generator(baseline_llm_generator, prompts,
|
|
||||||
sampling_params)
|
|
||||||
|
|
||||||
assert len(baseline_batch_token_ids) == len(prompts)
|
|
||||||
assert len(spec_batch_token_ids) == len(prompts)
|
|
||||||
|
|
||||||
for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
|
|
||||||
spec_tokens) in enumerate(
|
|
||||||
zip(baseline_batch_token_ids, baseline_batch_tokens,
|
|
||||||
spec_batch_token_ids, spec_batch_tokens)):
|
|
||||||
if print_tokens:
|
|
||||||
print(f'{i=} {baseline_tokens=}')
|
|
||||||
print(f'{i=} {spec_tokens=}')
|
|
||||||
print(f'{i=} {baseline_token_ids=}')
|
|
||||||
print(f'{i=} {spec_token_ids=}')
|
|
||||||
assert baseline_token_ids == spec_token_ids
|
|
||||||
|
|
||||||
print(f'{acceptance_rate=}')
|
|
||||||
|
|
||||||
if ensure_all_accepted:
|
|
||||||
assert acceptance_rate == 1.0
|
|
||||||
|
|
||||||
if expected_acceptance_rate is not None:
|
|
||||||
assert acceptance_rate >= expected_acceptance_rate - 1e-2
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ correctess for the target model outputs.
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from .conftest import run_greedy_equality_correctness_test
|
from .conftest import run_equality_correctness_test
|
||||||
|
|
||||||
# main model
|
# main model
|
||||||
MAIN_MODEL = "JackFram/llama-68m"
|
MAIN_MODEL = "JackFram/llama-68m"
|
||||||
@@ -53,7 +53,7 @@ PRECISION = "float32"
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -68,15 +68,16 @@ PRECISION = "float32"
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
|
def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator, batch_size: int,
|
per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
"""Verify greedy equality with different batch size."""
|
batch_size: int, output_len: int,
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
seed: int):
|
||||||
test_llm_generator,
|
|
||||||
batch_size,
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
per_test_common_llm_kwargs,
|
||||||
force_output_len=True)
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size, output_len, seed)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -94,7 +95,7 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -109,17 +110,16 @@ def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
def test_eagle_e2e_greedy_correctness_cuda_graph(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality with cuda graph enabled and different
|
"""Verify greedy equality with cuda graph enabled and different
|
||||||
batch sizes."""
|
batch sizes."""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
max_output_len=output_len,
|
batch_size, output_len, seed)
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -140,7 +140,7 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -158,18 +158,17 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
def test_eagle_e2e_greedy_correctness_with_preemption(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
generation.
|
generation.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
max_output_len=output_len,
|
batch_size, output_len, seed)
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -185,7 +184,7 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -207,16 +206,17 @@ def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
|
def test_eagle_different_k(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify that eagle speculative decoding produces exact equality
|
"""Verify that eagle speculative decoding produces exact equality
|
||||||
to without spec decode with different values of num_speculative_tokens.
|
to without spec decode with different values of num_speculative_tokens.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
max_output_len=output_len,
|
batch_size, output_len, seed)
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -232,7 +232,7 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -250,17 +250,18 @@ def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator,
|
def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify that eagle speculative decoding produces exact equality
|
"""Verify that eagle speculative decoding produces exact equality
|
||||||
to without spec decode when speculation is disabled for large
|
to without spec decode when speculation is disabled for large
|
||||||
batch sizes.
|
batch sizes.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
max_output_len=output_len,
|
batch_size, output_len, seed)
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -4,7 +4,9 @@ other features, e.g. cuda graphs.
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from .conftest import run_greedy_equality_correctness_test
|
from .conftest import run_equality_correctness_test
|
||||||
|
|
||||||
|
MAIN_MODEL = "JackFram/llama-68m"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -15,7 +17,7 @@ from .conftest import run_greedy_equality_correctness_test
|
|||||||
|
|
||||||
# Verify equality when cuda graphs allowed.
|
# Verify equality when cuda graphs allowed.
|
||||||
"enforce_eager": False,
|
"enforce_eager": False,
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"per_test_common_llm_kwargs",
|
"per_test_common_llm_kwargs",
|
||||||
@@ -31,23 +33,27 @@ from .conftest import run_greedy_equality_correctness_test
|
|||||||
@pytest.mark.parametrize("batch_size", [8])
|
@pytest.mark.parametrize("batch_size", [8])
|
||||||
@pytest.mark.parametrize("output_len", [32])
|
@pytest.mark.parametrize("output_len", [32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
|
def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
|
||||||
batch_size, output_len):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int, seed: int):
|
||||||
"""Verify spec decode equality when cuda graphs are enabled.
|
"""Verify spec decode equality when cuda graphs are enabled.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(
|
run_equality_correctness_test(vllm_runner,
|
||||||
baseline_llm_generator,
|
common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size,
|
baseline_llm_kwargs,
|
||||||
max_output_len=output_len,
|
test_llm_kwargs,
|
||||||
force_output_len=True,
|
batch_size,
|
||||||
)
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -80,13 +86,19 @@ def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
|
|||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("batch_size", [2])
|
@pytest.mark.parametrize("batch_size", [2])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_speculative_model_quantization_config(baseline_llm_generator,
|
def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size: int):
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
|
batch_size: int, seed: int):
|
||||||
"""Verify spec decode works well with draft model quantization configs.
|
"""Verify spec decode works well with draft model quantization configs.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=32,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=32,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|||||||
@@ -7,42 +7,39 @@ import torch
|
|||||||
|
|
||||||
from vllm.utils import is_hip
|
from vllm.utils import is_hip
|
||||||
|
|
||||||
from .conftest import run_greedy_equality_correctness_test
|
from .conftest import run_equality_correctness_test_tp
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||||
reason="Need at least 2 GPUs to run the test.")
|
reason="Need at least 2 GPUs to run the test.")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[[
|
||||||
"model": "JackFram/llama-68m",
|
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"--enforce-eager",
|
||||||
|
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"--use-v2-block-manager",
|
||||||
"tensor_parallel_size": 2,
|
"--tensor-parallel-size",
|
||||||
|
"2"
|
||||||
# Use AsyncLLM engine, so that the engine runs in its own process.
|
]])
|
||||||
# Otherwise, since vLLM does not follow true SPMD, the test runner
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
|
||||||
# process will have both the engine and the rank0 worker. NCCL is not
|
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||||
# cleaned up properly, and its server host thread leaks, causing the
|
|
||||||
# second run of the test to fail with internal NCCL error.
|
|
||||||
"use_async": True,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
{
|
[
|
||||||
"speculative_model": "JackFram/llama-68m",
|
"--speculative-model",
|
||||||
"num_speculative_tokens": 3,
|
"JackFram/llama-68m",
|
||||||
},
|
"--num-speculative-tokens",
|
||||||
{
|
"3",
|
||||||
"speculative_model": "[ngram]",
|
],
|
||||||
"num_speculative_tokens": 5,
|
[
|
||||||
"ngram_prompt_lookup_max": 3,
|
"--speculative-model",
|
||||||
},
|
"[ngram]",
|
||||||
|
"--num-speculative-tokens",
|
||||||
|
"5",
|
||||||
|
"--ngram-prompt-lookup-max",
|
||||||
|
"3",
|
||||||
|
],
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [2])
|
@pytest.mark.parametrize("batch_size", [2])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -52,75 +49,75 @@ from .conftest import run_greedy_equality_correctness_test
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
|
def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int, seed: int):
|
||||||
"""Verify greedy equality when tensor parallelism is used.
|
"""Verify greedy equality when tensor parallelism is used.
|
||||||
"""
|
"""
|
||||||
if is_hip():
|
if is_hip():
|
||||||
pytest.skip("hip is not well-supported yet")
|
pytest.skip("hip is not well-supported yet")
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test_tp("JackFram/llama-68m",
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||||
reason="Need at least 2 GPUs to run the test.")
|
reason="Need at least 2 GPUs to run the test.")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[[
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"--enforce-eager",
|
||||||
|
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"--use_v2_block_manager",
|
||||||
"tensor_parallel_size": 2,
|
"--tensor_parallel_size",
|
||||||
|
"2",
|
||||||
# Use AsyncLLM engine, so that the engine runs in its own process.
|
|
||||||
# Otherwise, since vLLM does not follow true SPMD, the test runner
|
|
||||||
# process will have both the engine and the rank0 worker. NCCL is not
|
|
||||||
# cleaned up properly, and its server host thread leaks, causing the
|
|
||||||
# second run of the test to fail with internal NCCL error.
|
|
||||||
"use_async": True,
|
|
||||||
|
|
||||||
# precision
|
# precision
|
||||||
"dtype": "float32",
|
"--dtype",
|
||||||
}])
|
"bfloat16",
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
]])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
|
||||||
"per_test_common_llm_kwargs, test_llm_kwargs",
|
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||||
[
|
@pytest.mark.parametrize("model, test_llm_kwargs",
|
||||||
(
|
[("JackFram/llama-68m", [
|
||||||
{
|
"--speculative-model",
|
||||||
# Use a small model for a fast test.
|
"JackFram/llama-68m",
|
||||||
# Note this is repeated in the test body; to initialize a
|
"--num_speculative-tokens",
|
||||||
# tokenizer.
|
"5",
|
||||||
"model": "JackFram/llama-68m",
|
"--speculative-draft-tensor-parallel-size",
|
||||||
},
|
"1",
|
||||||
{
|
]),
|
||||||
"speculative_model": "JackFram/llama-68m",
|
("ibm-granite/granite-3b-code-instruct", [
|
||||||
"num_speculative_tokens": 5,
|
"--speculative-model",
|
||||||
"speculative_draft_tensor_parallel_size": 1,
|
"ibm-granite/granite-3b-code-instruct",
|
||||||
}),
|
"--num_speculative-tokens",
|
||||||
({
|
"5",
|
||||||
"model": "ibm-granite/granite-3b-code-instruct",
|
"--speculative-draft-tensor-parallel-size",
|
||||||
}, {
|
"1",
|
||||||
"speculative_model":
|
])])
|
||||||
"ibm-granite/granite-3b-code-instruct-accelerator",
|
|
||||||
"num_speculative_tokens": 5,
|
|
||||||
"speculative_draft_tensor_parallel_size": 1,
|
|
||||||
})
|
|
||||||
])
|
|
||||||
@pytest.mark.parametrize("batch_size", [2])
|
@pytest.mark.parametrize("batch_size", [2])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
|
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
|
||||||
baseline_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size: int):
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int,
|
||||||
|
seed: int):
|
||||||
"""Verify spec decode works well with smaller tp for draft models.
|
"""Verify spec decode works well with smaller tp for draft models.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test_tp(model,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=32,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=32,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|||||||
@@ -2,98 +2,97 @@
|
|||||||
tensor parallelism.
|
tensor parallelism.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from .conftest import run_greedy_equality_correctness_test
|
from .conftest import run_equality_correctness_test_tp
|
||||||
|
|
||||||
|
MAIN_MODEL = "JackFram/llama-68m"
|
||||||
|
SPEC_MODEL = "JackFram/llama-68m"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||||
reason="Need at least 4 GPUs to run the test.")
|
reason="Need at least 4 GPUs to run the test.")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[[
|
||||||
# Use a small model for a fast test.
|
|
||||||
# Note this is repeated in the test body; to initialize a tokenizer.
|
|
||||||
"model": "JackFram/llama-68m",
|
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"--enforce_eager",
|
||||||
|
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"--use-v2-block-manager",
|
||||||
"tensor_parallel_size": 4,
|
"--tensor-parallel-size",
|
||||||
|
"4",
|
||||||
# Use AsyncLLM engine, so that the engine runs in its own process.
|
]])
|
||||||
# Otherwise, since vLLM does not follow true SPMD, the test runner
|
|
||||||
# process will have both the engine and the rank0 worker. NCCL is not
|
|
||||||
# cleaned up properly, and its server host thread leaks, causing the
|
|
||||||
# second run of the test to fail with internal NCCL error.
|
|
||||||
"use_async": True,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||||
{
|
[
|
||||||
"speculative_model": "JackFram/llama-68m",
|
"--speculative-model",
|
||||||
"num_speculative_tokens": 5,
|
f"{SPEC_MODEL}",
|
||||||
},
|
"--num-speculative-tokens",
|
||||||
|
"5",
|
||||||
|
],
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"test_llm_kwargs",
|
"test_llm_kwargs",
|
||||||
[
|
[
|
||||||
#TODO(wooyeon): add spec_draft_dp=2 case
|
#TODO(wooyeon): add spec_draft_dp=2 case
|
||||||
{
|
[
|
||||||
"speculative_draft_tensor_parallel_size": 1,
|
"--speculative-draft-tensor-parallel-size",
|
||||||
},
|
"1",
|
||||||
|
],
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [2])
|
@pytest.mark.parametrize("batch_size", [2])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
|
def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
|
||||||
baseline_llm_generator,
|
per_test_common_llm_kwargs,
|
||||||
batch_size: int):
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int,
|
||||||
|
seed: int):
|
||||||
"""Verify spec decode works well with smaller tp for draft models.
|
"""Verify spec decode works well with smaller tp for draft models.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test_tp(MAIN_MODEL,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=32,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=32,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||||
reason="Need at least 4 GPUs to run the test.")
|
reason="Need at least 4 GPUs to run the test.")
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[[
|
||||||
"model": "JackFram/llama-160m",
|
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"--enforce-eager",
|
||||||
|
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"--use-v2-block-manager",
|
||||||
"tensor_parallel_size": 4,
|
"--tensor-parallel-size",
|
||||||
|
"4",
|
||||||
# Use AsyncLLM engine, so that the engine runs in its own process.
|
]])
|
||||||
# Otherwise, since vLLM does not follow true SPMD, the test runner
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
|
||||||
# process will have both the engine and the rank0 worker. NCCL is not
|
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
|
||||||
# cleaned up properly, and its server host thread leaks, causing the
|
|
||||||
# second run of the test to fail with internal NCCL error.
|
|
||||||
"use_async": True,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"test_llm_kwargs",
|
"test_llm_kwargs",
|
||||||
[
|
[
|
||||||
{
|
[
|
||||||
"speculative_model": "JackFram/llama-68m",
|
"--speculative-model",
|
||||||
"num_speculative_tokens": 5,
|
f"{SPEC_MODEL}",
|
||||||
|
"--num-speculative-tokens",
|
||||||
|
"5",
|
||||||
|
|
||||||
# Artificially limit the draft model max model len; this forces vLLM
|
# Artificially limit the draft model max model len; this forces vLLM
|
||||||
# to skip speculation once the sequences grow beyond 32-k tokens.
|
# to skip speculation once the sequences grow beyond 32-k tokens.
|
||||||
"speculative_max_model_len": 32,
|
"--speculative-max-model-len",
|
||||||
},
|
"32",
|
||||||
|
],
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [8])
|
@pytest.mark.parametrize("batch_size", [8])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -105,8 +104,9 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
|
|||||||
64,
|
64,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_skip_speculation(baseline_llm_generator, test_llm_generator,
|
def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int, seed: int):
|
||||||
"""Verify job failure with RuntimeError when all sequences skip speculation.
|
"""Verify job failure with RuntimeError when all sequences skip speculation.
|
||||||
We do this by setting the max model len of the draft model to an
|
We do this by setting the max model len of the draft model to an
|
||||||
artificially low value, such that when the sequences grow beyond it, they
|
artificially low value, such that when the sequences grow beyond it, they
|
||||||
@@ -114,9 +114,13 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
|
|||||||
|
|
||||||
TODO: fix it to pass without raising Error. (#5814)
|
TODO: fix it to pass without raising Error. (#5814)
|
||||||
"""
|
"""
|
||||||
with pytest.raises(RuntimeError):
|
with pytest.raises(openai.APIConnectionError):
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test_tp(MAIN_MODEL,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|||||||
@@ -1,24 +1,22 @@
|
|||||||
import math
|
|
||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
from .conftest import get_logprobs_from_llm_generator
|
from .conftest import run_logprob_correctness_test
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
|
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"use_v2_block_manager": True,
|
||||||
"max_logprobs": 6,
|
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -36,64 +34,29 @@ from .conftest import get_logprobs_from_llm_generator
|
|||||||
7,
|
7,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
|
@pytest.mark.parametrize("logprobs", [1, 6])
|
||||||
batch_size: int, output_len: int):
|
def test_logprobs_equality(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int, logprobs: int):
|
||||||
"""Verify output logprobs are equal with and without speculative decoding.
|
"""Verify output logprobs are equal with and without speculative decoding.
|
||||||
"""
|
"""
|
||||||
run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
run_logprob_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
temperature=0.0,
|
||||||
|
logprobs=logprobs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
|
||||||
"enforce_eager": True,
|
|
||||||
|
|
||||||
# Required for spec decode.
|
|
||||||
"use_v2_block_manager": True,
|
|
||||||
"max_logprobs": 6,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("test_llm_kwargs",
|
|
||||||
[{
|
|
||||||
"speculative_model": "JackFram/llama-160m",
|
|
||||||
"num_speculative_tokens": 3,
|
|
||||||
"disable_logprobs_during_spec_decoding": False,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("batch_size", [1])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [6])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"output_len",
|
|
||||||
[
|
|
||||||
# Use smaller output len for fast test.
|
|
||||||
7,
|
|
||||||
])
|
|
||||||
@pytest.mark.parametrize("seed", [1])
|
|
||||||
def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
|
|
||||||
batch_size: int, output_len: int,
|
|
||||||
num_logprobs: int):
|
|
||||||
"""Verify output logprobs are equal with and without spec decode.
|
|
||||||
This specifies a number of logprobs >1.
|
|
||||||
"""
|
|
||||||
run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
|
||||||
test_llm_generator,
|
|
||||||
batch_size,
|
|
||||||
max_output_len=output_len,
|
|
||||||
force_output_len=True,
|
|
||||||
logprob_rank=num_logprobs)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"common_llm_kwargs",
|
|
||||||
[{
|
|
||||||
"model": "JackFram/llama-68m",
|
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -121,21 +84,29 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
|
@pytest.mark.parametrize("logprobs", [1, 6])
|
||||||
batch_size: int, output_len: int):
|
def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int,
|
||||||
|
output_len: int, seed: int, logprobs: int):
|
||||||
"""Veriy logprob greedy equality with different speculation lens.
|
"""Veriy logprob greedy equality with different speculation lens.
|
||||||
"""
|
"""
|
||||||
run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
run_logprob_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
temperature=0.0,
|
||||||
|
logprobs=logprobs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -164,22 +135,30 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_logprobs_when_skip_speculation(baseline_llm_generator,
|
@pytest.mark.parametrize("logprobs", [1])
|
||||||
test_llm_generator, batch_size: int,
|
def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||||
output_len: int):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int, logprobs: int):
|
||||||
"""Verify logprobs greedy equality when some sequences skip speculation.
|
"""Verify logprobs greedy equality when some sequences skip speculation.
|
||||||
"""
|
"""
|
||||||
run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
run_logprob_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
output_len,
|
||||||
|
seed,
|
||||||
|
temperature=0.0,
|
||||||
|
logprobs=logprobs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -203,19 +182,17 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
|
@pytest.mark.parametrize("logprobs", [6])
|
||||||
batch_size: int, output_len: int):
|
def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int, logprobs: int):
|
||||||
"""Verify at least one logprob result has num_logprobs+1, which tests the
|
"""Verify at least one logprob result has num_logprobs+1, which tests the
|
||||||
case where the sampled token is not in top-k logprobs.
|
case where the sampled token is not in top-k logprobs.
|
||||||
|
|
||||||
Ideally, this test should validate equality with non-spec by getting
|
Ideally, this test should validate equality with non-spec by getting
|
||||||
logprobs. This is left as future improvement.
|
logprobs. This is left as future improvement.
|
||||||
"""
|
"""
|
||||||
batch_size = 8
|
|
||||||
max_output_len = output_len
|
|
||||||
force_output_len = True
|
|
||||||
logprob_rank = 5
|
|
||||||
|
|
||||||
temperature = 1.0
|
temperature = 1.0
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -231,129 +208,40 @@ def test_logprobs_temp_1(baseline_llm_generator, test_llm_generator,
|
|||||||
|
|
||||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||||
|
|
||||||
# If the test requires that we generated max_output_len tokens, then set the
|
|
||||||
# sampling params to ignore eos token.
|
|
||||||
ignore_eos = force_output_len
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
max_tokens=max_output_len,
|
max_tokens=output_len,
|
||||||
ignore_eos=ignore_eos,
|
ignore_eos=True,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
logprobs=logprob_rank,
|
logprobs=logprobs,
|
||||||
)
|
)
|
||||||
|
|
||||||
spec_batch_logprobs = get_logprobs_from_llm_generator(
|
sd_args = {
|
||||||
test_llm_generator, prompts, sampling_params)
|
**common_llm_kwargs,
|
||||||
|
**per_test_common_llm_kwargs,
|
||||||
|
**test_llm_kwargs,
|
||||||
|
}
|
||||||
|
|
||||||
|
with vllm_runner(**sd_args) as vllm_model:
|
||||||
|
sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
|
||||||
|
|
||||||
num_returned_logprobs = [
|
num_returned_logprobs = [
|
||||||
len(logprob_dict) for seq_logprobs in spec_batch_logprobs
|
len(seq_logprobs) for seq_logprobs in sd_outputs[-1]
|
||||||
for logprob_dict in seq_logprobs
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Assert one of the returned logprobs has > num_logprobs (indicating the
|
# Assert one of the returned logprobs has > num_logprobs (indicating the
|
||||||
# sampled token is not in top-k).
|
# sampled token is not in top-k).
|
||||||
assert any([
|
assert any(
|
||||||
num_returned > logprob_rank for num_returned in num_returned_logprobs
|
[num_returned > logprobs for num_returned in num_returned_logprobs])
|
||||||
])
|
|
||||||
|
|
||||||
|
|
||||||
def run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
|
||||||
test_llm_generator,
|
|
||||||
batch_size,
|
|
||||||
max_output_len,
|
|
||||||
force_output_len: bool,
|
|
||||||
logprob_rank: int = 1):
|
|
||||||
"""Helper method that compares the logprobs outputs of both the baseline LLM
|
|
||||||
and the test LLM. It asserts greedy equality of the logprobs when the
|
|
||||||
temperature is zero.
|
|
||||||
"""
|
|
||||||
temperature = 0.0
|
|
||||||
|
|
||||||
prompts = [
|
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
"San Francisco is know for its",
|
|
||||||
"Facebook was created in 2004 by",
|
|
||||||
"Curious George is a",
|
|
||||||
"Python 3.11 brings improvements to its",
|
|
||||||
]
|
|
||||||
|
|
||||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
|
||||||
|
|
||||||
# If the test requires that we generated max_output_len tokens, then set the
|
|
||||||
# sampling params to ignore eos token.
|
|
||||||
ignore_eos = force_output_len
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
|
||||||
max_tokens=max_output_len,
|
|
||||||
ignore_eos=ignore_eos,
|
|
||||||
temperature=temperature,
|
|
||||||
logprobs=logprob_rank,
|
|
||||||
)
|
|
||||||
|
|
||||||
spec_batch_logprobs = get_logprobs_from_llm_generator(
|
|
||||||
test_llm_generator, prompts, sampling_params)
|
|
||||||
baseline_batch_logprobs = get_logprobs_from_llm_generator(
|
|
||||||
baseline_llm_generator, prompts, sampling_params)
|
|
||||||
|
|
||||||
assert len(baseline_batch_logprobs) == len(prompts)
|
|
||||||
assert len(spec_batch_logprobs) == len(prompts)
|
|
||||||
|
|
||||||
# For each sequence in the batch.
|
|
||||||
for i, (baseline_logprobs, spec_logprobs) in enumerate(
|
|
||||||
zip(baseline_batch_logprobs, spec_batch_logprobs)):
|
|
||||||
assert len(spec_logprobs) == len(baseline_logprobs)
|
|
||||||
|
|
||||||
# For each generated position of the sequence.
|
|
||||||
for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
|
|
||||||
zip(spec_logprobs, baseline_logprobs)):
|
|
||||||
|
|
||||||
# Map rank to token/logprob in spec output.
|
|
||||||
spec_rank_to_token_id = {
|
|
||||||
value.rank: key
|
|
||||||
for key, value in spec_pos_logprobs.items()
|
|
||||||
}
|
|
||||||
spec_rank_to_logprob = {
|
|
||||||
value.rank: value.logprob
|
|
||||||
for key, value in spec_pos_logprobs.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Map rank to token/logprob in baseline output.
|
|
||||||
baseline_rank_to_token_id = {
|
|
||||||
value.rank: key
|
|
||||||
for key, value in baseline_pos_logprobs.items()
|
|
||||||
}
|
|
||||||
baseline_rank_to_logprob = {
|
|
||||||
value.rank: value.logprob
|
|
||||||
for key, value in baseline_pos_logprobs.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Assert set of ranks returned is equal.
|
|
||||||
assert set(spec_rank_to_token_id.keys()) == set(
|
|
||||||
baseline_rank_to_token_id.keys())
|
|
||||||
|
|
||||||
# Assert each logprob/token id is correct, keyed by rank.
|
|
||||||
for rank in sorted(set(spec_rank_to_token_id.keys())):
|
|
||||||
assert spec_rank_to_token_id[
|
|
||||||
rank] == baseline_rank_to_token_id[rank], f"{rank}"
|
|
||||||
assert math.isclose(
|
|
||||||
a=spec_rank_to_logprob[rank],
|
|
||||||
b=baseline_rank_to_logprob[rank],
|
|
||||||
abs_tol=1e-1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
# Required for spec decode.
|
# Required for spec decode.
|
||||||
"use_v2_block_manager": True,
|
"use_v2_block_manager": True,
|
||||||
"max_logprobs": 6,
|
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -364,57 +252,28 @@ def run_greedy_logprobs_correctness_test(baseline_llm_generator,
|
|||||||
"disable_logprobs_during_spec_decoding": True,
|
"disable_logprobs_during_spec_decoding": True,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_logprobs_disabled(baseline_llm_generator, test_llm_generator):
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"output_len",
|
||||||
|
[
|
||||||
|
# Use smaller output len for fast test.
|
||||||
|
32,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("logprobs", [0])
|
||||||
|
def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int, logprobs: int):
|
||||||
"""Check the behavior when logprobs are disabled.
|
"""Check the behavior when logprobs are disabled.
|
||||||
Token choices should match with the base model.
|
Token choices should match with the base model.
|
||||||
"""
|
"""
|
||||||
prompts = [
|
run_logprob_correctness_test(vllm_runner,
|
||||||
"Hello, my name is",
|
common_llm_kwargs,
|
||||||
"The president of the United States is",
|
per_test_common_llm_kwargs,
|
||||||
"The capital of France is",
|
baseline_llm_kwargs,
|
||||||
"The future of AI is",
|
test_llm_kwargs,
|
||||||
"San Francisco is know for its",
|
batch_size,
|
||||||
"Facebook was created in 2004 by",
|
output_len,
|
||||||
"Curious George is a",
|
seed,
|
||||||
"Python 3.11 brings improvements to its",
|
temperature=0.0,
|
||||||
]
|
logprobs=logprobs)
|
||||||
|
|
||||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(4))]
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
|
||||||
# Use smaller output len for fast test
|
|
||||||
max_tokens=7,
|
|
||||||
ignore_eos=True,
|
|
||||||
temperature=0.0,
|
|
||||||
logprobs=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
spec_batch_logprobs = get_logprobs_from_llm_generator(
|
|
||||||
test_llm_generator, prompts, sampling_params)
|
|
||||||
baseline_batch_logprobs = get_logprobs_from_llm_generator(
|
|
||||||
baseline_llm_generator, prompts, sampling_params)
|
|
||||||
|
|
||||||
assert len(baseline_batch_logprobs) == len(prompts)
|
|
||||||
assert len(spec_batch_logprobs) == len(prompts)
|
|
||||||
|
|
||||||
# For each sequence in the batch.
|
|
||||||
for _, (baseline_logprobs, spec_logprobs) in enumerate(
|
|
||||||
zip(baseline_batch_logprobs, spec_batch_logprobs)):
|
|
||||||
assert len(spec_logprobs) == len(baseline_logprobs)
|
|
||||||
|
|
||||||
# For each generated position of the sequence.
|
|
||||||
for _, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
|
|
||||||
zip(spec_logprobs, baseline_logprobs)):
|
|
||||||
|
|
||||||
assert len(spec_pos_logprobs) == 1
|
|
||||||
spec_top_token_id = list(spec_pos_logprobs)[0]
|
|
||||||
|
|
||||||
spec_top_logprob = spec_pos_logprobs[spec_top_token_id]
|
|
||||||
assert spec_top_logprob.logprob == 0.0
|
|
||||||
assert spec_top_logprob.rank == -1
|
|
||||||
|
|
||||||
# check that the chosen token matches the base model
|
|
||||||
baseline_logprob = baseline_pos_logprobs[spec_top_token_id]
|
|
||||||
assert baseline_logprob.rank == 1
|
|
||||||
assert spec_top_logprob.decoded_token \
|
|
||||||
== baseline_logprob.decoded_token
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ correctess for the target model outputs.
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from .conftest import run_greedy_equality_correctness_test
|
from .conftest import run_equality_correctness_test
|
||||||
|
|
||||||
# main model
|
# main model
|
||||||
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
|
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
|
||||||
@@ -55,7 +55,7 @@ PRECISION = "float32"
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -70,15 +70,21 @@ PRECISION = "float32"
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
|
def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||||
test_llm_generator, batch_size: int,
|
per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality with different batch size."""
|
"""Verify greedy equality with different batch size."""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -96,7 +102,7 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -111,17 +117,21 @@ def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
def test_medusa_e2e_greedy_correctness_cuda_graph(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality with cuda graph enabled and different
|
"""Verify greedy equality with cuda graph enabled and different
|
||||||
batch sizes."""
|
batch sizes."""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -142,7 +152,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -160,18 +170,22 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
def test_medusa_e2e_greedy_correctness_with_preemption(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
generation.
|
generation.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -187,7 +201,7 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -209,16 +223,22 @@ def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
|
def test_medusa_different_k(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify that medusa speculative decoding produces exact equality
|
"""Verify that medusa speculative decoding produces exact equality
|
||||||
to without spec decode with different values of num_speculative_tokens.
|
to without spec decode with different values of num_speculative_tokens.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -234,7 +254,7 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -252,17 +272,23 @@ def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_medusa_disable_queue(baseline_llm_generator, test_llm_generator,
|
def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int,
|
||||||
|
output_len: int, seed: int):
|
||||||
"""Verify that medusa speculative decoding produces exact equality
|
"""Verify that medusa speculative decoding produces exact equality
|
||||||
to without spec decode when speculation is disabled for large
|
to without spec decode when speculation is disabled for large
|
||||||
batch sizes.
|
batch sizes.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -25,8 +25,7 @@ import pytest
|
|||||||
|
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
|
from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
|
||||||
|
|
||||||
from .conftest import (run_equality_correctness_test,
|
from .conftest import run_equality_correctness_test
|
||||||
run_greedy_equality_correctness_test)
|
|
||||||
|
|
||||||
# main model
|
# main model
|
||||||
MAIN_MODEL = "JackFram/llama-160m"
|
MAIN_MODEL = "JackFram/llama-160m"
|
||||||
@@ -58,7 +57,7 @@ PRECISION = "float32"
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -72,14 +71,21 @@ PRECISION = "float32"
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
|
def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality with different batch size."""
|
"""Verify greedy equality with different batch size."""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -98,7 +104,7 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -110,17 +116,21 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
|
|||||||
@pytest.mark.parametrize("output_len", [2048])
|
@pytest.mark.parametrize("output_len", [2048])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
|
def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int, seed: int):
|
||||||
"""Verify acceptance rate with different batch size and large output
|
"""Verify acceptance rate with different batch size and large output
|
||||||
length."""
|
length."""
|
||||||
run_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
batch_size,
|
batch_size,
|
||||||
max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
seeded=True,
|
seed=seed,
|
||||||
force_output_len=True,
|
|
||||||
expected_acceptance_rate=0.48)
|
expected_acceptance_rate=0.48)
|
||||||
|
|
||||||
|
|
||||||
@@ -140,7 +150,7 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
|
|
||||||
# Speculative model
|
# Speculative model
|
||||||
"speculative_model": SPEC_MODEL,
|
"speculative_model": SPEC_MODEL,
|
||||||
@@ -151,28 +161,35 @@ def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
|
|||||||
@pytest.mark.parametrize("output_len", [64])
|
@pytest.mark.parametrize("output_len", [64])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("temperature", [0.1, 1.0])
|
@pytest.mark.parametrize("temperature", [0.1, 1.0])
|
||||||
@pytest.mark.parametrize("seed", [None])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
|
def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
batch_size: int, output_len: int,
|
batch_size: int, output_len: int,
|
||||||
temperature: float):
|
temperature: float, seed: int):
|
||||||
"""Verify seeded runs produce the same output."""
|
"""Verify seeded runs produce the same output."""
|
||||||
run_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
batch_size,
|
batch_size,
|
||||||
max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
seeded=True,
|
seed=seed)
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
# Ensure this same test does fail if we _don't_ include per-request seeds
|
# Ensure this same test does fail if we _don't_ include per-request seeds
|
||||||
with pytest.raises(AssertionError):
|
with pytest.raises(AssertionError):
|
||||||
run_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs,
|
||||||
batch_size,
|
batch_size,
|
||||||
max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
seeded=False,
|
seed=seed,
|
||||||
force_output_len=True)
|
disable_seed=True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -193,7 +210,7 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -210,18 +227,22 @@ def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
def test_mlp_e2e_greedy_correctness_with_preemption(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
generation.
|
generation.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -242,7 +263,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -259,10 +280,10 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
|
def test_mlp_e2e_greedy_correctness_with_padding(
|
||||||
test_llm_generator,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
output_len: int):
|
seed: int):
|
||||||
"""Verify greedy equality when the vocab dimension is padded
|
"""Verify greedy equality when the vocab dimension is padded
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -273,11 +294,15 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
|
|||||||
with patch(
|
with patch(
|
||||||
"vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
|
"vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
|
||||||
patched_pad_vocab_size):
|
patched_pad_vocab_size):
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -293,7 +318,7 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -315,16 +340,22 @@ def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
|
def test_mlp_different_k(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, seed: int,
|
||||||
|
output_len: int):
|
||||||
"""Verify that mlp speculative decoding produces exact equality
|
"""Verify that mlp speculative decoding produces exact equality
|
||||||
to without spec decode with different values of num_speculative_tokens.
|
to without spec decode with different values of num_speculative_tokens.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -340,7 +371,7 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
"dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# Main model
|
# Main model
|
||||||
"model": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -357,14 +388,20 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
|
def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
|
||||||
batch_size: int, output_len: int):
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, seed: int,
|
||||||
|
output_len: int):
|
||||||
"""Verify that mlp speculative decoding produces exact equality
|
"""Verify that mlp speculative decoding produces exact equality
|
||||||
to without spec decode when speculation is disabled for large
|
to without spec decode when speculation is disabled for large
|
||||||
batch sizes.
|
batch sizes.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|||||||
@@ -41,8 +41,9 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
|
from ...utils import fork_new_process_for_each_test
|
||||||
from .conftest import (get_output_from_llm_generator,
|
from .conftest import (get_output_from_llm_generator,
|
||||||
run_greedy_equality_correctness_test)
|
run_equality_correctness_test)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -73,6 +74,7 @@ from .conftest import (get_output_from_llm_generator,
|
|||||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||||
batch_size: int):
|
batch_size: int):
|
||||||
"""Run generation with speculative decoding on a batch. Verify the engine
|
"""Run generation with speculative decoding on a batch. Verify the engine
|
||||||
@@ -116,44 +118,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
|||||||
assert actual_tokens.strip() == expected_tokens.strip()
|
assert actual_tokens.strip() == expected_tokens.strip()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"common_llm_kwargs",
|
|
||||||
[{
|
|
||||||
# Use a small model for a fast test.
|
|
||||||
# Note this is repeated in the test body; to initialize a tokenizer.
|
|
||||||
"model": "JackFram/llama-68m",
|
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
|
||||||
"enforce_eager": True,
|
|
||||||
|
|
||||||
# Required for spec decode.
|
|
||||||
"use_v2_block_manager": True,
|
|
||||||
|
|
||||||
# Use AsyncLLM engine
|
|
||||||
"use_async": True,
|
|
||||||
}])
|
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
|
||||||
{
|
|
||||||
"speculative_model": "JackFram/llama-68m",
|
|
||||||
"num_speculative_tokens": 5,
|
|
||||||
},
|
|
||||||
])
|
|
||||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
|
||||||
@pytest.mark.parametrize("batch_size", [2])
|
|
||||||
@pytest.mark.parametrize("seed", [1])
|
|
||||||
def test_spec_decode_e2e_with_async_engine(test_llm_generator,
|
|
||||||
baseline_llm_generator,
|
|
||||||
batch_size: int):
|
|
||||||
"""Verify spec decode works well with async LLM engine.
|
|
||||||
"""
|
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
|
||||||
test_llm_generator,
|
|
||||||
batch_size,
|
|
||||||
max_output_len=32,
|
|
||||||
force_output_len=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
@@ -172,10 +136,10 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
|
|||||||
# Try two different tiny base models.
|
# Try two different tiny base models.
|
||||||
# Note that one is equal to the draft model, another isn't.
|
# Note that one is equal to the draft model, another isn't.
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -189,13 +153,15 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
|
|||||||
"output_len",
|
"output_len",
|
||||||
[
|
[
|
||||||
# Use long output len for the small model test.
|
# Use long output len for the small model test.
|
||||||
1536,
|
10,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1])
|
@pytest.mark.parametrize("batch_size", [1])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality on a tiny model with batch size of one.
|
"""Verify greedy equality on a tiny model with batch size of one.
|
||||||
|
|
||||||
Since this test is cheaper than other e2e correctness tests, we generate
|
Since this test is cheaper than other e2e correctness tests, we generate
|
||||||
@@ -204,14 +170,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
|||||||
When the draft model is the same as the target model, we further check
|
When the draft model is the same as the target model, we further check
|
||||||
whether all speculative tokens are accepted.
|
whether all speculative tokens are accepted.
|
||||||
"""
|
"""
|
||||||
ensure_all_accepted = test_llm_generator.same_draft_target_model
|
ensure_all_accepted = per_test_common_llm_kwargs.get(
|
||||||
run_greedy_equality_correctness_test(
|
"model_name") == test_llm_kwargs.get("speculative_model")
|
||||||
baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True,
|
test_llm_kwargs,
|
||||||
ensure_all_accepted=ensure_all_accepted)
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0,
|
||||||
|
ensure_all_accepted=ensure_all_accepted)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -232,10 +202,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
|||||||
# Try two different tiny base models.
|
# Try two different tiny base models.
|
||||||
# Note that one is equal to the draft model, another isn't.
|
# Note that one is equal to the draft model, another isn't.
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -253,16 +223,22 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [64])
|
@pytest.mark.parametrize("batch_size", [64])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality on a tiny model and large batch size.
|
"""Verify greedy equality on a tiny model and large batch size.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -280,10 +256,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
|||||||
# Try two different tiny base models.
|
# Try two different tiny base models.
|
||||||
# Note that one is equal to the draft model, another isn't.
|
# Note that one is equal to the draft model, another isn't.
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -298,24 +274,31 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [32])
|
@pytest.mark.parametrize("batch_size", [32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
max_output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||||
|
max_output_len: int, seed: int):
|
||||||
"""Verify greedy equality on a tiny model, with a large batch size, and when
|
"""Verify greedy equality on a tiny model, with a large batch size, and when
|
||||||
sampling respects the EOS token.
|
sampling respects the EOS token.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=False)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0,
|
||||||
|
ignore_eos=False)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
# A "real" model (not tiny).
|
# A "real" model (not tiny).
|
||||||
"model": "meta-llama/Llama-2-7b-chat-hf",
|
"model_name": "meta-llama/Llama-2-7b-chat-hf",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -342,24 +325,30 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
|||||||
256,
|
256,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality on a "real" model and batch size of 1. This is
|
"""Verify greedy equality on a "real" model and batch size of 1. This is
|
||||||
separate from large BS tests to make identifying the source of bugs easier.
|
separate from large BS tests to make identifying the source of bugs easier.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
# A "real" model (not tiny).
|
# A "real" model (not tiny).
|
||||||
"model": "meta-llama/Llama-2-7b-chat-hf",
|
"model_name": "meta-llama/Llama-2-7b-chat-hf",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -386,17 +375,23 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
|||||||
64,
|
64,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality with a "real" model on a nontrivial batch size.
|
"""Verify greedy equality with a "real" model on a nontrivial batch size.
|
||||||
This is the closest test to a real production workload.
|
This is the closest test to a real production workload.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -415,7 +410,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
|||||||
}])
|
}])
|
||||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||||
{
|
{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
@@ -433,23 +428,29 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@fork_new_process_for_each_test
|
||||||
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||||
baseline_llm_generator, test_llm_generator, batch_size: int,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
output_len: int):
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
generation.
|
generation.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -487,22 +488,29 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_spec_decode_different_block_size(baseline_llm_generator,
|
@fork_new_process_for_each_test
|
||||||
test_llm_generator, batch_size: int,
|
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||||
output_len: int):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality over different block sizes.
|
"""Verify greedy equality over different block sizes.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -534,24 +542,31 @@ def test_spec_decode_different_block_size(baseline_llm_generator,
|
|||||||
64,
|
64,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_skip_speculation(baseline_llm_generator, test_llm_generator,
|
@fork_new_process_for_each_test
|
||||||
batch_size: int, output_len: int):
|
def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality when some (or all) sequences skip speculation.
|
"""Verify greedy equality when some (or all) sequences skip speculation.
|
||||||
We do this by setting the max model len of the draft model to an
|
We do this by setting the max model len of the draft model to an
|
||||||
artificially low value, such that when the sequences grow beyond it, they
|
artificially low value, such that when the sequences grow beyond it, they
|
||||||
are skipped in speculative decoding.
|
are skipped in speculative decoding.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -571,21 +586,28 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
|
|||||||
@pytest.mark.parametrize("batch_size", [8])
|
@pytest.mark.parametrize("batch_size", [8])
|
||||||
@pytest.mark.parametrize("output_len", [10])
|
@pytest.mark.parametrize("output_len", [10])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_disable_speculation(baseline_llm_generator, test_llm_generator,
|
@fork_new_process_for_each_test
|
||||||
batch_size: int, output_len: int):
|
def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||||
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify greedy equality when all sequences disable speculation.
|
"""Verify greedy equality when all sequences disable speculation.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-68m",
|
"model_name": "JackFram/llama-68m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -613,22 +635,28 @@ def test_disable_speculation(baseline_llm_generator, test_llm_generator,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
|
@fork_new_process_for_each_test
|
||||||
output_len: int):
|
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||||
|
output_len: int, seed: int):
|
||||||
"""Verify that speculative decoding produces exact equality to without spec
|
"""Verify that speculative decoding produces exact equality to without spec
|
||||||
decode with many different values of k.
|
decode with many different values of k.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"model": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
|
|
||||||
# Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
"enforce_eager": True,
|
"enforce_eager": True,
|
||||||
@@ -657,15 +685,22 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
def test_typical_acceptance_sampling(baseline_llm_generator,
|
@fork_new_process_for_each_test
|
||||||
test_llm_generator, batch_size: int,
|
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
|
||||||
output_len: int):
|
per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
batch_size: int, output_len: int,
|
||||||
|
seed: int):
|
||||||
"""Verify that speculative decoding produces exact equality to without spec
|
"""Verify that speculative decoding produces exact equality to without spec
|
||||||
decode with TypicalAcceptanceSampler as the draft token acceptance
|
decode with TypicalAcceptanceSampler as the draft token acceptance
|
||||||
sampling method.
|
sampling method.
|
||||||
"""
|
"""
|
||||||
run_greedy_equality_correctness_test(baseline_llm_generator,
|
run_equality_correctness_test(vllm_runner,
|
||||||
test_llm_generator,
|
common_llm_kwargs,
|
||||||
batch_size,
|
per_test_common_llm_kwargs,
|
||||||
max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
force_output_len=True)
|
test_llm_kwargs,
|
||||||
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user