Compare commits
41 commits: v0.6.1 ... v0.6.1.post1

SHA1: 9ba0817ff1, 18e9e1f7b3, f57092c00b, a84e598e21, 0a4806f0a9, ecd7a1d5b6, a2469127db, 06311e2956, cab69a15e4, 9b4a3b235e, acda0b35d0, ba77527955, 6821020109, 8427550488, 3f79bc3d1a, 40c396533d, 5ec9c0fb3c, 8f44a92d85, 360ddbd37e, a480939e8e, d31174a4e1, b61bd98f90, c16369455f, 019877253b, 551ce01078, a6c0f3658d, f2e263b801, 1f0c75afa9, 8a23e93302, c6202daeed, e56bf27741, 520ca380ae, 7de49aa86c, 42ffba11ad, 295c4730a8, 1bf2dd9df0, 5a60699c45, b6c75e1cf2, b71c956deb, f842a7aff1, a65cb16067
@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
    --ignore=tests/models/test_oot_registration.py \
    --ignore=tests/models/test_registry.py \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/test_jamba.py \
    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
  pytest -v -s tests/models/decoder_only/language \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/decoder_only/language/test_jamba.py \
    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# Run compressed-tensor test
docker exec cpu-test bash -c "
@@ -50,6 +50,7 @@ steps:
|
||||
- tests/worker
|
||||
commands:
|
||||
- pytest -v -s async_engine # Async Engine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
@@ -91,7 +92,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -162,15 +163,6 @@ steps:
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference_encoder_decoder.py
|
||||
|
||||
- label: Models Test # 1hr10min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
|
||||
|
||||
- label: torch compile integration test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -178,14 +170,6 @@ steps:
|
||||
- pytest -v -s ./compile/test_full_graph.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
|
||||
- label: Vision Language Models Test # 42min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pytest -v -s models -m vlm
|
||||
|
||||
- label: Prefix Caching Test # 7min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
@@ -284,6 +268,45 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 3min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
||||
|
||||
- label: Decoder-only Language Models Test # 1h3min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language
|
||||
|
||||
- label: Decoder-only Multi-Modal Models Test # 56min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/audio_language
|
||||
- pytest -v -s models/decoder_only/vision_language
|
||||
|
||||
- label: Other Models Test # 5min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/embedding/language
|
||||
- pytest -v -s models/encoder_decoder/language
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@@ -309,11 +332,11 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 28min
|
||||
#mirror_hardwares: [amd]
|
||||
@@ -326,11 +349,10 @@ steps:
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
|
||||
- pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
|
||||
.github/ISSUE_TEMPLATE/400-bug report.yml (vendored, 9 changes)

@@ -30,6 +30,15 @@ body:
      </details>
  validations:
    required: true
- type: textarea
  attributes:
    label: Model Input Dumps
    description: |
      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
    placeholder: |
      Upload the dumped input file.
  validations:
    required: false
- type: textarea
  attributes:
    label: 🐛 Describe the bug
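The dump described in the new "Model Input Dumps" field is a plain Python pickle. A minimal, hypothetical sketch of inspecting such a file locally before zipping and attaching it to a report (the path is illustrative; use the one printed in the error message):

```python
import pickle
import zipfile

# Illustrative path; vLLM prints the real one in
# "Error in model execution (input dumped to /tmp/err_xxx.pkl)".
dump_path = "/tmp/err_example.pkl"

with open(dump_path, "rb") as f:
    dumped = pickle.load(f)  # typically a dict of the model's forward() arguments

print(type(dumped))
if isinstance(dumped, dict):
    print(list(dumped.keys()))

# GitHub does not accept .pkl attachments, so wrap the file in a zip first.
with zipfile.ZipFile(dump_path + ".zip", "w") as zf:
    zf.write(dump_path)
```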
csrc/ops.h (19 changes)

@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

void gelu_quick(torch::Tensor& out, torch::Tensor& input);

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
                            int64_t block_size, torch::Tensor& input_tokens,
                            torch::Tensor& sampled_token_ids,
                            torch::Tensor& input_positions,
                            torch::Tensor& seq_lens,
                            torch::Tensor& slot_mapping,
                            torch::Tensor& block_tables);

void advance_step_flashinfer(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
@@ -12,13 +12,11 @@ namespace prepare_inputs {
|
||||
|
||||
//
|
||||
template <int const num_threads>
|
||||
__global__ void advance_step_kernel(int num_seqs, int num_queries,
|
||||
int block_size, long* input_tokens_ptr,
|
||||
long const* sampled_token_ids_ptr,
|
||||
long* input_positions_ptr,
|
||||
int* seq_lens_ptr, long* slot_mapping_ptr,
|
||||
int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride) {
|
||||
__global__ void advance_step_flashattn_kernel(
|
||||
int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
|
||||
long const* sampled_token_ids_ptr, long* input_positions_ptr,
|
||||
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride) {
|
||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||
|
||||
if (blockIdx.x >= num_query_blocks) {
|
||||
@@ -79,16 +77,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
|
||||
}
|
||||
}
|
||||
|
||||
void advance_step(int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables) { // type: int
|
||||
__global__ void advance_step_flashinfer_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int block_size,
|
||||
long* input_tokens_ptr, long const* sampled_token_ids_ptr,
|
||||
long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
|
||||
int const* block_tables_ptr, int64_t const block_tables_stride,
|
||||
int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
|
||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||
|
||||
if (blockIdx.x < num_query_blocks) {
|
||||
int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
|
||||
|
||||
if (cur_query_id < num_queries) {
|
||||
// Update input_tokens
|
||||
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
|
||||
|
||||
int seq_len = seq_lens_ptr[cur_query_id];
|
||||
int next_seq_len = seq_len + 1;
|
||||
int next_input_pos = next_seq_len - 1;
|
||||
|
||||
// Update seq_lens
|
||||
seq_lens_ptr[cur_query_id] = next_seq_len;
|
||||
// Update input_positions
|
||||
input_positions_ptr[cur_query_id] = next_input_pos;
|
||||
|
||||
int const* seq_block_tables_ptr =
|
||||
block_tables_ptr + block_tables_stride * cur_query_id;
|
||||
|
||||
int block_index = next_input_pos / block_size;
|
||||
int block_offset = next_input_pos % block_size;
|
||||
|
||||
// Update paged_kv_last_page_len
|
||||
paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
|
||||
|
||||
int slot_num =
|
||||
seq_block_tables_ptr[block_index] * block_size + block_offset;
|
||||
// Update slot_mapping
|
||||
slot_mapping_ptr[cur_query_id] = slot_num;
|
||||
block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void advance_step_flashinfer_indptr_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
|
||||
int* block_table_bound_ptr) {
|
||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
||||
|
||||
// Update paged_kv_indptr
|
||||
if (idx < num_queries) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i <= idx; ++i) {
|
||||
sum += block_table_bound_ptr[i];
|
||||
}
|
||||
paged_kv_indptr_ptr[idx + 1] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void advance_step_flashinfer_indices_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride, int* paged_kv_indices_ptr,
|
||||
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
|
||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
||||
int row = idx / block_tables_stride;
|
||||
int col = idx % block_tables_stride;
|
||||
|
||||
if (row < num_queries && col < block_table_bound_ptr[row]) {
|
||||
paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
|
||||
block_tables_ptr[row * block_tables_stride + col];
|
||||
}
|
||||
// if cudagraph, fill padded seqs with the last valid seq's indptr
|
||||
if (num_queries < row && row <= num_seqs) {
|
||||
paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
|
||||
}
|
||||
}
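The two `advance_step_flashinfer_*` kernels above build the CSR-style `paged_kv_indptr` / `paged_kv_indices` arrays that FlashInfer's paged KV cache consumes. A host-side NumPy sketch of the same computation, for illustration only (input values are made up; names mirror the kernel arguments):

```python
import numpy as np

# Illustrative inputs: 3 queries, block tables padded to stride 4.
block_tables = np.array([[7, 2, 0, 0],
                         [5, 9, 3, 0],
                         [1, 0, 0, 0]], dtype=np.int32)
next_seq_lens = np.array([33, 45, 10], dtype=np.int32)
block_size = 16

# block_table_bound[i] = ceil(next_seq_len / block_size), as written per query
# by advance_step_flashinfer_kernel.
block_table_bound = -(-next_seq_lens // block_size)

# paged_kv_indptr is the prefix sum over block_table_bound
# (what advance_step_flashinfer_indptr_kernel computes).
paged_kv_indptr = np.zeros(len(next_seq_lens) + 1, dtype=np.int32)
paged_kv_indptr[1:] = np.cumsum(block_table_bound)

# paged_kv_indices packs each query's valid block ids back to back
# (what advance_step_flashinfer_indices_kernel writes).
paged_kv_indices = np.concatenate(
    [row[:bound] for row, bound in zip(block_tables, block_table_bound)])

print(block_table_bound)   # [3 3 1]
print(paged_kv_indptr)     # [0 3 6 7]
print(paged_kv_indices)    # [7 2 0 5 9 3 1]
```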
|
||||
|
||||
void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables) { // type: int
|
||||
|
||||
if (logging) {
|
||||
printf("advance_step:\n");
|
||||
printf("advance_step_flashattn:\n");
|
||||
printf(" num_seqs = %d\n", num_seqs);
|
||||
printf(" num_queries = %d\n", num_queries);
|
||||
printf(" block_size = %d\n", block_size);
|
||||
@@ -108,24 +181,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
|
||||
int blocks;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
|
||||
advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
|
||||
num_seqs, num_queries, block_size,
|
||||
advance_step_flashattn_kernel<max_threads>
|
||||
<<<blocks, max_threads, 0, stream>>>(
|
||||
num_seqs, num_queries, block_size,
|
||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||
reinterpret_cast<long*>(input_positions.data_ptr()),
|
||||
reinterpret_cast<int*>(seq_lens.data_ptr()),
|
||||
reinterpret_cast<long*>(slot_mapping.data_ptr()),
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0));
|
||||
}
|
||||
|
||||
void advance_step_flashinfer(
|
||||
int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables, // type: int
|
||||
torch::Tensor& paged_kv_indices, // type: int
|
||||
torch::Tensor& paged_kv_indptr, // type: int
|
||||
torch::Tensor& paged_kv_last_page_len, // type: int
|
||||
torch::Tensor& block_table_bound) { // type: int
|
||||
|
||||
if (logging) {
|
||||
printf("advance_step_flashinfer:\n");
|
||||
printf(" num_seqs = %d\n", num_seqs);
|
||||
printf(" num_queries = %d\n", num_queries);
|
||||
printf(" block_size = %d\n", block_size);
|
||||
printf(" block_tables.stride(0) = %d\n", block_tables.stride(0));
|
||||
}
|
||||
// Verify all tensors
|
||||
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
||||
// verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
|
||||
// at::kLong);
|
||||
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
|
||||
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
|
||||
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
|
||||
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
|
||||
|
||||
verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
|
||||
verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
|
||||
verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
|
||||
at::kInt);
|
||||
|
||||
verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
|
||||
|
||||
int dev = sampled_token_ids.get_device();
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
|
||||
int blocks;
|
||||
int threads;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
|
||||
if (logging) {
|
||||
printf("launching kernel with %d blocks\n", blocks);
|
||||
}
|
||||
|
||||
// TODO(will): support arbitrary block_tables stride
|
||||
if ((blocks * threads) / block_tables.stride(0) < num_queries) {
|
||||
TORCH_CHECK(false,
|
||||
"multi-step: not enough threads to map block_table to"
|
||||
"FlashInfer's paged_kv_indices on GPU. Try reducing the number "
|
||||
"of seqs,",
|
||||
" increasing the block size or take smaller steps.",
|
||||
" num_queries = ", num_queries,
|
||||
" block_tables.stride(0) = ", block_tables.stride(0),
|
||||
" blocks = ", blocks, " max_threads = ", threads);
|
||||
}
|
||||
|
||||
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries, block_size,
|
||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||
reinterpret_cast<long*>(input_positions.data_ptr()),
|
||||
reinterpret_cast<int*>(seq_lens.data_ptr()),
|
||||
reinterpret_cast<long*>(slot_mapping.data_ptr()),
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0));
|
||||
block_tables.stride(0),
|
||||
reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
|
||||
advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries,
|
||||
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
|
||||
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries,
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0),
|
||||
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
|
||||
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
}
|
||||
|
||||
} // namespace prepare_inputs
|
||||
|
||||
void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
|
||||
prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
|
||||
sampled_token_ids, input_positions, seq_lens,
|
||||
slot_mapping, block_tables);
|
||||
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
|
||||
int64_t block_size, torch::Tensor& input_tokens,
|
||||
torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions,
|
||||
torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping,
|
||||
torch::Tensor& block_tables) {
|
||||
prepare_inputs::advance_step_flashattn(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables);
|
||||
}
|
||||
|
||||
void advance_step_flashinfer(
|
||||
int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
|
||||
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
|
||||
prepare_inputs::advance_step_flashinfer(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
|
||||
paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
|
||||
}
|
||||
@@ -74,11 +74,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // prepare_inputs advance_step
  ops.def(
      "advance_step(int num_seqs, int num_queries, int block_size, "
      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
      "Tensor! input_tokens, Tensor sampled_token_ids, "
      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
      "Tensor block_tables) -> ()");
  ops.impl("advance_step", torch::kCUDA, &advance_step);
  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);

  ops.def(
      "advance_step_flashinfer("
      " int num_seqs, int num_queries, int block_size,"
      " Tensor! input_tokens, Tensor sampled_token_ids,"
      " Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
      " Tensor block_tables, Tensor! paged_kv_indices,"
      " Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
      " Tensor! block_table_bounds"
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
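For orientation, a rough sketch of driving the renamed `advance_step_flashattn` registration above from Python. The `torch.ops._C` namespace, the tensor shapes, and the direct call are assumptions based on how vLLM typically exposes its custom ops (normally this goes through `vllm._custom_ops` rather than being called by hand):

```python
import torch

num_seqs, num_queries, block_size, max_blocks = 4, 4, 16, 8

# Dtypes follow the schema above: int64 ("long") token/position/slot tensors,
# int32 seq_lens and block_tables. All tensors must live on the GPU.
input_tokens = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
sampled_token_ids = torch.zeros(num_queries, 1, dtype=torch.long, device="cuda")
input_positions = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
seq_lens = torch.ones(num_seqs, dtype=torch.int, device="cuda")
slot_mapping = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
block_tables = torch.zeros(num_seqs, max_blocks, dtype=torch.int, device="cuda")

# Assumed entry point once the compiled extension is loaded; the op mutates the
# "Tensor!" arguments in place to prepare the next multi-step iteration.
torch.ops._C.advance_step_flashattn(
    num_seqs, num_queries, block_size,
    input_tokens, sampled_token_ids, input_positions,
    seq_lens, slot_mapping, block_tables)
```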
@@ -59,6 +59,20 @@ Build from source
      $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
      $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

- Third, build and install oneDNN library from source:

  .. code-block:: console

      $ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
      $ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
          -DONEDNN_BUILD_DOC=OFF \
          -DONEDNN_BUILD_EXAMPLES=OFF \
          -DONEDNN_BUILD_TESTS=OFF \
          -DONEDNN_BUILD_GRAPH=OFF \
          -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
          -DONEDNN_ENABLE_PRIMITIVE=MATMUL
      $ cmake --build ./oneDNN/build --target install --config Release

- Finally, build and install vLLM CPU backend:

  .. code-block:: console
@@ -26,6 +26,10 @@ You can install vLLM using pip:
    $ # Install vLLM with CUDA 12.1.
    $ pip install vllm

.. note::

    Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.

.. note::

    As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.

@@ -34,7 +38,7 @@ You can install vLLM using pip:
    .. code-block:: console

        $ # Install vLLM with CUDA 11.8.
        $ export VLLM_VERSION=0.4.0
        $ export VLLM_VERSION=0.6.1.post1
        $ export PYTHON_VERSION=310
        $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118

@@ -48,7 +52,7 @@ You can install vLLM using pip:

    .. code-block:: console

        $ export VLLM_VERSION=0.5.4 # vLLM's main branch version is currently set to latest released tag
        $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag
        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
        $ # You can also access a specific commit
        $ # export VLLM_COMMIT=...

@@ -80,11 +84,11 @@ You can also build and install vLLM from source:

.. tip::

    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.

.. tip::
    To avoid your system being overloaded, you can limit the number of compilation jobs
    to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
    to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:

    .. code-block:: console

@@ -99,7 +103,7 @@ You can also build and install vLLM from source:
    $ # Use `--ipc=host` to make sure the shared memory is large enough.
    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3

If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:

    .. code-block:: console
@@ -254,7 +254,7 @@ Multimodal Language Models
      -
    * - :code:`QWenLMHeadModel`
      - Qwen-VL
      - Image\ :sup:`E`
      - Image\ :sup:`E+`
      - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
      -
    * - :code:`Qwen2VLForConditionalGeneration`

@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore

We have the following levels of testing for models:

1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
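As a rough illustration of the "Strict Consistency" level described above, a minimal greedy-decoding comparison between HuggingFace Transformers and vLLM might look like the sketch below. The model, prompt, and token budget are placeholders; the real tests live under `tests/models` and use the shared `check_outputs_equal` helpers:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "facebook/opt-125m"  # placeholder model
prompt = "The capital of France is"
max_tokens = 16

# HuggingFace reference output under greedy decoding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
hf_ids = hf_model.generate(**tokenizer(prompt, return_tensors="pt"),
                           do_sample=False, max_new_tokens=max_tokens)
hf_text = tokenizer.decode(hf_ids[0], skip_special_tokens=True)

# vLLM output with temperature=0 (greedy).
llm = LLM(model=model_name)
vllm_out = llm.generate([prompt], SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens))
vllm_text = prompt + vllm_out[0].outputs[0].text

assert hf_text == vllm_text, (hf_text, vllm_text)
```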
@@ -11,7 +11,7 @@ from vllm.sampling_params import SamplingParams
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
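The client half of that comment is cut off in this view. A hypothetical request against the server command shown above, using the OpenAI-compatible API (base URL, API key, and image URL are placeholders; vLLM's server listens on port 8000 by default):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="mistralai/Pixtral-12B-2409",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in one sentence."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/some-image.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```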
@@ -45,6 +45,7 @@ def run_simple_demo():
    model_name = "mistralai/Pixtral-12B-2409"
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
    llm = LLM(model=model_name, tokenizer_mode="mistral")

    prompt = "Describe this image in one sentence."

@@ -83,7 +84,7 @@ def run_advanced_demo():
        model=model_name,
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
        max_model_len=max_img_per_msg * max_tokens_per_img,
    )

    prompt = "Describe the following image."
@@ -19,7 +19,39 @@ IMAGE_URLS = [
|
||||
]
|
||||
|
||||
|
||||
def load_phi3v(question, image_urls: List[str]):
|
||||
def load_qwenvl_chat(question: str, image_urls: List[str]):
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
placeholders = "".join(f"Picture {i}: <img></img>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
|
||||
# This model does not have a chat_template attribute on its tokenizer,
|
||||
# so we need to explicitly pass it. We use ChatML since it's used in the
|
||||
# generation utils of the model:
|
||||
# https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
|
||||
# Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
|
||||
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501
|
||||
|
||||
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
chat_template=chat_template)
|
||||
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
return llm, prompt, stop_token_ids, None, chat_template
|
||||
|
||||
|
||||
def load_phi3v(question: str, image_urls: List[str]):
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
@@ -30,10 +62,10 @@ def load_phi3v(question, image_urls: List[str]):
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids, None
|
||||
return llm, prompt, stop_token_ids, None, None
|
||||
|
||||
|
||||
def load_internvl(question, image_urls: List[str]):
|
||||
def load_internvl(question: str, image_urls: List[str]):
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
@@ -61,7 +93,7 @@ def load_internvl(question, image_urls: List[str]):
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
return llm, prompt, stop_token_ids, None
|
||||
return llm, prompt, stop_token_ids, None, None
|
||||
|
||||
|
||||
def load_qwen2_vl(question, image_urls: List[str]):
|
||||
@@ -111,18 +143,19 @@ def load_qwen2_vl(question, image_urls: List[str]):
|
||||
else:
|
||||
image_data, _ = process_vision_info(messages)
|
||||
|
||||
return llm, prompt, stop_token_ids, image_data
|
||||
return llm, prompt, stop_token_ids, image_data, None
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"phi3_v": load_phi3v,
|
||||
"internvl_chat": load_internvl,
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
"qwen_vl_chat": load_qwenvl_chat,
|
||||
}
|
||||
|
||||
|
||||
def run_generate(model, question: str, image_urls: List[str]):
|
||||
llm, prompt, stop_token_ids, image_data = model_example_map[model](
|
||||
llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
|
||||
question, image_urls)
|
||||
if image_data is None:
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
@@ -146,29 +179,32 @@ def run_generate(model, question: str, image_urls: List[str]):
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: List[str]):
|
||||
llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
|
||||
llm, _, stop_token_ids, _, chat_template = model_example_map[model](
|
||||
question, image_urls)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=stop_token_ids)
|
||||
|
||||
outputs = llm.chat([{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
*({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
outputs = llm.chat(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
} for image_url in image_urls),
|
||||
],
|
||||
}],
|
||||
sampling_params=sampling_params)
|
||||
*({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
} for image_url in image_urls),
|
||||
],
|
||||
}],
|
||||
sampling_params=sampling_params,
|
||||
chat_template=chat_template,
|
||||
)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
|
||||
@@ -16,7 +16,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

llm.start_profile()
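The hunk above is cut off at `start_profile()`. For context, a minimal sketch of the full profiling pattern this example follows; the `stop_profile()` call and the `VLLM_TORCH_PROFILER_DIR` environment variable are assumptions not shown in this view:

```python
from vllm import LLM, SamplingParams

# Assumes VLLM_TORCH_PROFILER_DIR is set so the profiler has somewhere to write traces.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm.start_profile()
outputs = llm.generate(["Hello, my name is"], sampling_params)
llm.stop_profile()  # assumed counterpart to start_profile()

for output in outputs:
    print(output.outputs[0].text)
```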
@@ -76,7 +76,7 @@ exclude = [

[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile"
skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"

[tool.isort]
use_parentheses = true
@@ -85,5 +85,6 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
    "skip_global_cleanup",
    "vlm: run tests for vision language models only",
    "core_model: run this model test in each PR instead of just daily",
    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
]

@@ -7,11 +7,12 @@ py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
fastapi < 0.113.0; python_version < '3.9'
fastapi >= 0.114.1; python_version >= '3.9'
aiohttp
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
pydantic >= 2.8 # Required for OpenAI server.
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
||||
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
||||
worker_use_ray: bool):
|
||||
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
||||
str(tokenizer_pool_size)
|
||||
]
|
||||
|
||||
# Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
|
||||
# to prevent `--engine-use-ray` raises an exception due to it deprecation
|
||||
env_vars = os.environ.copy()
|
||||
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
||||
|
||||
if engine_use_ray:
|
||||
commands.append("--engine-use-ray")
|
||||
if worker_use_ray:
|
||||
commands.append("--worker-use-ray")
|
||||
uvicorn_process = subprocess.Popen(commands, env=env_vars)
|
||||
uvicorn_process = subprocess.Popen(commands)
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
@pytest.mark.parametrize("worker_use_ray", [False, True])
|
||||
@pytest.mark.parametrize("engine_use_ray", [False, True])
|
||||
def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
|
||||
engine_use_ray: bool):
|
||||
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||
worker_use_ray: bool):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
from asyncio import CancelledError
|
||||
from copy import copy
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
@@ -12,6 +14,7 @@ from vllm import SamplingParams
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput as RealRequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
|
||||
from ..conftest import cleanup
|
||||
from ..utils import wait_for_gpu_memory_to_clear
|
||||
@@ -72,14 +75,12 @@ class MockEngine:
|
||||
|
||||
|
||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||
|
||||
def _init_engine(self, *args, **kwargs):
|
||||
return MockEngine()
|
||||
_engine_class = MockEngine
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_new_requests_event():
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=False)
|
||||
engine.start_background_loop()
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.step_calls == 0
|
||||
@@ -112,16 +113,11 @@ async def test_new_requests_event():
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
# Allow deprecated engine_use_ray to not raise exception
|
||||
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
||||
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True)
|
||||
assert engine.get_model_config() is not None
|
||||
assert engine.get_tokenizer() is not None
|
||||
assert engine.get_decoding_config() is not None
|
||||
|
||||
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
|
||||
|
||||
|
||||
def start_engine():
|
||||
wait_for_gpu_memory_to_clear(
|
||||
@@ -130,8 +126,17 @@ def start_engine():
|
||||
timeout_s=60,
|
||||
)
|
||||
|
||||
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
|
||||
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
|
||||
|
||||
return AsyncLLMEngine.from_engine_args(
|
||||
AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
|
||||
AsyncEngineArgs(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
num_scheduler_steps=num_scheduler_steps))
|
||||
|
||||
|
||||
def uid() -> str:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="module")
|
||||
@@ -154,59 +159,195 @@ def should_do_global_cleanup_after_test(request) -> bool:
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_asyncio_run(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_asyncio_run(async_engine, stop):
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
async def run(prompt: str):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
sampling_params,
|
||||
request_id=prompt):
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
return final_output
|
||||
return final_output, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("test0"),
|
||||
run("test1"),
|
||||
run("test0"),
|
||||
)
|
||||
assert len(results) == 2
|
||||
first, second = results
|
||||
|
||||
# remove nondeterministic fields for comparison
|
||||
first[0].metrics = None
|
||||
second[0].metrics = None
|
||||
first[0].request_id = None
|
||||
second[0].request_id = None
|
||||
|
||||
assert str(first) == str(second)
|
||||
|
||||
output_count = results[0][1]
|
||||
if num_scheduler_steps == 1:
|
||||
assert output_count == 32
|
||||
else:
|
||||
assert 1 < output_count < 32
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_cancellation(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_output_kinds(async_engine, stop):
|
||||
"""Test that output_kind works as expected and that
|
||||
results are equivalent across different kinds."""
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
async def run(prompt: str, kind: RequestOutputKind):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = kind
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return (final_output.prompt_token_ids,
|
||||
final_output.outputs[0].token_ids,
|
||||
final_output.outputs[0].text, output_count)
|
||||
|
||||
async def run_deltas(prompt: str):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = RequestOutputKind.DELTA
|
||||
|
||||
prompt_tokens = None
|
||||
output_tokens: List[int] = []
|
||||
output_text = ""
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
token_ids = output.outputs[0].token_ids
|
||||
text = output.outputs[0].text
|
||||
final_output = output
|
||||
|
||||
# Ensure we get prompt ids iff we haven't yet received output tokens
|
||||
if output_tokens:
|
||||
assert 1 <= len(token_ids) <= num_scheduler_steps
|
||||
assert stop or text
|
||||
assert not output.prompt_token_ids
|
||||
else:
|
||||
assert output.prompt_token_ids
|
||||
prompt_tokens = output.prompt_token_ids
|
||||
|
||||
output_tokens.extend(token_ids)
|
||||
output_text += text
|
||||
|
||||
output_count += 1
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return prompt_tokens, output_tokens, output_text, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("common input prompt", RequestOutputKind.CUMULATIVE),
|
||||
run("common input prompt", RequestOutputKind.FINAL_ONLY),
|
||||
run_deltas("common input prompt"))
|
||||
|
||||
# Make sure outputs are the same
|
||||
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
|
||||
assert len(prompt_set) == 1
|
||||
|
||||
text_set = set(text for _, _, text, _ in results)
|
||||
assert len(text_set) == 1
|
||||
|
||||
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
|
||||
assert len(tokens_set) == 1
|
||||
|
||||
cumulative, final, deltas = results
|
||||
|
||||
# output message counts
|
||||
assert cumulative[3] == deltas[3]
|
||||
|
||||
if num_scheduler_steps == 1:
|
||||
assert cumulative[3] == 32
|
||||
else:
|
||||
assert 1 < cumulative[3] < 32
|
||||
|
||||
assert final[3] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_cancellation(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=13,
|
||||
max_tokens=13,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stop_at = 5 if num_scheduler_steps == 1 else 1
|
||||
|
||||
request_id = uid()
|
||||
|
||||
i = 0
|
||||
with pytest.raises(CancelledError):
|
||||
async for output in async_engine.generate("test2",
|
||||
sampling_params,
|
||||
request_id="test2"):
|
||||
request_id=request_id):
|
||||
assert not output.finished
|
||||
i += 1
|
||||
if i == 5:
|
||||
await async_engine.abort("test2")
|
||||
if i == stop_at:
|
||||
await async_engine.abort(request_id)
|
||||
|
||||
assert i == 5
|
||||
assert i == stop_at
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_delayed_generator(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_delayed_generator(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
|
||||
if scheduler_config.num_scheduler_steps != 1:
|
||||
pytest.skip("no need to test this one with multistep")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stream = async_engine.generate("test3",
|
||||
sampling_params,
|
||||
request_id="test3")
|
||||
stream = async_engine.generate("test3", sampling_params, request_id=uid())
|
||||
i = 0
|
||||
final_output: Optional[RealRequestOutput] = None
|
||||
async for output in stream:
|
||||
|
||||
@@ -19,16 +19,11 @@ def server():
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--engine-use-ray",
        "--chat-template",
        str(chatml_jinja_path),
    ]

    # Allow `--engine-use-ray`, otherwise the launch of the server throw
    # an error due to try to use a deprecated feature
    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
    with RemoteOpenAIServer(MODEL_NAME, args,
                            env_dict=env_dict) as remote_server:
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@@ -3,20 +3,27 @@
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import weakref
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.utils import is_hip
|
||||
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
]
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
@@ -64,3 +71,88 @@ def test_models(
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
attention_backend: str,
|
||||
test_suite: str,
|
||||
) -> None:
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def test_model_with_failure(vllm_runner) -> None:
|
||||
try:
|
||||
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
|
||||
side_effect=ValueError()):
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
vllm_runner("facebook/opt-125m",
|
||||
dtype="half",
|
||||
enforce_eager=False,
|
||||
gpu_memory_utilization=0.7)
|
||||
matches = re.search(r"input dumped to (.+).pkl",
|
||||
str(exc_info.value))
|
||||
assert matches is not None
|
||||
filename = f"{matches.group(1)}.pkl"
|
||||
|
||||
with open(filename, "rb") as filep:
|
||||
inputs = pickle.load(filep)
|
||||
|
||||
if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
|
||||
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
|
||||
f"{list(inputs.keys())}")
|
||||
assert isinstance(inputs["arg_1"],
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
finally:
|
||||
os.remove(filename)
|
||||
|
||||
@@ -6,11 +6,13 @@ prefill requests are chunked.
|
||||
|
||||
Run `pytest tests/models/test_chunked_prefill.py`.
|
||||
"""
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
|
||||
import pytest
|
||||
|
||||
from ..models.utils import check_logprobs_close, check_outputs_equal
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@@ -66,6 +68,59 @@ def test_models(
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if (model == "meta-llama/Llama-2-7b-hf"
|
||||
and distributed_executor_backend == "ray"):
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,model",
|
||||
[("fp8_e4m3",
|
||||
|
||||
@@ -19,10 +19,13 @@ MODELS = [
"facebook/opt-125m",
]

assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")

@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")


@pytest.fixture
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt

@@ -16,5 +16,7 @@ def test_full_graph(model):
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B")
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
enforce_eager=True,
load_format="dummy")
llm.generate(prompts, sampling_params)

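Aside: the rewritten call above relies on vLLM's `load_format="dummy"`, which fills the model with randomly initialized weights instead of downloading a checkpoint, so the compile smoke test only exercises the execution path. A minimal sketch of that mode (prompt and sampling settings are illustrative):

```python
from vllm import LLM, SamplingParams

# Sketch: "dummy" load format initializes random weights in place of the real
# checkpoint, so this runs without fetching the Llama-3 files; enforce_eager
# skips CUDA graph capture to keep the smoke test fast.
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
          enforce_eager=True,
          load_format="dummy")
llm.generate(["The future of AI is"], SamplingParams(temperature=0))
```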
@@ -6,8 +6,8 @@ import sys
import tempfile
from collections import UserList
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
TypeVar, Union)
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
TypedDict, TypeVar, Union)

import numpy as np
import pytest
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ class HfRunner:
*,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
auto_cls=AutoModelForCausalLM,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
) -> None:
@@ -292,20 +293,14 @@ class HfRunner:
trust_remote_code=True,
)

try:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
)
except Exception as exc:
logger.warning(
"Unable to auto-load HuggingFace processor for model (%s). "
"Using tokenizer instead. Reason: %s", model_name, exc)
self.processor = self.tokenizer
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
)

self.postprocess_inputs = postprocess_inputs

@@ -658,8 +653,8 @@ class VllmRunner:
outputs.append((req_sample_output_ids, req_sample_output_strs))
return outputs

@staticmethod
def _final_steps_generate_w_logprobs(
self,
req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []

@@ -1,80 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness.py
|
||||
```
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
attention_backend: str,
|
||||
test_suite: str,
|
||||
) -> None:
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,102 +0,0 @@
|
||||
"""For encoder/decoder models only:
|
||||
Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness_enc_dec.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/bart-large-cnn", "ray"),
|
||||
("facebook/bart-large-cnn", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
) -> None:
|
||||
'''
|
||||
Test vLLM BART inference on more than one GPU, comparing
|
||||
outputs against HF as a baseline.
|
||||
|
||||
Fork a new process for each test, to prevent CUDA from
|
||||
being re-initialized by successive tests within the same
|
||||
process.
|
||||
|
||||
Arguments:
|
||||
|
||||
* model: the HF ID of the specific BART variant under test
|
||||
* distributed_executor_backend
|
||||
* hf_runner: HuggingFace (HF) test model runner
|
||||
* vllm_runner: vLLM test model runner
|
||||
* example_encoder_decoder_prompts: test fixture which provides a
|
||||
dictionary of dummy prompts
|
||||
'''
|
||||
|
||||
dtype = "float"
|
||||
max_tokens = 64
|
||||
num_logprobs = 5
|
||||
|
||||
# Example inputs with non-trivial (i.e. not None/empty) encoder &
|
||||
# decoder prompts.
|
||||
test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,75 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/opt-125m", "ray"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray"),
|
||||
("facebook/opt-125m", "mp"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
|
||||
assert distributed_executor_backend == "ray"
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,58 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest -s -v test_multimodal_broadcast.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("llava-hf/llava-1.5-7b-hf", "ray"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
|
||||
("facebook/chameleon-7b", "ray"),
|
||||
("llava-hf/llava-1.5-7b-hf", "mp"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
|
||||
("facebook/chameleon-7b", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model: str,
|
||||
distributed_executor_backend: str) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from ..models.test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from ..models.test_llava_next import run_test # type: ignore[no-redef]
|
||||
from ..models.test_llava_next import models
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from ..models.test_chameleon import run_test # type: ignore[no-redef]
|
||||
from ..models.test_chameleon import models
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
# NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
],
)
@fork_new_process_for_each_test

@@ -1,13 +1,13 @@
import os

import torch
import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

torch.distributed.init_process_group(backend="gloo")
test_result = all(
in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
if __name__ == "__main__":
dist.init_process_group(backend="gloo")
test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")

@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError) as err:

with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params)
assert "prompts must be None if" in str(err.value)

outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params)
assert len(outputs) > 0

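Aside on the `pytest.raises(..., match=...)` form adopted above: `match` is applied with `re.search` against the string form of the raised exception, so the separate substring assertion can be dropped. A small self-contained sketch (the helper name is made up):

```python
import pytest


def reject_text_prompt(prompt: str) -> None:
    # Stand-in for a generate() call that refuses text prompts when the
    # tokenizer was skipped at init time.
    raise ValueError("cannot pass text prompts when tokenizer is unavailable")


def test_match_checks_the_message() -> None:
    # match= is re.search'd against str(exception), replacing a manual assert.
    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        reject_text_prompt("abc")
```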
0  tests/entrypoints/offline_mode/__init__.py  Normal file
77  tests/entrypoints/offline_mode/test_offline_mode.py  Normal file
@@ -0,0 +1,77 @@
"""Tests for HF_HUB_OFFLINE mode"""
import importlib
import sys
import weakref

import pytest

from vllm import LLM

from ...conftest import cleanup

MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)

del llm

cleanup()


@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
# we use the llm fixture to ensure the model files are in-cache
del llm

# Set HF to offline mode and ensure we can still construct an LLM
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
_re_import_modules()
pass


def _re_import_modules():
hf_hub_module_names = [
k for k in sys.modules if k.startswith("huggingface_hub")
]
transformers_module_names = [
k for k in sys.modules if k.startswith("transformers")
and not k.startswith("transformers_modules")
]

reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
# Try to continue clean up so that other tests are less likely to
# be affected

# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
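The reload step in `_re_import_modules` exists because `HF_HUB_OFFLINE` is read into module-level constants at import time, so flipping the environment variable alone does not affect modules that are already imported. A rough standalone sketch of the same idea (assumes `huggingface_hub` is installed; reload order is not guaranteed to be safe for every module, which is why the test above collects and re-raises reload errors instead):

```python
import importlib
import os
import sys

os.environ["HF_HUB_OFFLINE"] = "1"

# Reload every already-imported huggingface_hub module so its constants
# re-read the environment variable.
for name in [m for m in sys.modules if m.startswith("huggingface_hub")]:
    importlib.reload(sys.modules[name])

from huggingface_hub import constants

print(constants.HF_HUB_OFFLINE)  # expected: True after the reload
```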
@@ -10,7 +10,6 @@ import pytest
import torch

from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.attention.backends.xformers import XFormersBackend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
make_tensor_with_pad)

@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
* Backend instance
'''
if backend_name == STR_XFORMERS_ATTN_VAL:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from vllm.attention.backends.xformers import XFormersBackend

return XFormersBackend()
raise AssertionError(
f"Unrecognized backend_name {backend_name} for unit test")

0  tests/models/decoder_only/__init__.py  Normal file
@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ..conftest import HfRunner, VllmRunner
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm
from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"

0  tests/models/decoder_only/language/__init__.py  Normal file
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["h2oai/h2o-danube3-4b-base"]
|
||||
|
||||
@@ -10,7 +10,7 @@ import pytest
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -11,7 +11,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -15,7 +15,7 @@ import pytest
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -10,9 +10,10 @@ from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPair:
|
||||
@@ -6,7 +6,7 @@ import importlib.metadata
|
||||
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
TRANSFORMERS_VERSION = tuple(
|
||||
map(int,
|
||||
@@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_outputs_equal
|
||||
from vllm.worker.model_runner import _get_graph_batch_size
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["ai21labs/Jamba-tiny-random"]
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.1",
|
||||
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalData objects and corresponding
|
||||
MultiModalConfig as input.
|
||||
42  tests/models/decoder_only/vision_language/test_broadcast.py  Normal file
@@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", [
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"facebook/chameleon-7b",
|
||||
])
|
||||
def test_models(hf_runner, vllm_runner, image_assets,
|
||||
distributed_executor_backend, model) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from .test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from .test_llava_next import models, run_test # type: ignore[no-redef]
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from .test_chameleon import models, run_test # type: ignore[no-redef]
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_outputs_equal
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -36,7 +34,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -46,7 +44,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -6,9 +6,7 @@ import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from ..conftest import _ImageAssets, cleanup
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import _ImageAssets, cleanup
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
|
||||
# dynamic_module and trust_remote_code for hf_runner
|
||||
@@ -9,11 +9,9 @@ from transformers import AutoConfig
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -78,7 +76,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -331,6 +329,41 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
|
||||
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
|
||||
|
||||
inputs_batching = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
inputs_multi_images = [
|
||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors])
|
||||
]
|
||||
for inputs in [inputs_batching, inputs_multi_images]:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
|
||||
@pytest.mark.parametrize(
|
||||
@@ -8,11 +8,9 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@@ -143,7 +141,7 @@ def _run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -239,7 +237,7 @@ def _run_test(
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@@ -5,10 +5,8 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -62,7 +60,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_PREFACE = (
|
||||
"A chat between a curious human and an artificial intelligence assistant. "
|
||||
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# The image token is placed before "user" on purpose so that the test can pass
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
@@ -65,7 +63,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -69,7 +67,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu, is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -71,7 +69,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
199  tests/models/decoder_only/vision_language/test_pixtral.py  Normal file
@@ -0,0 +1,199 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import ImageURLChunk
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from ....utils import VLLM_PATH
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
IMG_URLS = [
|
||||
"https://picsum.photos/id/237/400/300",
|
||||
"https://picsum.photos/id/231/200/300",
|
||||
"https://picsum.photos/id/27/500/500",
|
||||
"https://picsum.photos/id/17/150/600",
|
||||
]
|
||||
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}] + [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url
|
||||
}
|
||||
} for url in urls],
|
||||
}]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
|
||||
msg = _create_msg_format(urls)
|
||||
|
||||
tokenizer = MistralTokenizer.from_model("pixtral")
|
||||
|
||||
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
|
||||
tokenized = tokenizer.encode_chat_completion(request)
|
||||
|
||||
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
|
||||
|
||||
images = []
|
||||
for chunk in request.messages[0].content:
|
||||
if isinstance(chunk, ImageURLChunk):
|
||||
images.append(image_from_chunk(chunk))
|
||||
|
||||
mm_data = MultiModalDataBuiltins(image=images)
|
||||
engine_inputs["multi_modal_data"] = mm_data
|
||||
|
||||
return engine_inputs
|
||||
|
||||
|
||||
MSGS = [
|
||||
_create_msg_format(IMG_URLS[:1]),
|
||||
_create_msg_format(IMG_URLS[:2]),
|
||||
_create_msg_format(IMG_URLS),
|
||||
]
|
||||
ENGINE_INPUTS = [
|
||||
_create_engine_inputs(IMG_URLS[:1]),
|
||||
_create_engine_inputs(IMG_URLS[:2]),
|
||||
_create_engine_inputs(IMG_URLS),
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
MAX_MODEL_LEN = [8192, 65536]
|
||||
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
||||
|
||||
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text,
|
||||
[{k: asdict(v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])])
|
||||
for tokens, text, logprobs in outputs]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [(tokens, text,
|
||||
[{int(k): Logprob(**v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs])
|
||||
for tokens, text, logprobs in json_data]
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(
|
||||
vllm_runner,
|
||||
max_model_len: int,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
for msg in MSGS:
|
||||
output = vllm_model.model.chat(msg,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
|
||||
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
|
||||
args = EngineArgs(
|
||||
model=model,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
dtype=dtype,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(args)
|
||||
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
|
||||
|
||||
outputs = []
|
||||
count = 0
|
||||
while True:
|
||||
out = engine.step()
|
||||
count += 1
|
||||
for request_output in out:
|
||||
if request_output.finished:
|
||||
outputs.append(request_output)
|
||||
|
||||
if count == 2:
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
|
||||
SAMPLING_PARAMS)
|
||||
if not engine.has_unfinished_requests():
|
||||
break
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
401  tests/models/decoder_only/vision_language/test_qwen.py  Normal file
@@ -0,0 +1,401 @@
|
||||
import pathlib
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputContext, LLMInputs
|
||||
from vllm.multimodal.base import MultiModalInputs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
||||
VllmRunner, _ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
text_only_models = [
|
||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||
]
|
||||
|
||||
multimodal_models = ["Qwen/Qwen-VL"]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||
"cherry_blossom":
|
||||
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||
})
|
||||
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nCan you compare these images?\n" # noqa: E501
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n" # noqa: E501
|
||||
### Multimodal preprocessing tests
|
||||
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||
# easy to read.
|
||||
IMG_START_ID = 151857
|
||||
IMG_END_ID = 151858
|
||||
IMG_PAD_ID = 151859
|
||||
TOKS_PER_IMG = 256
|
||||
VIS_ENC_DIM = 4096
|
||||
IMG_SIZE = 448
|
||||
|
||||
|
||||
def build_model_context(model_name: str,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
trust_remote_code: bool = False):
|
||||
"""Creates an InputContext for a given model.
|
||||
|
||||
Args:
|
||||
model_name: Name of the model being considered.
|
||||
tokenizer_name: Name of the tokenizer being considered.
|
||||
trust_remote_code: Whether or not to allow loading remote code.
|
||||
|
||||
Returns:
|
||||
InputContext for the model being considered.
|
||||
"""
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = model_name
|
||||
model_config = ModelConfig(
|
||||
model_name,
|
||||
tokenizer_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype="float32",
|
||||
seed=0,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_mapper_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
||||
return input_mapper_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_processor_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
||||
return input_processor_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def qwen_vl_context() -> InputContext:
|
||||
"""Get an InputContext for Qwen-VL."""
|
||||
return build_model_context(model_name="Qwen/Qwen-VL",
|
||||
trust_remote_code=True)
|
||||
|
||||
|
||||
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||
# input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("num_images", [1, 2])
|
||||
def test_input_processor_valid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
num_images: int):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input processor."""
|
||||
prompt = "".join(
|
||||
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
|
||||
inputs = LLMInputs(
|
||||
prompt=prompt,
|
||||
# When processing multimodal data for a multimodal model, the qwen
|
||||
# input processor will overwrite the provided prompt_token_ids with
|
||||
# the image prompts
|
||||
prompt_token_ids=None,
|
||||
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
|
||||
)
|
||||
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
assert isinstance(proc_inputs, dict)
|
||||
|
||||
# Each image should have one start / stop and a fixed context of 256
|
||||
proc_tokens = proc_inputs["prompt_token_ids"]
|
||||
assert proc_tokens.count(IMG_START_ID) == num_images
|
||||
assert proc_tokens.count(IMG_END_ID) == num_images
|
||||
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data,expected_shape",
|
||||
[
|
||||
# single / multi-image
|
||||
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
|
||||
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
|
||||
# single / multi-image embeddings
|
||||
(torch.rand(
|
||||
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
])
|
||||
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image],
|
||||
Image],
|
||||
expected_shape: List[int]):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
|
||||
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
# Ensure that we get the appropriately shaped pixel_values
|
||||
# for images and image embeddings, respectively.
|
||||
assert isinstance(mapped_img_data, MultiModalInputs)
|
||||
assert "pixel_values" in mapped_img_data
|
||||
assert mapped_img_data["pixel_values"].shape == expected_shape
|
||||
|
||||
|
||||
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("mm_data", [
|
||||
{
|
||||
"image": torch.rand((5))
|
||||
},
|
||||
{
|
||||
"image": torch.rand((5, 5, 5, 5, 5))
|
||||
},
|
||||
])
|
||||
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
mm_data: Dict[str, torch.Tensor]):
|
||||
"""Test sad cases validated in Qwen's multimodal input processor."""
|
||||
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
|
||||
trust_remote_code=True)
|
||||
prompt = "Picture 1: <img></img>\n"
|
||||
prompt_token_ids = tokenizer.encode(prompt)
|
||||
inputs = LLMInputs(prompt=prompt,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=mm_data)
|
||||
# Should fail since we have too many or too few dimensions for embeddings
|
||||
with pytest.raises(ValueError):
|
||||
input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data",
|
||||
[
|
||||
# Wrong context length
|
||||
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
|
||||
# Wrong visual encoder output size
|
||||
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
|
||||
])
|
||||
def test_input_mapper_invalid_mm_data(
|
||||
input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image], Image],
|
||||
):
|
||||
"""Sad cases validated in Qwen VL's multimodal input mapper."""
|
||||
with pytest.raises(ValueError):
|
||||
input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
|
||||
|
||||
### End-to-end generation tests
|
||||
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
|
||||
assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
forward() call.
|
||||
|
||||
Args:
|
||||
tmp_path: Tempdir for test under consideration.
|
||||
prompt: Prompt with image placeholders.
|
||||
assets: List of image assets whose len equals the num placeholders.
|
||||
"""
|
||||
# Ensure that the number of placeholders matches the number of assets;
|
||||
# If this is not true, the test is probably written incorrectly.
|
||||
assert prompt.count("<img></img>") == len(assets)
|
||||
|
||||
# Replace the placeholders with local paths to the exported assets
|
||||
for asset in assets:
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
prompt = prompt.replace(
|
||||
"<img></img>",
|
||||
f"<img>{image_tmp_path}</img>",
|
||||
1,
|
||||
)
|
||||
return prompt
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
# Qwen encodes each image into a fixed content size of 256
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                        hf_runner: Type[HfRunner],
                                        vllm_runner: Type[VllmRunner],
                                        image_assets: _ImageAssets, model: str,
                                        size_factors: List[float], dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Tests multimodal models with single image prompts."""
    images = [asset.pil_image for asset in image_assets]

    prompts = [
        get_prompt_with_path(tmp_path, prompt, [asset])
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    inputs = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, prompts)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
                                       hf_runner: Type[HfRunner],
                                       vllm_runner: Type[VllmRunner],
                                       image_assets: _ImageAssets, model: str,
                                       size_factors: List[float], dtype: str,
                                       max_tokens: int,
                                       num_logprobs: int) -> None:
    """Tests multimodal models with multi-image prompts."""
    images = [asset.pil_image for asset in image_assets]
    # Put all of the images into one prompt.
    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
                                  image_assets)
    inputs = [([prompt for _ in size_factors],
               [[rescale_image_size(image, factor) for image in images]
                for factor in size_factors])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )

# Ensure that a text-only Qwen model can still be loaded and
# used for inference in vLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    vllm_runner: Type[VllmRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs=num_logprobs,
        )
tests/models/embedding/__init__.py (new empty file)
tests/models/embedding/language/__init__.py (new empty file)
tests/models/encoder_decoder/__init__.py (new empty file)
tests/models/encoder_decoder/language/__init__.py (new empty file)
@@ -1,8 +1,8 @@
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.

Run `pytest tests/models/test_bart.py`.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Type

from vllm.utils import is_cpu

@@ -16,8 +16,10 @@ if not is_cpu():

    from vllm.sequence import SampleLogprobs

    from ..conftest import DecoderPromptType
    from .utils import check_logprobs_close
    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
                              HfRunner, VllmRunner)
    from ....utils import multi_gpu_test
    from ...utils import check_logprobs_close

    MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]

@@ -34,20 +36,18 @@ if not is_cpu():

        return output_ids, hf_output_str, out_logprobs

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts,
    def run_test(
        hf_runner: Type[HfRunner],
        vllm_runner: Type[VllmRunner],
        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        decoder_prompt_type: DecoderPromptType,
        model: str,
        *,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
        decoder_prompt_type: DecoderPromptType,
        tensor_parallel_size: int,
        distributed_executor_backend: Optional[str] = None,
    ) -> None:
        '''
        Test the vLLM BART model for a variety of encoder/decoder input prompts,
@@ -116,8 +116,29 @@ if not is_cpu():
        token during the process of validating the vLLM decoded output.
        '''

        test_case_prompts = example_encoder_decoder_prompts[
            decoder_prompt_type]
        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with fork method (the default).

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because
        # for encoder/decoder models vLLM will
        # default to enforce_eager=True if enforce_eager
        # is left unspecified. However, the
        # VllmRunner test fixture (which wraps around the LLM class) defaults to
        # enforce_eager=False (a behavior which a number of already-existing
        # decoder-only unit tests expect), so when testing an encoder/decoder
        # model we must explicitly specify enforce_eager=True in the VllmRunner
        # constructor.
        with vllm_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=tensor_parallel_size,
                distributed_executor_backend=distributed_executor_backend,
                enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                prompts, max_tokens, num_logprobs)

        # Configuration settings for HF baseline
        hf_kwargs = {
@@ -135,26 +156,12 @@ if not is_cpu():
                auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    test_case_prompts,
                    prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because
        # for encoder/decoder models vLLM will
        # default to enforce_eager=True if enforce_eager
        # is left unspecified. However, the
        # VllmRunner test fixture (which wraps around the LLM class) defaults to
        # enforce_eager=False (a behavior which a number of already-existing
        # decoder-only unit tests expect), so when testing an encoder/decoder
        # model we must explicitly specify enforce_eager=True in the VllmRunner
        # constructor.
        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                test_case_prompts, max_tokens, num_logprobs)

        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)

@@ -168,3 +175,49 @@ if not is_cpu():
            name_1="vllm",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                    model, dtype, max_tokens, num_logprobs,
                    decoder_prompt_type) -> None:

        run_test(
            hf_runner,
            vllm_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )

    @multi_gpu_test(num_gpus=2)
    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
    @pytest.mark.parametrize("dtype", ["float"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
    def test_models_distributed(hf_runner, vllm_runner,
                                example_encoder_decoder_prompts,
                                distributed_executor_backend, model, dtype,
                                max_tokens, num_logprobs,
                                decoder_prompt_type) -> None:
        run_test(
            hf_runner,
            vllm_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
        )
tests/models/fixtures/pixtral_chat.json (new file; diff suppressed because one or more lines are too long)
tests/models/fixtures/pixtral_chat_engine.json (new file; diff suppressed because one or more lines are too long)
@@ -1,64 +0,0 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
image_urls = [
|
||||
"https://picsum.photos/id/237/200/300",
|
||||
"https://picsum.photos/seed/picsum/200/300"
|
||||
]
|
||||
expected = [
|
||||
"The image depicts a black dog lying on a wooden surface, looking directly at the camera with a calm expression.", # noqa
|
||||
"The image depicts a serene landscape with a snow-covered mountain under a pastel-colored sky during sunset." # noqa
|
||||
]
|
||||
prompt = "Describe the image in one short sentence."
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
|
||||
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
tokenizer_mode="mistral") as vllm_model:
|
||||
|
||||
for i, image_url in enumerate(image_urls):
|
||||
messages = [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}, {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
}]
|
||||
},
|
||||
]
|
||||
|
||||
outputs = vllm_model.model.chat(messages,
|
||||
sampling_params=sampling_params)
|
||||
assert outputs[0].outputs[0].text == expected[i]
|
||||
@@ -1,165 +0,0 @@
|
||||
import pathlib
|
||||
from typing import List, Optional, Type
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
|
||||
text_only_models = [
|
||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||
]
|
||||
|
||||
multimodal_models = ["Qwen/Qwen-VL"]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||
"cherry_blossom":
|
||||
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||
})
|
||||
|
||||
|
||||
### Tests for multimodal Qwen models
|
||||
def run_test(
|
||||
tmp_path: pathlib.PosixPath,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Export the images to a tempdir and substitute it into the hf prompt;
|
||||
# the contents between <img>/</img> will be ignored by VLLM, but the
|
||||
# transformers implementation for the visual transformer parses this to
|
||||
# reload it in the forward call; the contents are treated as a URL or a
|
||||
# local path.
|
||||
for idx, asset in enumerate(image_assets):
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
|
||||
"<img></img>", f"<img>{image_tmp_path}</img>")
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
# Qwen encodes images into a fixed content size of 256
|
||||
with vllm_runner(model,
|
||||
max_model_len=300,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", multimodal_models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
|
||||
model, size_factors, dtype, max_tokens,
|
||||
num_logprobs) -> None:
|
||||
run_test(
|
||||
tmp_path,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors=size_factors,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
# Ensure that a text-only Qwen model can still be loaded and
|
||||
# used for inference in VLLM without throwing.
|
||||
@pytest.mark.parametrize("model", text_only_models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_text_only_qwen_model_can_be_loaded_and_run(
|
||||
vllm_runner: Type[VllmRunner],
|
||||
example_prompts,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
):
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
example_prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
)
|
||||
@@ -1,9 +1,10 @@
|
||||
# Test the AsyncLLMEngine with multi-step-decoding
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ..utils import (completions_with_server_args, get_client_text_generations,
|
||||
get_client_text_logprob_generations)
|
||||
@@ -33,8 +34,9 @@ DEFAULT_SERVER_ARGS: List[str] = [
|
||||
@pytest.mark.parametrize("eager_mode", [False, True])
|
||||
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
|
||||
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
|
||||
@pytest.mark.parametrize("num_logprobs", [None, 5])
|
||||
@pytest.mark.parametrize("is_async", [False, True])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("is_async", [True])
|
||||
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_step(
|
||||
example_prompts,
|
||||
@@ -46,6 +48,8 @@ async def test_multi_step(
|
||||
num_prompts: int,
|
||||
is_async: bool,
|
||||
num_logprobs: Optional[int],
|
||||
attention_backend: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
|
||||
client/server environment.
|
||||
@@ -71,6 +75,8 @@ async def test_multi_step(
|
||||
completions endpoint; `None` -> no logprobs
|
||||
"""
|
||||
|
||||
override_backend_env_variable(monkeypatch, attention_backend)
|
||||
|
||||
prompts = example_prompts
|
||||
if len(prompts) < num_prompts:
|
||||
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
||||
|
||||
@@ -10,6 +10,8 @@ import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
models_4bit_to_test = [
|
||||
('huggyllama/llama-7b', 'quantize model inflight'),
|
||||
]
|
||||
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
|
||||
model_name,
|
||||
hf_model_kwargs=None):
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, 8)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
# NOTE: run vLLM first, as it requires a clean process
|
||||
# when using distributed inference
|
||||
|
||||
#Run with vLLM runner
|
||||
with vllm_runner(model_name,
|
||||
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, 8)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Compare the generated strings
|
||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||
hf_str = hf_log["generated_text"]
|
||||
|
||||
@@ -1,12 +1,10 @@
import torch

from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.platforms import current_platform


def is_quant_method_supported(quant_method: str) -> bool:
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

    capability = current_platform.get_device_capability()

@@ -10,6 +10,7 @@ from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import requests
|
||||
from openai.types.completion import Completion
|
||||
from transformers import AutoTokenizer
|
||||
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.model_executor.model_loader.loader import get_model_loader
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
|
||||
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
|
||||
get_open_port, is_hip)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
from amdsmi import (amdsmi_get_gpu_vram_usage,
|
||||
@@ -356,12 +358,23 @@ def error_on_warning():
    yield


def get_physical_device_indices(devices):
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        return devices

    visible_indices = [int(x) for x in visible_devices.split(",")]
    index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
    return [index_mapping[i] for i in devices if i in index_mapping]


@_nvml()
def wait_for_gpu_memory_to_clear(devices: List[int],
                                 threshold_bytes: int,
                                 timeout_s: float = 120) -> None:
    # Use nvml instead of pytorch to reduce measurement error from torch cuda
    # context.
    devices = get_physical_device_indices(devices)
    start_time = time.time()
    while True:
        output: Dict[int, str] = {}
@@ -441,6 +454,22 @@ def fork_new_process_for_each_test(
    return wrapper


def multi_gpu_test(*, num_gpus: int):
    """
    Decorate a test to be run only when multiple GPUs are available.
    """
    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
    test_skipif = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        return test_selector(test_skipif(fork_new_process_for_each_test(f)))

    return wrapper
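For illustration only (not part of the diff): a minimal sketch of how a test could opt into the new decorator. The test name, prompt, and model choice below are hypothetical; the decorator skips the test when fewer than num_gpus devices are visible and runs it in a freshly forked process.

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
def test_hypothetical_tensor_parallel(vllm_runner, tp_size: int) -> None:
    # Skipped automatically unless at least 2 GPUs are available.
    with vllm_runner("facebook/opt-125m",
                     tensor_parallel_size=tp_size) as vllm_model:
        vllm_model.generate_greedy(["Hello, my name is"], max_tokens=8)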


async def completions_with_server_args(
    prompts: List[str],
    model_name: str,
@@ -161,16 +161,36 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
|
||||
torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
|
||||
|
||||
|
||||
def advance_step(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor, seq_lens: torch.Tensor,
|
||||
slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor) -> None:
|
||||
def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor,
|
||||
seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor) -> None:
|
||||
"""Advance a step on GPU for existing inputs for a multi-step runner"""
|
||||
return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
|
||||
input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping,
|
||||
block_tables)
|
||||
return torch.ops._C.advance_step_flashattn(num_seqs, num_queries,
|
||||
block_size, input_tokens,
|
||||
sampled_token_ids,
|
||||
input_positions, seq_lens,
|
||||
slot_mapping, block_tables)
|
||||
|
||||
|
||||
def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor,
|
||||
seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
paged_kv_indices: torch.Tensor,
|
||||
paged_kv_indptr: torch.Tensor,
|
||||
paged_kv_last_page_len: torch.Tensor,
|
||||
block_table_bound: torch.Tensor) -> None:
|
||||
|
||||
return torch.ops._C.advance_step_flashinfer(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables,
|
||||
paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len,
|
||||
block_table_bound)
|
||||
|
||||
|
||||
# quantization ops
|
||||
|
||||
@@ -83,7 +83,9 @@ class AttentionBackend(ABC):
    ) -> None:
        raise NotImplementedError

    def advance_step(self, num_seqs: int, num_queries: int):
    def advance_step(self, model_input: "ModelRunnerInputBase",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int, num_seqs: int, num_queries: int) -> None:
        raise NotImplementedError

@@ -122,6 +122,40 @@ def _(
    return torch.empty_like(decode_query)


@torch.library.custom_op("vllm::reshape_and_cache_flash",
                         mutates_args=["kv_cache"])
def reshape_and_cache_flash(
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
) -> None:
    """Inductor cannot deal with inplace operations on views.
    See https://github.com/pytorch/pytorch/issues/131192
    and https://github.com/pytorch/pytorch/issues/130174
    This is a workaround to hide the view operation from the inductor.
    """
    return torch.ops._C_cache_ops.reshape_and_cache_flash(
        key, value, kv_cache[0], kv_cache[1], slot_mapping, kv_cache_dtype,
        k_scale, v_scale)


@reshape_and_cache_flash.register_fake  # type: ignore
def _(
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
) -> None:
    pass

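For illustration only (not part of the diff): the same hide-the-mutation pattern in standalone form, assuming PyTorch 2.4+ and a made-up op namespace.

import torch

@torch.library.custom_op("demo::scale_inplace", mutates_args=["x"])
def scale_inplace(x: torch.Tensor, factor: float) -> None:
    # The in-place update happens behind the op boundary, so torch.compile
    # does not trace through any view manipulation done inside.
    x.mul_(factor)

@scale_inplace.register_fake
def _(x: torch.Tensor, factor: float) -> None:
    # Fake (meta) implementation: nothing to compute or allocate.
    pass

t = torch.ones(4)
scale_inplace(t, 2.0)  # t is now tensor([2., 2., 2., 2.])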
class FlashAttentionBackend(AttentionBackend):

    @staticmethod
@@ -346,15 +380,15 @@ class FlashAttentionMetadata(AttentionMetadata):
|
||||
self.seq_lens[i] += 1
|
||||
self.max_decode_seq_len = max(self.seq_lens)
|
||||
|
||||
ops.advance_step(num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables)
|
||||
ops.advance_step_flashattn(num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables)
|
||||
|
||||
|
||||
class FlashAttentionMetadataBuilder(
|
||||
@@ -653,11 +687,10 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
# Reshape the input keys and values and store them in the cache.
|
||||
# If kv_cache is not provided, the new key and value tensors are
|
||||
# not cached. This happens during the initial memory profiling run.
|
||||
ops.reshape_and_cache_flash(
|
||||
torch.ops.vllm.reshape_and_cache_flash(
|
||||
key,
|
||||
value,
|
||||
key_cache,
|
||||
value_cache,
|
||||
kv_cache,
|
||||
attn_metadata.slot_mapping.flatten(),
|
||||
self.kv_cache_dtype,
|
||||
k_scale,
|
||||
@@ -669,7 +702,6 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
assert key.shape[0] == num_prefill_tokens + num_decode_tokens
|
||||
assert value.shape[0] == num_prefill_tokens + num_decode_tokens
|
||||
|
||||
output = torch.empty_like(query)
|
||||
# Query for decode. KV is not needed because it is already cached.
|
||||
decode_query = query[num_prefill_tokens:]
|
||||
# QKV for prefill.
|
||||
@@ -680,6 +712,9 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
assert query.shape[0] == num_prefill_tokens
|
||||
assert decode_query.shape[0] == num_decode_tokens
|
||||
|
||||
prefill_output: Optional[torch.Tensor] = None
|
||||
decode_output: Optional[torch.Tensor] = None
|
||||
|
||||
if prefill_meta := attn_metadata.prefill_metadata:
|
||||
# Prompt run.
|
||||
if (kv_cache is None or prefill_meta.block_tables is None
|
||||
@@ -687,7 +722,7 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
# normal attention
|
||||
# When block_tables are not filled, it means q and k are the
|
||||
# prompt, and they have the same length.
|
||||
out = torch.ops.vllm.flash_attn_varlen_func(
|
||||
prefill_output = torch.ops.vllm.flash_attn_varlen_func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
@@ -701,42 +736,44 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
softcap=self.logits_soft_cap,
|
||||
)
|
||||
assert output[:num_prefill_tokens].shape == out.shape
|
||||
output[:num_prefill_tokens] = out
|
||||
else:
|
||||
# prefix-enabled attention
|
||||
assert prefill_meta.seq_lens is not None
|
||||
max_seq_len = max(prefill_meta.seq_lens)
|
||||
output[:
|
||||
num_prefill_tokens] = torch.ops.vllm.flash_attn_varlen_func( # noqa
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
cu_seqlens_q=prefill_meta.query_start_loc,
|
||||
max_seqlen_q=prefill_meta.max_query_len,
|
||||
cu_seqlens_k=prefill_meta.seq_start_loc,
|
||||
max_seqlen_k=max_seq_len,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
block_table=prefill_meta.block_tables,
|
||||
softcap=self.logits_soft_cap,
|
||||
)
|
||||
|
||||
if decode_meta := attn_metadata.decode_metadata:
|
||||
# Decoding run.
|
||||
output[
|
||||
num_prefill_tokens:] = torch.ops.vllm.flash_attn_with_kvcache(
|
||||
decode_query.unsqueeze(1),
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_table=decode_meta.block_tables,
|
||||
cache_seqlens=decode_meta.seq_lens_tensor,
|
||||
prefill_output = torch.ops.vllm.flash_attn_varlen_func( # noqa
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
cu_seqlens_q=prefill_meta.query_start_loc,
|
||||
max_seqlen_q=prefill_meta.max_query_len,
|
||||
cu_seqlens_k=prefill_meta.seq_start_loc,
|
||||
max_seqlen_k=max_seq_len,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
block_table=prefill_meta.block_tables,
|
||||
softcap=self.logits_soft_cap,
|
||||
).squeeze(1)
|
||||
)
|
||||
|
||||
# Reshape the output tensor.
|
||||
if decode_meta := attn_metadata.decode_metadata:
|
||||
# Decoding run.
|
||||
decode_output = torch.ops.vllm.flash_attn_with_kvcache(
|
||||
decode_query.unsqueeze(1),
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_table=decode_meta.block_tables,
|
||||
cache_seqlens=decode_meta.seq_lens_tensor,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
softcap=self.logits_soft_cap,
|
||||
).squeeze(1)
|
||||
|
||||
if prefill_output is None:
|
||||
assert decode_output is not None
|
||||
return decode_output.view(num_decode_tokens, hidden_size)
|
||||
if decode_output is None:
|
||||
assert prefill_output is not None
|
||||
return prefill_output.view(num_prefill_tokens, hidden_size)
|
||||
output = torch.cat([prefill_output, decode_output], dim=0)
|
||||
return output.view(num_tokens, hidden_size)
|
||||
|
||||
@@ -30,7 +30,8 @@ from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
|
||||
make_tensor_with_pad)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
|
||||
|
||||
class FlashInferBackend(AttentionBackend):
|
||||
@@ -268,6 +269,10 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
query_start_loc: Optional[torch.Tensor] = None
|
||||
block_tables: Optional[torch.Tensor] = None
|
||||
|
||||
# used for GPU in-place advance_step
|
||||
seq_lens_tensor: Optional[torch.Tensor] = None
|
||||
block_table_bound: Optional[torch.Tensor] = None
|
||||
|
||||
# An example for paged_kv_indices, paged_kv_indptr:
|
||||
# request 1, page indices [0, 5, 8]
|
||||
# request 2, page indices [1, 6, 7]
|
||||
@@ -318,6 +323,8 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
assert self.block_table_bound is not None
|
||||
assert self.seq_lens_tensor is not None
|
||||
batch_size = self.query_start_loc.shape[0] - 1
|
||||
assert batch_size >= 0
|
||||
# We will use flash attention for profiling to
|
||||
@@ -327,6 +334,8 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
self.block_table_bound = self.block_table_bound.to(self.device)
|
||||
self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.prefill_wrapper.end_forward()
|
||||
self.prefill_wrapper.begin_forward(
|
||||
@@ -335,14 +344,18 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
self.num_qo_heads, self.num_kv_heads, self.head_dim,
|
||||
self.page_size)
|
||||
else:
|
||||
if not self.use_cuda_graph:
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
# handle model warmup path
|
||||
if self.block_table_bound is not None:
|
||||
self.block_table_bound = self.block_table_bound.to(self.device)
|
||||
if self.seq_lens_tensor is not None:
|
||||
self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
|
||||
|
||||
assert self.decode_wrapper is not None
|
||||
self.decode_wrapper.end_forward()
|
||||
@@ -391,6 +404,48 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
|
||||
return self
|
||||
|
||||
def advance_step(
|
||||
self,
|
||||
model_input: "ModelInputForGPUWithSamplingMetadata",
|
||||
sampled_token_ids: Optional[torch.Tensor],
|
||||
block_size: int,
|
||||
num_seqs: int,
|
||||
num_queries: int,
|
||||
):
|
||||
"""
|
||||
Update metadata in-place to advance one decode step.
|
||||
"""
|
||||
|
||||
assert num_seqs > 0
|
||||
assert num_queries > 0
|
||||
assert model_input.attn_metadata is not None
|
||||
assert sampled_token_ids is not None
|
||||
|
||||
        # When using cudagraph, num_seqs is padded to the next captured
        # batch size, but num_queries tracks the actual number of requests in
        # the batch. For --enforce-eager mode, num_seqs == num_queries
if num_seqs != num_queries:
|
||||
assert num_seqs > num_queries
|
||||
assert self.use_cuda_graph
|
||||
|
||||
model_input.input_tokens[:num_queries] = sampled_token_ids.flatten()
|
||||
|
||||
# Update GPU tensors
|
||||
ops.advance_step_flashinfer(
|
||||
num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=model_input.input_tokens,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables,
|
||||
paged_kv_indices=self.paged_kv_indices,
|
||||
paged_kv_indptr=self.paged_kv_indptr,
|
||||
paged_kv_last_page_len=self.paged_kv_last_page_len,
|
||||
block_table_bound=self.block_table_bound)
|
||||
|
||||
|
||||
class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
|
||||
@@ -428,7 +483,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
self.paged_kv_indptr: List[int] = [0]
|
||||
# paged_kv_last_page_len is the length of the last page of each request
|
||||
self.paged_kv_last_page_len: List[int] = []
|
||||
|
||||
self.total_blocks = 0
|
||||
self.is_profile_run: bool = False
|
||||
|
||||
def _add_seq_group(
|
||||
@@ -499,6 +554,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
# block_table_bound is 1 with 1 valid block.
|
||||
# If seq_len = 15, block_size = 16,
|
||||
# block_table_bound is 0 + 1 with 1 valid block.
|
||||
self.total_blocks += len(block_table)
|
||||
block_table_bound = seq_len // self.block_size + 1 \
|
||||
if seq_len % self.block_size != 0 \
|
||||
else seq_len // self.block_size
|
||||
@@ -541,9 +597,19 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
# The shape of graph_block_tables is
|
||||
# [max batch size, max context len // block size].
|
||||
input_block_tables = self.runner.graph_block_tables[:batch_size]
|
||||
max_blocks = input_block_tables.shape[1]
|
||||
for i, block_table in enumerate(self.block_tables):
|
||||
if block_table:
|
||||
input_block_tables[i, :len(block_table)] = block_table
|
||||
num_blocks = len(block_table)
|
||||
if num_blocks <= max_blocks:
|
||||
input_block_tables[i, :num_blocks] = block_table
|
||||
else:
|
||||
# It may be possible to have more blocks allocated due
|
||||
# to lookahead slots of multi-step, however, they are
|
||||
# not used anyway, so can be safely ignored.
|
||||
input_block_tables[
|
||||
i, :max_blocks] = block_table[:max_blocks]
|
||||
|
||||
block_tables = torch.from_numpy(input_block_tables).to(
|
||||
device, non_blocking=True)
|
||||
|
||||
@@ -583,6 +649,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
out=query_start_loc[1:])
|
||||
|
||||
if len(self.paged_kv_indptr) > 0:
|
||||
# extend to the maximum number of blocks as returned by the
|
||||
# scheduler
|
||||
self.paged_kv_indices.extend(
|
||||
[0] * (self.total_blocks - len(self.paged_kv_indices)))
|
||||
paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices,
|
||||
device="cpu",
|
||||
dtype=torch.int)
|
||||
@@ -591,10 +661,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
dtype=torch.int)
|
||||
paged_kv_last_page_len_tensor = torch.tensor(
|
||||
self.paged_kv_last_page_len, device="cpu", dtype=torch.int)
|
||||
block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) -
|
||||
1,
|
||||
device="cpu",
|
||||
dtype=torch.int)
|
||||
else:
|
||||
paged_kv_indices_tensor = None
|
||||
paged_kv_indptr_tensor = None
|
||||
paged_kv_last_page_len_tensor = None
|
||||
block_table_bound_tensor = None
|
||||
|
||||
if self.runner.kv_cache_dtype.startswith("fp8"):
|
||||
kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
|
||||
@@ -613,6 +688,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
paged_kv_indptr=paged_kv_indptr_tensor,
|
||||
paged_kv_indices=paged_kv_indices_tensor,
|
||||
paged_kv_last_page_len=paged_kv_last_page_len_tensor,
|
||||
block_table_bound=block_table_bound_tensor,
|
||||
seq_lens_tensor=seq_lens_tensor,
|
||||
num_qo_heads=self.runner.model_config.get_num_attention_heads(
|
||||
self.runner.parallel_config),
|
||||
num_kv_heads=self.runner.model_config.get_num_kv_heads(
|
||||
|
||||
@@ -869,6 +869,13 @@ class ParallelConfig:
|
||||
f"distributed executor backend "
|
||||
f"'{self.distributed_executor_backend}'.")
|
||||
|
||||
if current_platform.is_tpu() and self.world_size > 1:
|
||||
if self.distributed_executor_backend is None:
|
||||
self.distributed_executor_backend = "ray"
|
||||
if self.distributed_executor_backend != "ray":
|
||||
raise ValueError(
|
||||
"TPU backend only supports Ray for distributed inference.")
|
||||
|
||||
if self.distributed_executor_backend is None and self.world_size > 1:
|
||||
# We use multiprocessing by default if world_size fits on the
|
||||
# current node and we aren't in a ray placement group.
|
||||
@@ -876,7 +883,7 @@ class ParallelConfig:
|
||||
from vllm.executor import ray_utils
|
||||
backend = "mp"
|
||||
ray_found = ray_utils.ray_is_available()
|
||||
if (torch.cuda.is_available()
|
||||
if (current_platform.is_cuda()
|
||||
and cuda_device_count_stateless() < self.world_size):
|
||||
if not ray_found:
|
||||
raise ValueError("Unable to load Ray which is "
|
||||
|
||||
@@ -843,6 +843,13 @@ class EngineArgs:
|
||||
device_config = DeviceConfig(device=self.device)
|
||||
model_config = self.create_model_config()
|
||||
|
||||
if model_config.is_multimodal_model:
|
||||
if self.enable_prefix_caching:
|
||||
logger.warning(
|
||||
"--enable-prefix-caching is currently not "
|
||||
"supported for multimodal models and has been disabled.")
|
||||
self.enable_prefix_caching = False
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size if self.device != "neuron" else
|
||||
self.max_model_len, # neuron needs block_size = max_model_len
|
||||
@@ -874,7 +881,10 @@ class EngineArgs:
|
||||
# If not explicitly set, enable chunked prefill by default for
|
||||
# long context (> 32K) models. This is to avoid OOM errors in the
|
||||
# initial memory profiling phase.
|
||||
if use_long_context:
|
||||
|
||||
# Chunked prefill is currently disabled for multimodal models by
|
||||
# default.
|
||||
if use_long_context and not model_config.is_multimodal_model:
|
||||
is_gpu = device_config.device_type == "cuda"
|
||||
use_sliding_window = (model_config.get_sliding_window()
|
||||
is not None)
|
||||
@@ -1035,7 +1045,6 @@ class EngineArgs:
|
||||
@dataclass
|
||||
class AsyncEngineArgs(EngineArgs):
|
||||
"""Arguments for asynchronous vLLM engine."""
|
||||
engine_use_ray: bool = False
|
||||
disable_log_requests: bool = False
|
||||
|
||||
@staticmethod
|
||||
@@ -1043,16 +1052,6 @@ class AsyncEngineArgs(EngineArgs):
|
||||
async_args_only: bool = False) -> FlexibleArgumentParser:
|
||||
if not async_args_only:
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
parser.add_argument('--engine-use-ray',
|
||||
action='store_true',
|
||||
help='Use Ray to start the LLM engine in a '
|
||||
'separate process as the server process.'
|
||||
'(DEPRECATED. This argument is deprecated '
|
||||
'and will be removed in a future update. '
|
||||
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
|
||||
'use it. See '
|
||||
'https://github.com/vllm-project/vllm/issues/7045.'
|
||||
')')
|
||||
parser.add_argument('--disable-log-requests',
|
||||
action='store_true',
|
||||
help='Disable logging requests.')
|
||||
|
||||
@@ -4,22 +4,18 @@ from functools import partial
|
||||
from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
|
||||
Mapping, Optional, Set, Tuple, Type, Union)
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_timeout import asyncio_timeout
|
||||
from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine,
|
||||
PromptComponents, SchedulerOutputState)
|
||||
from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
|
||||
from vllm.engine.metrics_types import StatLoggerBase
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster, ray
|
||||
from vllm.inputs import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
from vllm.executor.gpu_executor import GPUExecutorAsync
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
from vllm.inputs import PromptInputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
@@ -30,7 +26,6 @@ from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
logger = init_logger(__name__)
|
||||
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
|
||||
@@ -404,139 +399,6 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
"""Stop the remote worker execution loop."""
|
||||
await self.model_executor.stop_remote_worker_execution_loop_async()
|
||||
|
||||
async def _tokenize_prompt_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> List[int]:
|
||||
"""Async version of :meth:`_tokenize_prompt`."""
|
||||
tokenizer = self.get_tokenizer_group(
|
||||
missing_msg="prompts must be None if skip_tokenizer_init is True")
|
||||
|
||||
return await tokenizer.encode_async(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
|
||||
async def _extract_prompt_components_async(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> PromptComponents:
|
||||
"""Async version of :meth:`_extract_prompt_components`."""
|
||||
if isinstance(inputs, str):
|
||||
prompt = inputs
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
multi_modal_data = None
|
||||
elif isinstance(inputs, dict):
|
||||
if "prompt_token_ids" in inputs:
|
||||
prompt = None
|
||||
prompt_token_ids = inputs["prompt_token_ids"]
|
||||
else:
|
||||
# NOTE: This extra assignment is required to pass mypy
|
||||
prompt = parsed_prompt = inputs["prompt"]
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
parsed_prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
multi_modal_data = inputs.get("multi_modal_data")
|
||||
else:
|
||||
assert_never(inputs)
|
||||
|
||||
return prompt, prompt_token_ids, multi_modal_data
|
||||
|
||||
async def _process_encoder_decoder_prompt_async(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
||||
encoder_comps: PromptComponents
|
||||
decoder_comps: DecoderPromptComponents
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
encoder_task = self._extract_prompt_components_async(
|
||||
inputs["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
if (decoder_input := inputs["decoder_prompt"]) is None:
|
||||
encoder_comps = await encoder_task
|
||||
decoder_comps = None, None, None
|
||||
else:
|
||||
decoder_task = self._extract_prompt_components_async(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
encoder_comps, decoder_comps = await asyncio.gather(
|
||||
encoder_task, decoder_task)
|
||||
else:
|
||||
encoder_comps = await self._extract_prompt_components_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
decoder_comps = None, None, None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
|
||||
|
||||
async def _process_decoder_only_prompt_async(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> LLMInputs:
|
||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
||||
prompt_comps = await self._extract_prompt_components_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
async def process_model_inputs_async(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
|
||||
"""Async version of :meth:`process_model_inputs`."""
|
||||
if self.is_encoder_decoder_model():
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
model_inputs = await self._process_encoder_decoder_prompt_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
model_inputs = await self._process_decoder_only_prompt_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
return self.input_processor(model_inputs)
|
||||
|
||||
async def add_request_async(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -554,12 +416,13 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
if arrival_time is None:
|
||||
arrival_time = time.time()
|
||||
|
||||
processed_inputs = await self.process_model_inputs_async(
|
||||
preprocessed_inputs = await self.input_preprocessor.preprocess_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
processed_inputs = self.input_processor(preprocessed_inputs)
|
||||
|
||||
self._add_processed_request(
|
||||
request_id=request_id,
|
||||
@@ -590,9 +453,6 @@ class AsyncLLMEngine:
|
||||
worker_use_ray: Whether to use Ray for model workers. Required for
|
||||
distributed execution. Should be the same as
|
||||
`parallel_config.worker_use_ray`.
|
||||
engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
|
||||
async frontend will be executed in a separate process as the
|
||||
model workers.
|
||||
log_requests: Whether to log the requests.
|
||||
start_engine_loop: If True, the background task to run the engine
|
||||
will be automatically started in the generate call.
|
||||
@@ -604,41 +464,23 @@ class AsyncLLMEngine:
|
||||
|
||||
def __init__(self,
|
||||
worker_use_ray: bool,
|
||||
engine_use_ray: bool,
|
||||
*args,
|
||||
log_requests: bool = True,
|
||||
start_engine_loop: bool = True,
|
||||
**kwargs) -> None:
|
||||
self.worker_use_ray = worker_use_ray
|
||||
self.engine_use_ray = engine_use_ray
|
||||
self.log_requests = log_requests
|
||||
self.engine = self._init_engine(*args, **kwargs)
|
||||
self.engine = self._engine_class(*args, **kwargs)
|
||||
|
||||
# This ensures quick processing of request outputs
|
||||
# so the append to asyncio queues is not delayed,
|
||||
# especially for multi-step.
|
||||
#
|
||||
# TODO: Currently, disabled for engine_use_ray, ask
|
||||
# Cody/Will/Woosuk about this case.
|
||||
self.use_process_request_outputs_callback = not self.engine_use_ray
|
||||
self.use_process_request_outputs_callback = True
|
||||
if self.use_process_request_outputs_callback:
|
||||
self.engine.process_request_outputs_callback = \
|
||||
self.process_request_outputs
|
||||
|
||||
if self.engine_use_ray:
|
||||
print_warning_once(
|
||||
"DEPRECATED. `--engine-use-ray` is deprecated and will "
|
||||
"be removed in a future update. "
|
||||
"See https://github.com/vllm-project/vllm/issues/7045.")
|
||||
|
||||
if envs.VLLM_ALLOW_ENGINE_USE_RAY:
|
||||
print_warning_once(
|
||||
"VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
|
||||
else:
|
||||
raise ValueError("`--engine-use-ray` is deprecated. "
|
||||
"Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
|
||||
"force use it")
|
||||
|
||||
self.background_loop: Optional[asyncio.Future] = None
|
||||
# We need to keep a reference to unshielded
|
||||
# task as well to prevent it from being garbage
|
||||
@@ -725,16 +567,11 @@ class AsyncLLMEngine:
|
||||
# Create the engine configs.
|
||||
engine_config = engine_args.create_engine_config()
|
||||
|
||||
if engine_args.engine_use_ray:
|
||||
from vllm.executor import ray_utils
|
||||
ray_utils.assert_ray_available()
|
||||
|
||||
executor_class = cls._get_executor_cls(engine_config)
|
||||
|
||||
# Create the async LLM engine.
|
||||
engine = cls(
|
||||
executor_class.uses_ray,
|
||||
engine_args.engine_use_ray,
|
||||
**engine_config.to_dict(),
|
||||
executor_class=executor_class,
|
||||
log_requests=not engine_args.disable_log_requests,
|
||||
@@ -777,10 +614,6 @@ class AsyncLLMEngine:
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_tokenizer.remote( # type: ignore
|
||||
lora_request)
|
||||
|
||||
return await (self.engine.get_tokenizer_group().
|
||||
get_lora_tokenizer_async(lora_request))
|
||||
|
||||
@@ -814,26 +647,6 @@ class AsyncLLMEngine:
|
||||
self._background_loop_unshielded = None
|
||||
self.background_loop = None
|
||||
|
||||
def _init_engine(self, *args,
|
||||
**kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
|
||||
if not self.engine_use_ray:
|
||||
engine_class = self._engine_class
|
||||
elif self.worker_use_ray:
|
||||
engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
|
||||
else:
|
||||
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
|
||||
# order of the arguments.
|
||||
cache_config = kwargs["cache_config"]
|
||||
parallel_config = kwargs["parallel_config"]
|
||||
if (parallel_config.tensor_parallel_size == 1
|
||||
and parallel_config.pipeline_parallel_size == 1):
|
||||
num_gpus = cache_config.gpu_memory_utilization
|
||||
else:
|
||||
num_gpus = 1
|
||||
engine_class = ray.remote(num_gpus=num_gpus)(
|
||||
self._engine_class).remote
|
||||
return engine_class(*args, **kwargs)
|
||||
|
||||
async def engine_step(self, virtual_engine: int) -> bool:
|
||||
"""Kick the engine to process the waiting requests.
|
||||
|
||||
@@ -844,13 +657,8 @@ class AsyncLLMEngine:
|
||||
|
||||
for new_request in new_requests:
|
||||
# Add the request into the vLLM engine's waiting queue.
|
||||
# TODO: Maybe add add_request_batch to reduce Ray overhead
|
||||
try:
|
||||
if self.engine_use_ray:
|
||||
await self.engine.add_request.remote( # type: ignore
|
||||
**new_request)
|
||||
else:
|
||||
await self.engine.add_request_async(**new_request)
|
||||
await self.engine.add_request_async(**new_request)
|
||||
except ValueError as e:
|
||||
# TODO: use a vLLM specific error for failed validation
|
||||
self._request_tracker.process_exception(
|
||||
@@ -862,10 +670,7 @@ class AsyncLLMEngine:
|
||||
if aborted_requests:
|
||||
await self._engine_abort(aborted_requests)
|
||||
|
||||
if self.engine_use_ray:
|
||||
request_outputs = await self.engine.step.remote() # type: ignore
|
||||
else:
|
||||
request_outputs = await self.engine.step_async(virtual_engine)
|
||||
request_outputs = await self.engine.step_async(virtual_engine)
|
||||
|
||||
# Put the outputs into the corresponding streams.
|
||||
# If used as a callback, then already invoked inside
|
||||
@@ -891,16 +696,10 @@ class AsyncLLMEngine:
|
||||
return all_finished
|
||||
|
||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
||||
if self.engine_use_ray:
|
||||
await self.engine.abort_request.remote(request_ids) # type: ignore
|
||||
else:
|
||||
self.engine.abort_request(request_ids)
|
||||
self.engine.abort_request(request_ids)
|
||||
|
||||
async def run_engine_loop(self):
|
||||
if self.engine_use_ray:
|
||||
pipeline_parallel_size = 1 # type: ignore
|
||||
else:
|
||||
pipeline_parallel_size = \
|
||||
pipeline_parallel_size = \
|
||||
self.engine.parallel_config.pipeline_parallel_size
|
||||
has_requests_in_progress = [False] * pipeline_parallel_size
|
||||
while True:
|
||||
@@ -912,12 +711,7 @@ class AsyncLLMEngine:
|
||||
# timeout, and unblocks the RPC thread in the workers so that
|
||||
# they can process any other queued control plane messages,
|
||||
# such as add/remove lora adapters.
|
||||
if self.engine_use_ray:
|
||||
await (self.engine.stop_remote_worker_execution_loop.
|
||||
remote() # type: ignore
|
||||
)
|
||||
else:
|
||||
await self.engine.stop_remote_worker_execution_loop_async()
|
||||
await self.engine.stop_remote_worker_execution_loop_async()
|
||||
await self._request_tracker.wait_for_new_requests()
|
||||
logger.debug("Got new requests!")
|
||||
requests_in_progress = [
|
||||
@@ -938,17 +732,9 @@ class AsyncLLMEngine:
|
||||
for task in done:
|
||||
result = task.result()
|
||||
virtual_engine = requests_in_progress.index(task)
|
||||
if self.engine_use_ray:
|
||||
has_unfinished_requests = (
|
||||
await (self.engine.
|
||||
has_unfinished_requests_for_virtual_engine.
|
||||
remote( # type: ignore
|
||||
virtual_engine)))
|
||||
else:
|
||||
has_unfinished_requests = (
|
||||
self.engine.
|
||||
has_unfinished_requests_for_virtual_engine(
|
||||
virtual_engine))
|
||||
has_unfinished_requests = (
|
||||
self.engine.has_unfinished_requests_for_virtual_engine(
|
||||
virtual_engine))
|
||||
if result or has_unfinished_requests:
|
||||
requests_in_progress[virtual_engine] = (
|
||||
asyncio.create_task(
|
||||
@@ -1190,52 +976,29 @@ class AsyncLLMEngine:
|
||||
|
||||
async def get_model_config(self) -> ModelConfig:
|
||||
"""Get the model configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_model_config.remote() # type: ignore
|
||||
else:
|
||||
return self.engine.get_model_config()
|
||||
return self.engine.get_model_config()
|
||||
|
||||
async def get_parallel_config(self) -> ParallelConfig:
|
||||
"""Get the parallel configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_parallel_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_parallel_config()
|
||||
return self.engine.get_parallel_config()
|
||||
|
||||
async def get_decoding_config(self) -> DecodingConfig:
|
||||
"""Get the decoding configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_decoding_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_decoding_config()
|
||||
return self.engine.get_decoding_config()
|
||||
|
||||
async def get_scheduler_config(self) -> SchedulerConfig:
|
||||
"""Get the scheduling configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_scheduler_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_scheduler_config()
|
||||
return self.engine.get_scheduler_config()
|
||||
|
||||
async def get_lora_config(self) -> LoRAConfig:
|
||||
"""Get the lora configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_lora_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_lora_config()
|
||||
return self.engine.get_lora_config()
|
||||
|
||||
async def do_log_stats(
|
||||
self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs] = None,
|
||||
model_output: Optional[List[SamplerOutput]] = None) -> None:
|
||||
if self.engine_use_ray:
|
||||
await self.engine.do_log_stats.remote( # type: ignore
|
||||
scheduler_outputs, model_output)
|
||||
else:
|
||||
self.engine.do_log_stats()
|
||||
self.engine.do_log_stats()
|
||||
|
||||
async def check_health(self) -> None:
|
||||
"""Raises an error if engine is unhealthy."""
|
||||
@@ -1244,40 +1007,30 @@ class AsyncLLMEngine:
|
||||
if self.is_stopped:
|
||||
raise AsyncEngineDeadError("Background loop is stopped.")
|
||||
|
||||
if self.engine_use_ray:
|
||||
try:
|
||||
await self.engine.check_health.remote() # type: ignore
|
||||
except ray.exceptions.RayActorError as e:
|
||||
raise RuntimeError("Engine is dead.") from e
|
||||
else:
|
||||
await self.engine.check_health_async()
|
||||
await self.engine.check_health_async()
|
||||
logger.debug("Health check took %fs", time.perf_counter() - t)
|
||||
|
||||
async def is_tracing_enabled(self) -> bool:
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.is_tracing_enabled.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.is_tracing_enabled()
|
||||
return self.engine.is_tracing_enabled()
|
||||
|
||||
def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
|
||||
if self.engine_use_ray:
|
||||
ray.get(
|
||||
self.engine.add_logger.remote( # type: ignore
|
||||
logger_name=logger_name, logger=logger))
|
||||
else:
|
||||
self.engine.add_logger(logger_name=logger_name, logger=logger)
|
||||
self.engine.add_logger(logger_name=logger_name, logger=logger)
|
||||
|
||||
def remove_logger(self, logger_name: str) -> None:
|
||||
if self.engine_use_ray:
|
||||
ray.get(
|
||||
self.engine.remove_logger.remote( # type: ignore
|
||||
logger_name=logger_name))
|
||||
else:
|
||||
self.engine.remove_logger(logger_name=logger_name)
|
||||
self.engine.remove_logger(logger_name=logger_name)
|
||||
|
||||
async def start_profile(self) -> None:
|
||||
self.engine.model_executor._run_workers("start_profile")
|
||||
# using type instead of isinstance to check to avoid capturing
|
||||
# inherited classes
|
||||
if type(self.engine.model_executor) == GPUExecutorAsync:
|
||||
self.engine.model_executor.start_profile()
|
||||
else:
|
||||
self.engine.model_executor._run_workers("start_profile")
|
||||
|
||||
async def stop_profile(self) -> None:
|
||||
self.engine.model_executor._run_workers("stop_profile")
|
||||
# using type instead of isinstance to check to avoid capturing
|
||||
# inherited classes
|
||||
if type(self.engine.model_executor) == GPUExecutorAsync:
|
||||
self.engine.model_executor.stop_profile()
|
||||
else:
|
||||
self.engine.model_executor._run_workers("stop_profile")
|
||||
|
||||
@@ -3,13 +3,13 @@ import time
|
||||
from collections import deque
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List,
|
||||
Mapping, NamedTuple, Optional)
|
||||
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
|
||||
Iterable, List, Mapping, NamedTuple, Optional)
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Set, Tuple, Type, Union
|
||||
from typing import Set, Type, Union
|
||||
|
||||
import torch
|
||||
from typing_extensions import TypeVar, assert_never
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
|
||||
@@ -26,20 +26,19 @@ from vllm.engine.output_processor.interfaces import (
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.engine.output_processor.util import create_output_by_sequence_group
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.executor.gpu_executor import GPUExecutor
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
|
||||
InputRegistry, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
InputRegistry, LLMInputs, PromptInputs)
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
|
||||
RequestOutputFactory)
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
|
||||
Sequence, SequenceGroup, SequenceGroupMetadata,
|
||||
SequenceStatus)
|
||||
@@ -75,11 +74,6 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
|
||||
_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
|
||||
_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)
|
||||
|
||||
PromptComponents = Tuple[Optional[str], List[int],
|
||||
Optional[MultiModalDataDict]]
|
||||
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
|
||||
Optional[MultiModalDataDict]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SchedulerOutputState:
|
||||
@@ -225,9 +219,6 @@ class LLMEngine:
|
||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
|
||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
||||
# To improve performance, only final requests outputs may be required.
|
||||
# If this set to true, then no intermediate outputs will be returned.
|
||||
step_return_finished_only: bool = False,
|
||||
) -> None:
|
||||
logger.info(
|
||||
"Initializing an LLM engine (v%s) with config: "
|
||||
@@ -295,7 +286,6 @@ class LLMEngine:
|
||||
self.observability_config = observability_config or ObservabilityConfig(
|
||||
)
|
||||
self.log_stats = log_stats
|
||||
self.step_return_finished_only = step_return_finished_only
|
||||
|
||||
if not self.model_config.skip_tokenizer_init:
|
||||
self.tokenizer = self._init_tokenizer()
|
||||
@@ -317,6 +307,9 @@ class LLMEngine:
|
||||
self.generation_config_fields = _load_generation_config_dict(
|
||||
model_config)
|
||||
|
||||
self.input_preprocessor = InputPreprocessor(model_config,
|
||||
self.tokenizer)
|
||||
|
||||
self.input_registry = input_registry
|
||||
self.input_processor = input_registry.create_input_processor(
|
||||
model_config)
|
||||
@@ -397,7 +390,7 @@ class LLMEngine:
|
||||
|
||||
# Currently used by AsyncLLMEngine to ensure quick append
|
||||
# of request outputs to asyncio queues
|
||||
self.process_request_outputs_callback = None
|
||||
self.process_request_outputs_callback: Optional[Callable] = None
|
||||
|
||||
# Create the scheduler.
|
||||
# NOTE: the cache_config here have been updated with the numbers of
|
||||
@@ -575,19 +568,15 @@ class LLMEngine:
|
||||
if model_executor := getattr(self, "model_executor", None):
|
||||
model_executor.shutdown()
|
||||
|
||||
MISSING_TOKENIZER_GROUP_MSG = ("Unable to get tokenizer because "
|
||||
"skip_tokenizer_init is True")
|
||||
|
||||
def get_tokenizer_group(
|
||||
self,
|
||||
group_type: Type[_G] = BaseTokenizerGroup,
|
||||
*,
|
||||
missing_msg: str = MISSING_TOKENIZER_GROUP_MSG,
|
||||
) -> _G:
|
||||
tokenizer_group = self.tokenizer
|
||||
|
||||
if tokenizer_group is None:
|
||||
raise ValueError(missing_msg)
|
||||
raise ValueError("Unable to get tokenizer because "
|
||||
"skip_tokenizer_init is True")
|
||||
if not isinstance(tokenizer_group, group_type):
|
||||
raise TypeError("Invalid type of tokenizer group. "
|
||||
f"Expected type: {group_type}, but "
|
||||
@@ -619,52 +608,6 @@ class LLMEngine:
|
||||
self.prompt_adapter_config.verify_with_model_config(
|
||||
self.model_config)
|
||||
|
||||
def _get_bos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for BOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
|
||||
|
||||
def _get_eos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for EOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
|
||||
|
||||
def _get_decoder_start_token_id(self) -> Optional[int]:
|
||||
'''
|
||||
Obtain the decoder start token id employed by an encoder/decoder
|
||||
model. Returns None for non-encoder/decoder models or if the
|
||||
model config is unavailable.
|
||||
'''
|
||||
|
||||
if not self.is_encoder_decoder_model():
|
||||
logger.warning("Using None for decoder start token id because "
|
||||
"this is not an encoder/decoder model.")
|
||||
return None
|
||||
|
||||
if (self.model_config is None or self.model_config.hf_config is None):
|
||||
logger.warning("Using None for decoder start token id because "
|
||||
"model config is not available.")
|
||||
return None
|
||||
|
||||
dec_start_token_id = getattr(self.model_config.hf_config,
|
||||
'decoder_start_token_id', None)
|
||||
if dec_start_token_id is None:
|
||||
logger.warning("Falling back on <BOS> for decoder start token id "
|
||||
"because decoder start token id is not available.")
|
||||
dec_start_token_id = self._get_bos_token_id()
|
||||
|
||||
return dec_start_token_id
|
||||
|
||||
def _add_processed_request(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -679,7 +622,7 @@ class LLMEngine:
|
||||
# Create the sequences.
|
||||
block_size = self.cache_config.block_size
|
||||
seq_id = next(self.seq_counter)
|
||||
eos_token_id = self._get_eos_token_id(lora_request)
|
||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
||||
|
||||
seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
|
||||
lora_request, prompt_adapter_request)
|
||||
@@ -729,334 +672,6 @@ class LLMEngine:
|
||||
def stop_remote_worker_execution_loop(self) -> None:
|
||||
self.model_executor.stop_remote_worker_execution_loop()
|
||||
|
||||
_LLMInputComponentsType = Tuple[str, List[int]]
|
||||
|
||||
def _prepare_decoder_input_ids_for_generation(
|
||||
self,
|
||||
decoder_input_ids: Optional[List[int]],
|
||||
) -> List[int]:
|
||||
"""
|
||||
Prepares `decoder_input_ids` for generation with encoder-decoder models.
|
||||
|
||||
Based on
|
||||
|
||||
https://github.com/huggingface/transformers/blob/
|
||||
4037a2b5b1278736e566aec12e169100275545ea/
|
||||
src/transformers/generation/utils.py
|
||||
|
||||
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
|
||||
|
||||
Arguments:
|
||||
|
||||
* decoder_input_ids: input token ids to preprocess
|
||||
|
||||
Returns:
|
||||
|
||||
* Processed token list
|
||||
"""
|
||||
|
||||
decoder_start_token_id = self._get_decoder_start_token_id()
|
||||
assert decoder_start_token_id is not None
|
||||
|
||||
if decoder_input_ids is None:
|
||||
# no decoder prompt input ->
|
||||
# use decoder_start_token_id as decoder_input_ids
|
||||
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
|
||||
|
||||
if (len(decoder_input_ids) == 0
|
||||
or decoder_input_ids[0] != decoder_start_token_id):
|
||||
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
|
||||
|
||||
return decoder_input_ids
|
||||
|
||||
def _tokenize_prompt(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> List[int]:
|
||||
'''
|
||||
Wrapper around application of the model's tokenizer.
|
||||
|
||||
Arguments:
|
||||
|
||||
* prompt
|
||||
* request_id
|
||||
* lora_request
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt token ids
|
||||
'''
|
||||
|
||||
tokenizer = self.get_tokenizer_group(
|
||||
missing_msg="prompts must be None if skip_tokenizer_init is True")
|
||||
|
||||
return tokenizer.encode(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
|
||||
def _extract_prompt_components(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> PromptComponents:
|
||||
'''
|
||||
Extract the components of any single encoder or decoder input prompt.
|
||||
|
||||
Arguments:
|
||||
|
||||
* request_id
|
||||
* inputs: single encoder or decoder input prompt
|
||||
* lora_request: this is only valid for decoder prompts
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt
|
||||
* prompt_token_ids
|
||||
* multi_modal_data
|
||||
'''
|
||||
|
||||
if isinstance(inputs, str):
|
||||
prompt = inputs
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
multi_modal_data = None
|
||||
elif isinstance(inputs, dict):
|
||||
if "prompt_token_ids" in inputs:
|
||||
prompt = None
|
||||
prompt_token_ids = inputs["prompt_token_ids"]
|
||||
else:
|
||||
# NOTE: This extra assignment is required to pass mypy
|
||||
prompt = parsed_prompt = inputs["prompt"]
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
parsed_prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
multi_modal_data = inputs.get("multi_modal_data")
|
||||
else:
|
||||
assert_never(inputs)
|
||||
|
||||
return prompt, prompt_token_ids, multi_modal_data
|
||||
|
||||
def _apply_prompt_adapter(
|
||||
self,
|
||||
prompt_token_ids: List[int],
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> List[int]:
|
||||
if prompt_adapter_request:
|
||||
prompt_token_ids = (
|
||||
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
|
||||
+ prompt_token_ids)
|
||||
|
||||
return prompt_token_ids
|
||||
|
||||
def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
|
||||
'''
|
||||
Specifically for encoder/decoder models:
|
||||
generate a default decoder prompt for when
|
||||
the user specifies only the encoder prompt.
|
||||
|
||||
Encoder/decoder models utilize the decoder
|
||||
prompt in different ways; as new models are
|
||||
added, it is intended that this function
|
||||
will be extended to produce differing
|
||||
default decoder prompts, depending on the
|
||||
model variety.
|
||||
|
||||
Absent a special case, the default behavior
|
||||
of this method is to mirror the behavior of
|
||||
the HuggingFace (HF) GenerationMixin for a None
|
||||
decoder prompt, which is to employ a logit processor
|
||||
setting to force the first decoded token to be <BOS>.
|
||||
Here, this behavior is approximated by having the
|
||||
"default" decoder prompt be <BOS>.
|
||||
|
||||
However, it is possible that in the future
|
||||
other models may have different or more
|
||||
complex logic for the default decoder prompt.
|
||||
This motivates having a special helper method
|
||||
for default decoder prompts.
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt_token_ids
|
||||
'''
|
||||
|
||||
bos_token_id = self._get_bos_token_id()
|
||||
assert bos_token_id is not None
|
||||
return [bos_token_id]
|
||||
|
||||
def _build_enc_dec_llm_inputs(
|
||||
self,
|
||||
encoder_comps: PromptComponents,
|
||||
decoder_comps: DecoderPromptComponents,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
|
||||
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
|
||||
|
||||
if encoder_mm_data is not None or decoder_mm_data is not None:
|
||||
raise ValueError("Multi-modal encoder-decoder models are "
|
||||
"not supported yet")
|
||||
|
||||
decoder_prompt_ids = (
|
||||
self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
|
||||
|
||||
return EncoderDecoderLLMInputs(
|
||||
prompt_token_ids=decoder_prompt_ids,
|
||||
prompt=decoder_prompt,
|
||||
encoder_prompt_token_ids=encoder_prompt_ids,
|
||||
encoder_prompt=encoder_prompt,
|
||||
)
|
||||
|
||||
def _process_encoder_decoder_prompt(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
'''
|
||||
For encoder/decoder models only:
|
||||
Process an input prompt into an
|
||||
:class:`EncoderDecoderLLMInputs` instance.
|
||||
|
||||
There are two types of input prompts:
|
||||
singleton prompts which carry only the
|
||||
encoder prompt, and explicit encoder/decoder
|
||||
prompts which carry both the encoder and the
|
||||
decoder prompts as member variables.
|
||||
|
||||
This function handles the following scenarios:
|
||||
* Singleton encoder prompt: extract encoder prompt
|
||||
token ids & infer default decoder prompt token ids
|
||||
* Explicit encoder/decoder prompt: extract encoder
|
||||
and decoder prompt token ids
|
||||
|
||||
Note that for Explicit encoder/decoder prompts,
|
||||
each sub-prompt (encoder or decoder prompt) can
|
||||
have any possible singleton type; thus this
|
||||
method relies on helper functions to obtain
|
||||
token ids for the sub-prompts.
|
||||
|
||||
Arguments:
|
||||
|
||||
* inputs: an input prompt
|
||||
* request_id
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`EncoderDecoderLLMInputs` instance
|
||||
'''
|
||||
|
||||
encoder_comps: PromptComponents
|
||||
decoder_comps: DecoderPromptComponents
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
encoder_comps = self._extract_prompt_components(
|
||||
inputs["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
if (decoder_input := inputs["decoder_prompt"]) is None:
|
||||
decoder_comps = None, None, None
|
||||
else:
|
||||
decoder_comps = self._extract_prompt_components(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
encoder_comps = self._extract_prompt_components(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
decoder_comps = None, None, None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
|
||||
|
||||
def _build_decoder_only_llm_inputs(
|
||||
self,
|
||||
prompt_comps: PromptComponents,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> LLMInputs:
|
||||
prompt, prompt_token_ids, multi_modal_data = prompt_comps
|
||||
|
||||
prompt_token_ids = self._apply_prompt_adapter(
|
||||
prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
|
||||
|
||||
return LLMInputs(prompt_token_ids=prompt_token_ids,
|
||||
prompt=prompt,
|
||||
multi_modal_data=multi_modal_data)
|
||||
|
||||
def _process_decoder_only_prompt(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> LLMInputs:
|
||||
'''
|
||||
For decoder-only models:
|
||||
Process an input prompt into an :class:`LLMInputs` instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
* inputs: input prompt
|
||||
* request_id
|
||||
* lora_request
|
||||
* prompt_adapter_request
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`LLMInputs` instance
|
||||
'''
|
||||
|
||||
prompt_comps = self._extract_prompt_components(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
def process_model_inputs(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
|
||||
|
||||
if self.is_encoder_decoder_model():
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
model_inputs = self._process_encoder_decoder_prompt(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
model_inputs = self._process_decoder_only_prompt(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
return self.input_processor(model_inputs)
|
||||
|
||||
def add_request(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -1115,12 +730,13 @@ class LLMEngine:
        if arrival_time is None:
            arrival_time = time.time()

        processed_inputs = self.process_model_inputs(
        preprocessed_inputs = self.input_preprocessor.preprocess(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
        processed_inputs = self.input_processor(preprocessed_inputs)

        self._add_processed_request(
            request_id=request_id,
@@ -1378,7 +994,8 @@ class LLMEngine:
            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            request_output = RequestOutputFactory.create(seq_group)
            ctx.request_outputs.append(request_output)
            if request_output:
                ctx.request_outputs.append(request_output)

            # When we process a single request, we skip it for the next time,
            # and invoke the request output callback (if there was final output)
@@ -1415,14 +1032,19 @@ class LLMEngine:

            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            if (seq_group.is_finished()
                    if self.step_return_finished_only else True):
                request_output = RequestOutputFactory.create(seq_group)
            request_output = RequestOutputFactory.create(seq_group)
            if request_output:
                ctx.request_outputs.append(request_output)

        for seq_group in scheduler_outputs.ignored_seq_groups:
            params = seq_group.sampling_params
            if params is not None and params.output_kind == (
                    RequestOutputKind.DELTA) and not seq_group.is_finished():
                continue

            request_output = RequestOutputFactory.create(seq_group)
            ctx.request_outputs.append(request_output)
            if request_output:
                ctx.request_outputs.append(request_output)

        # Immediately process request outputs here (if callback is given)
        if (ctx.request_outputs
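A word on the repeated `if request_output:` guards above: with the new RequestOutputKind plumbing, RequestOutputFactory.create() can legitimately produce nothing for a given step (for example FINAL_ONLY requests that have not finished yet), so outputs are only appended when truthy. A compressed sketch of that gating; the loop scaffolding and seq-group objects are invented stand-ins, and the None-return behavior of the factory is inferred from the added checks rather than stated in this diff.

from vllm.outputs import RequestOutputFactory
from vllm.sampling_params import RequestOutputKind

def collect_step_outputs(seq_groups, request_outputs):
    for seq_group in seq_groups:
        params = seq_group.sampling_params
        # DELTA requests with no finished sequences have nothing new to report.
        if (params is not None
                and params.output_kind == RequestOutputKind.DELTA
                and not seq_group.is_finished()):
            continue
        # The factory is assumed to return None when there is nothing to emit
        # (e.g. FINAL_ONLY before the group finishes), hence the truthiness check.
        request_output = RequestOutputFactory.create(seq_group)
        if request_output:
            request_outputs.append(request_output)
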
@@ -1435,7 +1057,8 @@ class LLMEngine:
|
||||
# LLMEngine/AsyncLLMEngine directly
|
||||
if is_async:
|
||||
# Log stats.
|
||||
self.do_log_stats(scheduler_outputs, outputs, finished_before)
|
||||
self.do_log_stats(scheduler_outputs, outputs, finished_before,
|
||||
skip)
|
||||
|
||||
# Tracing
|
||||
self.do_tracing(scheduler_outputs)
|
||||
@@ -1742,18 +1365,20 @@ class LLMEngine:
|
||||
def do_log_stats(self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs] = None,
|
||||
model_output: Optional[List[SamplerOutput]] = None,
|
||||
finished_before: Optional[List[int]] = None) -> None:
|
||||
finished_before: Optional[List[int]] = None,
|
||||
skip: Optional[List[int]] = None) -> None:
|
||||
"""Forced log when no requests active."""
|
||||
if self.log_stats:
|
||||
stats = self._get_stats(scheduler_outputs, model_output,
|
||||
finished_before)
|
||||
finished_before, skip)
|
||||
for logger in self.stat_loggers.values():
|
||||
logger.log(stats)
|
||||
|
||||
def _get_stats(self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs],
|
||||
model_output: Optional[List[SamplerOutput]] = None,
|
||||
finished_before: Optional[List[int]] = None) -> Stats:
|
||||
finished_before: Optional[List[int]] = None,
|
||||
skip: Optional[List[int]] = None) -> Stats:
|
||||
"""Get Stats to be Logged to Prometheus.
|
||||
|
||||
Args:
|
||||
@@ -1761,6 +1386,10 @@ class LLMEngine:
|
||||
the scheduled batch,
|
||||
model_output: Optional, used to emit speculative decoding metrics
|
||||
which are created by the workers.
|
||||
finished_before: Optional, indices of sequences that were finished
|
||||
before. These sequences will be ignored.
|
||||
skip: Optional, indices of sequences that were preempted. These
|
||||
sequences will be ignored.
|
||||
"""
|
||||
now = time.time()
|
||||
|
||||
@@ -1835,6 +1464,11 @@ class LLMEngine:
|
||||
actual_num_batched_tokens -= 1
|
||||
continue
|
||||
|
||||
# Currently, skip == preempted sequences, so we need to skip
|
||||
# their log stats
|
||||
if skip and idx in skip:
|
||||
continue
|
||||
|
||||
group_was_prefill = idx < scheduler_outputs.num_prefill_groups
|
||||
seq_group = scheduled_seq_group.seq_group
|
||||
|
||||
@@ -1964,10 +1598,20 @@ class LLMEngine:
        self.model_executor.check_health()

    def start_profile(self) -> None:
        self.model_executor.start_profile()
        # using type instead of isinstance to check to avoid capturing
        # inherited classes (MultiprocessingGPUExecutor)
        if type(self.model_executor) == GPUExecutor:
            self.model_executor.start_profile()
        else:
            self.model_executor._run_workers("start_profile")

    def stop_profile(self) -> None:
        self.model_executor.stop_profile()
        # using type instead of isinstance to check to avoid capturing
        # inherited classes (MultiprocessingGPUExecutor)
        if type(self.model_executor) == GPUExecutor:
            self.model_executor.stop_profile()
        else:
            self.model_executor._run_workers("stop_profile")

    def is_tracing_enabled(self) -> bool:
        return self.tracer is not None
@@ -2041,7 +1685,7 @@ class LLMEngine:
                metrics.model_execute_time)

    def is_encoder_decoder_model(self):
        return self.model_config.is_encoder_decoder_model
        return self.input_preprocessor.is_encoder_decoder_model()

    def is_embedding_model(self):
        return self.model_config.is_embedding_model

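A short aside on why the profile hooks compare type() rather than using isinstance(): subclasses such as MultiprocessingGPUExecutor run their workers in separate processes and must broadcast the call instead of profiling only the driver. The toy classes below are simplified stand-ins for the executors named in the diff, not the real implementations.

class GPUExecutor:  # stand-in for vllm.executor.gpu_executor.GPUExecutor
    def start_profile(self):
        print("profiling the in-process worker")

class MultiprocessingGPUExecutor(GPUExecutor):  # stand-in subclass
    def _run_workers(self, method: str):
        print(f"broadcasting {method!r} to all worker processes")

def start_profile(model_executor) -> None:
    # type() deliberately excludes subclasses: an isinstance() check would send
    # the multiprocessing executor down the single-process branch.
    if type(model_executor) == GPUExecutor:
        model_executor.start_profile()
    else:
        model_executor._run_workers("start_profile")

start_profile(GPUExecutor())                 # profiling the in-process worker
start_profile(MultiprocessingGPUExecutor())  # broadcasting 'start_profile' ...
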
@@ -19,7 +19,7 @@ from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
|
||||
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer)
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
@@ -642,14 +642,12 @@ class LLM:
            raise ValueError("The lengths of prompts and lora_request "
                             "must be the same.")

        if isinstance(params, list):
            params = [
                self._add_guided_processor(param, guided_options)
                if isinstance(param, SamplingParams) else param
                for param in params
            ]
        elif isinstance(params, SamplingParams):
            params = self._add_guided_processor(params, guided_options)
        for sp in params if isinstance(params, list) else (params, ):
            if isinstance(sp, SamplingParams):
                self._add_guided_processor(sp, guided_options)

                # We only care about the final output
                sp.output_kind = RequestOutputKind.FINAL_ONLY

        # Add requests to the engine.
        for i, request_inputs in enumerate(inputs):
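This loop replaces the earlier step_return_finished_only toggle (removed a few hunks below): offline LLM.generate() now simply marks every SamplingParams as FINAL_ONLY so intermediate outputs are never materialized. The helper below is a minimal standalone restatement of that normalization, not code from the diff.

from vllm.sampling_params import RequestOutputKind, SamplingParams

def mark_final_only(params):
    # Accept a single SamplingParams or a list of them, as LLM.generate() does.
    for sp in params if isinstance(params, list) else (params, ):
        if isinstance(sp, SamplingParams):
            # Offline generation only needs each request's finished output.
            sp.output_kind = RequestOutputKind.FINAL_ONLY

sampling = SamplingParams(max_tokens=16)
mark_final_only(sampling)
assert sampling.output_kind == RequestOutputKind.FINAL_ONLY
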
@@ -709,9 +707,6 @@ class LLM:
|
||||
f"output: {0:.2f} toks/s"),
|
||||
)
|
||||
|
||||
# In the loop below, only finished outputs are used
|
||||
self.llm_engine.step_return_finished_only = True
|
||||
|
||||
# Run the engine.
|
||||
outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
|
||||
total_in_toks = 0
|
||||
@@ -724,6 +719,7 @@ class LLM:
|
||||
if use_tqdm:
|
||||
if isinstance(output, RequestOutput):
|
||||
# Calculate tokens only for RequestOutput
|
||||
assert output.prompt_token_ids is not None
|
||||
total_in_toks += len(output.prompt_token_ids)
|
||||
in_spd = total_in_toks / pbar.format_dict["elapsed"]
|
||||
total_out_toks += sum(
|
||||
@@ -735,9 +731,6 @@ class LLM:
|
||||
f"output: {out_spd:.2f} toks/s")
|
||||
pbar.update(1)
|
||||
|
||||
# Restore original behavior
|
||||
self.llm_engine.step_return_finished_only = False
|
||||
|
||||
if use_tqdm:
|
||||
pbar.close()
|
||||
# Sort the outputs by request ID.
|
||||
|
||||
@@ -12,7 +12,8 @@ from typing_extensions import Annotated, Required, TypedDict
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.logits_processors import get_logits_processors
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import LogitsProcessor, SamplingParams
from vllm.sampling_params import (LogitsProcessor, RequestOutputKind,
                                  SamplingParams)
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
@@ -316,6 +317,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
            length_penalty=self.length_penalty,
            logits_processors=logits_processors,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
        )

    @model_validator(mode="before")
@@ -559,6 +562,8 @@ class CompletionRequest(OpenAIBaseModel):
            length_penalty=self.length_penalty,
            logits_processors=logits_processors,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
        )

    @model_validator(mode="before")

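So each OpenAI-facing request now picks its output kind from its own stream flag when building SamplingParams. A reduced sketch of that mapping; the function is a stand-in for the request models' to_sampling_params(), not the actual pydantic code.

from vllm.sampling_params import RequestOutputKind, SamplingParams

def to_sampling_params_sketch(stream: bool, max_tokens: int) -> SamplingParams:
    # Streaming clients want incremental deltas; non-streaming clients only
    # need the final, fully assembled output.
    output_kind = (RequestOutputKind.DELTA
                   if stream else RequestOutputKind.FINAL_ONLY)
    return SamplingParams(max_tokens=max_tokens, output_kind=output_kind)

print(to_sampling_params_sketch(stream=True, max_tokens=32).output_kind)
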
@@ -195,7 +195,6 @@ async def main(args):
|
||||
engine = AsyncLLMEngine.from_engine_args(
|
||||
engine_args, usage_context=UsageContext.OPENAI_BATCH_RUNNER)
|
||||
|
||||
# When using single vLLM without engine_use_ray
|
||||
model_config = await engine.get_model_config()
|
||||
|
||||
if args.disable_log_requests:
|
||||
|
||||
@@ -246,8 +246,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
|
||||
if request.add_generation_prompt:
|
||||
return self.response_role
|
||||
else:
|
||||
return request.messages[-1]["role"]
|
||||
return request.messages[-1]["role"]
|
||||
|
||||
async def chat_completion_stream_generator(
|
||||
self,
|
||||
@@ -264,15 +263,37 @@ class OpenAIServingChat(OpenAIServing):
|
||||
|
||||
# Send response for each token for each request.n (index)
|
||||
num_choices = 1 if request.n is None else request.n
|
||||
previous_texts = [""] * num_choices
|
||||
previous_num_tokens = [0] * num_choices
|
||||
finish_reason_sent = [False] * num_choices
|
||||
|
||||
num_prompt_tokens = 0
|
||||
|
||||
tool_parser: Optional[ToolParser] = self.tool_parser(
|
||||
tokenizer) if self.tool_parser else None
|
||||
|
||||
if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
|
||||
tool_choice_function_name = request.tool_choice.function.name
|
||||
else:
|
||||
tool_choice_function_name = None
|
||||
|
||||
# Determine whether tools are in use with "auto" tool choice
|
||||
tool_choice_auto = (
|
||||
not tool_choice_function_name
|
||||
and self._should_stream_with_auto_tool_parsing(request))
|
||||
|
||||
all_previous_token_ids: Optional[List[List[int]]]
|
||||
if tool_choice_auto:
|
||||
# These are only required in "auto" tool choice case
|
||||
previous_texts = [""] * num_choices
|
||||
all_previous_token_ids = [[]] * num_choices
|
||||
else:
|
||||
previous_texts, all_previous_token_ids = None, None
|
||||
|
||||
try:
|
||||
async for res in result_generator:
|
||||
if res.prompt_token_ids is not None:
|
||||
num_prompt_tokens = len(res.prompt_token_ids)
|
||||
|
||||
# We need to do it here, because if there are exceptions in
|
||||
# the result_generator, it needs to be sent as the FIRST
|
||||
# response (by the try...catch).
|
||||
@@ -305,10 +326,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
and request.stream_options.include_usage):
|
||||
# if continuous usage stats are requested, add it
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
usage = UsageInfo(prompt_tokens=prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=prompt_tokens)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=num_prompt_tokens)
|
||||
chunk.usage = usage
|
||||
# otherwise don't
|
||||
else:
|
||||
@@ -344,12 +365,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
request.stream_options.include_usage):
|
||||
if (request.stream_options.
|
||||
continuous_usage_stats):
|
||||
prompt_tokens = len(
|
||||
res.prompt_token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=prompt_tokens)
|
||||
total_tokens=num_prompt_tokens)
|
||||
chunk.usage = usage
|
||||
else:
|
||||
chunk.usage = None
|
||||
@@ -360,65 +379,66 @@ class OpenAIServingChat(OpenAIServing):
|
||||
first_iteration = False
|
||||
|
||||
for output in res.outputs:
|
||||
|
||||
i = output.index
|
||||
|
||||
if finish_reason_sent[i]:
|
||||
continue
|
||||
|
||||
delta_token_ids = output.token_ids[previous_num_tokens[i]:]
|
||||
out_logprobs = output.logprobs[
|
||||
previous_num_tokens[i]:] if output.logprobs else None
|
||||
|
||||
if request.logprobs and request.top_logprobs is not None:
|
||||
assert out_logprobs is not None, (
|
||||
assert output.logprobs is not None, (
|
||||
"Did not output logprobs")
|
||||
logprobs = self._create_chat_logprobs(
|
||||
token_ids=delta_token_ids,
|
||||
top_logprobs=out_logprobs,
|
||||
token_ids=output.token_ids,
|
||||
top_logprobs=output.logprobs,
|
||||
tokenizer=tokenizer,
|
||||
num_output_top_logprobs=request.top_logprobs,
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
|
||||
delta_text = output.text[len(previous_texts[i]):]
|
||||
delta_message: Optional[DeltaMessage] = None
|
||||
delta_text = output.text
|
||||
delta_message: Optional[DeltaMessage]
|
||||
|
||||
# handle streaming deltas for tools with named tool_choice
|
||||
if (request.tool_choice and type(request.tool_choice) is
|
||||
ChatCompletionNamedToolChoiceParam):
|
||||
if tool_choice_function_name:
|
||||
delta_message = DeltaMessage(tool_calls=[
|
||||
DeltaToolCall(function=DeltaFunctionCall(
|
||||
name=request.tool_choice.function.name,
|
||||
name=tool_choice_function_name,
|
||||
arguments=delta_text),
|
||||
index=i)
|
||||
])
|
||||
|
||||
# handle streaming deltas for tools with "auto" tool choice
|
||||
elif (self._should_stream_with_auto_tool_parsing(request)
|
||||
and tool_parser):
|
||||
elif tool_choice_auto:
|
||||
assert previous_texts is not None
|
||||
assert all_previous_token_ids is not None
|
||||
assert tool_parser is not None
|
||||
#TODO optimize manipulation of these lists
|
||||
previous_text = previous_texts[i]
|
||||
previous_token_ids = all_previous_token_ids[i]
|
||||
current_text = previous_text + delta_text
|
||||
current_token_ids = previous_token_ids + list(
|
||||
output.token_ids)
|
||||
|
||||
delta_message = (
|
||||
tool_parser.extract_tool_calls_streaming(
|
||||
previous_text=previous_texts[i],
|
||||
current_text=output.text,
|
||||
previous_text=previous_text,
|
||||
current_text=current_text,
|
||||
delta_text=delta_text,
|
||||
previous_token_ids= \
|
||||
output.token_ids[
|
||||
:-1 * len(delta_token_ids)
|
||||
],
|
||||
current_token_ids=output.token_ids,
|
||||
delta_token_ids=delta_token_ids
|
||||
)
|
||||
)
|
||||
previous_token_ids=previous_token_ids,
|
||||
current_token_ids=current_token_ids,
|
||||
delta_token_ids=output.token_ids))
|
||||
|
||||
                    # update the previous values for the next iteration
                    previous_texts[i] = current_text
                    all_previous_token_ids[i] = current_token_ids

                # handle streaming just a content delta
                else:
                    delta_message = DeltaMessage(content=delta_text)

                # set the previous values for the next iteration
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                previous_num_tokens[i] += len(output.token_ids)

                # if the message delta is None (e.g. because it was a
                # "control token" for tool calls or the parser otherwise
|
||||
# handle usage stats if requested & if continuous
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats):
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
completion_tokens = len(output.token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens +
|
||||
total_tokens=num_prompt_tokens +
|
||||
completion_tokens,
|
||||
)
|
||||
chunk.usage = usage
|
||||
@@ -482,7 +501,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
tool_parser.prev_tool_call_arr[index].get(
|
||||
"arguments", {}))
|
||||
|
||||
# get what we've streamed so for for arguments
|
||||
# get what we've streamed so far for arguments
|
||||
# for the current tool
|
||||
actual_call = tool_parser.streamed_args_for_tool[
|
||||
index]
|
||||
@@ -500,7 +519,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
])
|
||||
|
||||
# Send the finish response for each request.n only once
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
choice_data = ChatCompletionResponseStreamChoice(
|
||||
index=i,
|
||||
delta=delta_message,
|
||||
@@ -518,13 +536,12 @@ class OpenAIServingChat(OpenAIServing):
|
||||
model=model_name)
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats):
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
completion_tokens = len(output.token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens +
|
||||
total_tokens=num_prompt_tokens +
|
||||
completion_tokens,
|
||||
)
|
||||
chunk.usage = usage
|
||||
@@ -538,10 +555,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
# is sent, send the usage
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
completion_tokens = previous_num_tokens[i]
|
||||
final_usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=previous_num_tokens[i],
|
||||
total_tokens=prompt_tokens + previous_num_tokens[i],
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=num_prompt_tokens + completion_tokens,
|
||||
)
|
||||
|
||||
final_usage_chunk = ChatCompletionStreamResponse(
|
||||
@@ -607,7 +625,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
|
||||
# if auto tools are not enabled, and a named tool choice using
|
||||
# outlines is not being used
|
||||
if not (self.enable_auto_tools
|
||||
if (not self.enable_auto_tools
|
||||
or not self.tool_parser) and not isinstance(
|
||||
request.tool_choice,
|
||||
ChatCompletionNamedToolChoiceParam):
|
||||
@@ -680,6 +698,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
or "")
|
||||
choice.message.content = full_message
|
||||
|
||||
assert final_res.prompt_token_ids is not None
|
||||
num_prompt_tokens = len(final_res.prompt_token_ids)
|
||||
num_generated_tokens = sum(
|
||||
len(output.token_ids) for output in final_res.outputs)
|
||||
@@ -789,9 +808,9 @@ class OpenAIServingChat(OpenAIServing):
|
||||
return bool(
|
||||
# if there is a delta message that includes tool calls which
|
||||
# include a function that has arguments
|
||||
self.enable_auto_tools and self.tool_parser and delta_message
|
||||
output.finish_reason is not None
|
||||
and self.enable_auto_tools and self.tool_parser and delta_message
|
||||
and delta_message.tool_calls and delta_message.tool_calls[0]
|
||||
and delta_message.tool_calls[0].function
|
||||
and delta_message.tool_calls[0].function.arguments is not None
|
||||
and output.finish_reason is not None
|
||||
)
|
||||
|
||||
@@ -223,9 +223,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
tokenizer: AnyTokenizer,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
num_choices = 1 if request.n is None else request.n
|
||||
previous_texts = [""] * num_choices * num_prompts
|
||||
previous_text_lens = [0] * num_choices * num_prompts
|
||||
previous_num_tokens = [0] * num_choices * num_prompts
|
||||
has_echoed = [False] * num_choices * num_prompts
|
||||
num_prompt_tokens = [0] * num_prompts
|
||||
|
||||
try:
|
||||
async for prompt_idx, res in result_generator:
|
||||
@@ -233,6 +234,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
prompt_logprobs = res.prompt_logprobs
|
||||
prompt_text = res.prompt
|
||||
|
||||
# Prompt details are excluded from later streamed outputs
|
||||
if res.prompt_token_ids is not None:
|
||||
num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
|
||||
|
||||
delta_token_ids: GenericSequence[int]
|
||||
out_logprobs: Optional[GenericSequence[Optional[Dict[
|
||||
int, Logprob]]]]
|
||||
@@ -244,6 +249,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
assert request.max_tokens is not None
|
||||
if request.echo and request.max_tokens == 0:
|
||||
assert prompt_token_ids is not None
|
||||
assert prompt_text is not None
|
||||
# only return the prompt
|
||||
delta_text = prompt_text
|
||||
@@ -252,6 +258,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
has_echoed[i] = True
|
||||
elif (request.echo and request.max_tokens > 0
|
||||
and not has_echoed[i]):
|
||||
assert prompt_token_ids is not None
|
||||
assert prompt_text is not None
|
||||
assert prompt_logprobs is not None
|
||||
# echo the prompt and first token
|
||||
@@ -266,11 +273,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
has_echoed[i] = True
|
||||
else:
|
||||
# return just the delta
|
||||
delta_text = output.text[len(previous_texts[i]):]
|
||||
delta_token_ids = output.token_ids[
|
||||
previous_num_tokens[i]:]
|
||||
out_logprobs = output.logprobs[previous_num_tokens[
|
||||
i]:] if output.logprobs else None
|
||||
delta_text = output.text
|
||||
delta_token_ids = output.token_ids
|
||||
out_logprobs = output.logprobs
|
||||
|
||||
if request.logprobs is not None:
|
||||
assert out_logprobs is not None, (
|
||||
@@ -280,13 +285,13 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
top_logprobs=out_logprobs,
|
||||
num_output_top_logprobs=request.logprobs,
|
||||
tokenizer=tokenizer,
|
||||
initial_text_offset=len(previous_texts[i]),
|
||||
initial_text_offset=previous_text_lens[i],
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
|
||||
previous_texts[i] = output.text
|
||||
previous_num_tokens[i] = len(output.token_ids)
|
||||
previous_text_lens[i] += len(output.text)
|
||||
previous_num_tokens[i] += len(output.token_ids)
|
||||
finish_reason = output.finish_reason
|
||||
stop_reason = output.stop_reason
|
||||
|
||||
@@ -307,8 +312,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats
|
||||
or output.finish_reason is not None):
|
||||
prompt_tokens = len(prompt_token_ids)
|
||||
completion_tokens = len(output.token_ids)
|
||||
prompt_tokens = num_prompt_tokens[prompt_idx]
|
||||
completion_tokens = previous_num_tokens[i]
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
@@ -356,6 +361,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
for final_res in final_res_batch:
|
||||
prompt_token_ids = final_res.prompt_token_ids
|
||||
assert prompt_token_ids is not None
|
||||
prompt_logprobs = final_res.prompt_logprobs
|
||||
prompt_text = final_res.prompt
|
||||
|
||||
@@ -411,9 +417,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
)
|
||||
choices.append(choice_data)
|
||||
|
||||
num_generated_tokens += len(output.token_ids)
|
||||
|
||||
num_prompt_tokens += len(prompt_token_ids)
|
||||
num_generated_tokens += sum(
|
||||
len(output.token_ids) for output in final_res.outputs)
|
||||
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
|
||||
@@ -33,7 +33,6 @@ class Hermes2ProToolParser(ToolParser):
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: List[Dict] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent = False
        self.streamed_args_for_tool: List[str] = [
        ] # map what has been streamed for each tool so far to a list


@@ -58,7 +58,6 @@ if TYPE_CHECKING:
|
||||
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
|
||||
VLLM_TEST_FORCE_FP8_MARLIN: bool = False
|
||||
VLLM_RPC_GET_DATA_TIMEOUT_MS: int = 5000
|
||||
VLLM_ALLOW_ENGINE_USE_RAY: bool = False
|
||||
VLLM_PLUGINS: Optional[List[str]] = None
|
||||
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
|
||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||
@@ -391,14 +390,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_RPC_GET_DATA_TIMEOUT_MS":
|
||||
lambda: int(os.getenv("VLLM_RPC_GET_DATA_TIMEOUT_MS", "5000")),
|
||||
|
||||
# If set, allow running the engine as a separate ray actor,
|
||||
# which is a deprecated feature soon to be removed.
|
||||
# See https://github.com/vllm-project/vllm/issues/7045
|
||||
"VLLM_ALLOW_ENGINE_USE_RAY":
|
||||
lambda:
|
||||
(os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
|
||||
("1", "true")),
|
||||
|
||||
# a list of plugin names to load, separated by commas.
|
||||
# if this is not set, it means all plugins will be loaded
|
||||
# if this is set to an empty string, no plugins will be loaded
|
||||
|
||||
@@ -5,7 +5,8 @@ from typing_extensions import TypeIs
from vllm.utils import is_list_of

from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
                   LLMInputs, PromptInputs)
                   LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt,
                   TokensPrompt)


class ParsedText(TypedDict):
@@ -60,8 +61,38 @@ def parse_and_batch_prompt(
            for elem in prompt
        ]

    raise ValueError("prompt must be a string, array of strings, "
                     "array of tokens, or array of token arrays")
    raise TypeError("prompt must be a string, array of strings, "
                    "array of tokens, or array of token arrays")


class ParsedStrPrompt(TypedDict):
    type: Literal["str"]
    content: str


class ParsedTextPrompt(TypedDict):
    type: Literal["text"]
    content: TextPrompt


class ParsedTokensPrompt(TypedDict):
    type: Literal["tokens"]
    content: TokensPrompt


def parse_singleton_prompt(
    inputs: SingletonPromptInputs,
) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
    if isinstance(inputs, str):
        return ParsedStrPrompt(type="str", content=inputs)
    elif isinstance(inputs, dict):
        if "prompt_token_ids" in inputs:
            return ParsedTokensPrompt(type="tokens",
                                      content=inputs)  # type: ignore
        elif "prompt" in inputs:
            return ParsedTextPrompt(type="text", content=inputs)

    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")


def is_explicit_encoder_decoder_prompt(
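For reference, a quick usage sketch of the new parse_singleton_prompt helper added above; since the Parsed* types are TypedDicts, the returned values are plain dicts, and the printed output comments are illustrative.

from vllm.inputs.parse import parse_singleton_prompt

print(parse_singleton_prompt("Hello"))
# {'type': 'str', 'content': 'Hello'}

print(parse_singleton_prompt({"prompt": "Hello"}))
# {'type': 'text', 'content': {'prompt': 'Hello'}}

print(parse_singleton_prompt({"prompt_token_ids": [1, 2, 3]}))
# {'type': 'tokens', 'content': {'prompt_token_ids': [1, 2, 3]}}
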
vllm/inputs/preprocess.py (new file, +536 lines)
@@ -0,0 +1,536 @@
|
||||
import asyncio
|
||||
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||
|
||||
from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
PromptComponents = Tuple[Optional[str], List[int],
|
||||
Optional["MultiModalDataDict"]]
|
||||
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
|
||||
Optional["MultiModalDataDict"]]
|
||||
|
||||
|
||||
class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[BaseTokenizerGroup],
    ) -> None:
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer

    def get_tokenizer_group(self) -> BaseTokenizerGroup:
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

        if not self.is_encoder_decoder_model():
            logger.warning("Using None for decoder start token id because "
                           "this is not an encoder/decoder model.")
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
            logger.warning("Using None for decoder start token id because "
                           "model config is not available.")
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
            logger.warning("Falling back on <BOS> for decoder start token id "
                           "because decoder start token id is not available.")
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
        other models may have different or more
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
        decoder_input_ids: Optional[List[int]],
    ) -> List[int]:
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

        Based on

        https://github.com/huggingface/transformers/blob/
        4037a2b5b1278736e566aec12e169100275545ea/
        src/transformers/generation/utils.py

        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
        prompt_token_ids: List[int],
        prompt_adapter_request: Optional[PromptAdapterRequest],
    ) -> List[int]:
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

    def _tokenize_prompt(
        self,
        prompt: str,
        request_id: str,
        lora_request: Optional[LoRARequest],
    ) -> List[int]:
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()

        return tokenizer.encode(request_id=request_id,
                                prompt=prompt,
                                lora_request=lora_request)

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        request_id: str,
        lora_request: Optional[LoRARequest],
    ) -> List[int]:
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()

        return await tokenizer.encode_async(request_id=request_id,
                                            prompt=prompt,
                                            lora_request=lora_request)

    def _extract_prompt_components(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
    ) -> PromptComponents:
        '''
        Extract the components of any single encoder or decoder input prompt.

        Arguments:

        * request_id
        * inputs: single encoder or decoder input prompt
        * lora_request: this is only valid for decoder prompts

        Returns:

        * prompt
        * prompt_token_ids
        * multi_modal_data
        '''

        parsed = parse_singleton_prompt(inputs)

        if parsed["type"] == "str":
            prompt = parsed["content"]
            prompt_token_ids = self._tokenize_prompt(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = None
        elif parsed["type"] == "tokens":
            prompt = None
            prompt_token_ids = parsed["content"]["prompt_token_ids"]
            multi_modal_data = parsed["content"].get("multi_modal_data")
        elif parsed["type"] == "text":
            prompt = parsed["content"]["prompt"]
            prompt_token_ids = self._tokenize_prompt(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = parsed["content"].get("multi_modal_data")
        else:
            assert_never(parsed)

        return prompt, prompt_token_ids, multi_modal_data

    async def _extract_prompt_components_async(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
    ) -> PromptComponents:
        """Async version of :meth:`_extract_prompt_components`."""
        parsed = parse_singleton_prompt(inputs)

        if parsed["type"] == "str":
            prompt = parsed["content"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = None
        elif parsed["type"] == "tokens":
            prompt = None
            prompt_token_ids = parsed["content"]["prompt_token_ids"]
            multi_modal_data = parsed["content"].get("multi_modal_data")
        elif parsed["type"] == "text":
            prompt = parsed["content"]["prompt"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = parsed["content"].get("multi_modal_data")
        else:
            assert_never(parsed)

        return prompt, prompt_token_ids, multi_modal_data

    def _build_enc_dec_llm_inputs(
        self,
        encoder_comps: PromptComponents,
        decoder_comps: DecoderPromptComponents,
    ) -> EncoderDecoderLLMInputs:
        encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
        decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps

        if encoder_mm_data is not None or decoder_mm_data is not None:
            raise ValueError("Multi-modal encoder-decoder models are "
                             "not supported yet")

        decoder_prompt_ids = (
            self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))

        return EncoderDecoderLLMInputs(
            prompt_token_ids=decoder_prompt_ids,
            prompt=decoder_prompt,
            encoder_prompt_token_ids=encoder_prompt_ids,
            encoder_prompt=encoder_prompt,
        )

    def _process_encoder_decoder_prompt(
        self,
        inputs: PromptInputs,
        request_id: str,
    ) -> EncoderDecoderLLMInputs:
        '''
        For encoder/decoder models only:
        Process an input prompt into an
        :class:`EncoderDecoderLLMInputs` instance.

        There are two types of input prompts:
        singleton prompts which carry only the
        encoder prompt, and explicit encoder/decoder
        prompts which carry both the encoder and the
        decoder prompts as member variables.

        This function handles the following scenarios:
        * Singleton encoder prompt: extract encoder prompt
          token ids & infer default decoder prompt token ids
        * Explicit encoder/decoder prompt: extract encoder
          and decoder prompt token ids

        Note that for explicit encoder/decoder prompts,
        each sub-prompt (encoder or decoder prompt) can
        have any possible singleton type; thus this
        method relies on helper functions to obtain
        token ids for the sub-prompts.

        Arguments:

        * inputs: an input prompt
        * request_id

        Returns:

        * :class:`EncoderDecoderLLMInputs` instance
        '''

        encoder_comps: PromptComponents
        decoder_comps: DecoderPromptComponents

        if is_explicit_encoder_decoder_prompt(inputs):
            encoder_comps = self._extract_prompt_components(
                inputs["encoder_prompt"],
                request_id=request_id,
            )

            if (decoder_input := inputs["decoder_prompt"]) is None:
                decoder_comps = None, None, None
            else:
                decoder_comps = self._extract_prompt_components(
                    decoder_input,
                    request_id=request_id,
                )
        else:
            encoder_comps = self._extract_prompt_components(
                inputs,
                request_id=request_id,
            )

            decoder_comps = None, None, None

        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)

    async def _process_encoder_decoder_prompt_async(
        self,
        inputs: PromptInputs,
        request_id: str,
    ) -> EncoderDecoderLLMInputs:
        """Async version of :meth:`_process_encoder_decoder_prompt`."""
        encoder_comps: PromptComponents
        decoder_comps: DecoderPromptComponents

        if is_explicit_encoder_decoder_prompt(inputs):
            encoder_task = self._extract_prompt_components_async(
                inputs["encoder_prompt"],
                request_id=request_id,
            )

            if (decoder_input := inputs["decoder_prompt"]) is None:
                encoder_comps = await encoder_task
                decoder_comps = None, None, None
            else:
                decoder_task = self._extract_prompt_components_async(
                    decoder_input,
                    request_id=request_id,
                )

                encoder_comps, decoder_comps = await asyncio.gather(
                    encoder_task, decoder_task)
        else:
            encoder_comps = await self._extract_prompt_components_async(
                inputs,
                request_id=request_id,
            )

            decoder_comps = None, None, None

        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)

    def _build_decoder_only_llm_inputs(
        self,
        prompt_comps: PromptComponents,
        prompt_adapter_request: Optional[PromptAdapterRequest],
    ) -> LLMInputs:
        prompt, prompt_token_ids, multi_modal_data = prompt_comps

        prompt_token_ids = self._apply_prompt_adapter(
            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)

        return LLMInputs(prompt_token_ids=prompt_token_ids,
                         prompt=prompt,
                         multi_modal_data=multi_modal_data)

    def _process_decoder_only_prompt(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> LLMInputs:
        '''
        For decoder-only models:
        Process an input prompt into an :class:`LLMInputs` instance.

        Arguments:

        * inputs: input prompt
        * request_id
        * lora_request
        * prompt_adapter_request

        Returns:

        * :class:`LLMInputs` instance
        '''

        prompt_comps = self._extract_prompt_components(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> LLMInputs:
        """Async version of :meth:`_process_decoder_only_prompt`."""
        prompt_comps = await self._extract_prompt_components_async(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
        inputs: PromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
        """Preprocess the input prompt."""
        if self.is_encoder_decoder_model():
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return self._process_encoder_decoder_prompt(
                inputs,
                request_id=request_id,
            )

        if is_explicit_encoder_decoder_prompt(inputs):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def preprocess_async(
        self,
        inputs: PromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
        """Async version of :meth:`preprocess`."""
        if self.is_encoder_decoder_model():
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return await self._process_encoder_decoder_prompt_async(
                inputs,
                request_id=request_id,
            )

        if is_explicit_encoder_decoder_prompt(inputs):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

    def is_encoder_decoder_model(self):
        return self.model_config.is_encoder_decoder_model
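
The decoder-prompt handling in the new file boils down to one rule: if no decoder prompt is given, default it to <BOS>, and always ensure the sequence starts with the decoder start token. A standalone sketch of that rule; prepare_decoder_ids is a hypothetical name used here for illustration, not the vLLM API:

from typing import List, Optional

# Hypothetical standalone restatement of
# InputPreprocessor._prepare_decoder_input_ids_for_generation (shown above).
def prepare_decoder_ids(decoder_ids: Optional[List[int]],
                        decoder_start_id: int,
                        bos_id: int) -> List[int]:
    if decoder_ids is None:
        decoder_ids = [bos_id]  # default decoder prompt is <BOS>
    if not decoder_ids or decoder_ids[0] != decoder_start_id:
        decoder_ids = [decoder_start_id] + decoder_ids
    return decoder_ids

assert prepare_decoder_ids(None, decoder_start_id=2, bos_id=0) == [2, 0]
assert prepare_decoder_ids([2, 5, 7], decoder_start_id=2, bos_id=0) == [2, 5, 7]
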
@@ -410,6 +410,7 @@ def fused_topk(

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights, topk_ids


@@ -443,7 +444,8 @@ def grouped_topk(hidden_states: torch.Tensor,

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids

    return topk_weights, topk_ids.to(torch.int32)


def get_config_dtype_str(dtype: torch.dtype,
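
The renormalize branch above rescales each token's selected expert weights so they sum to 1. A minimal sketch of the same operation on a dummy tensor (illustrative only):

import torch

topk_weights = torch.tensor([[0.5, 0.3],
                             [0.2, 0.2]])
# Divide each row by its sum so the routing weights form a distribution.
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
print(topk_weights)  # tensor([[0.6250, 0.3750], [0.5000, 0.5000]]); each row sums to 1
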
@@ -990,7 +990,7 @@ def get_rope(
            base, is_neox_style, dtype, short_factor, long_factor,
            **extra_kwargs)
    elif scaling_type == "mrope":
        return MRotaryEmbedding(
        rotary_emb = MRotaryEmbedding(
            head_size,
            rotary_dim,
            max_position,
@@ -90,12 +90,12 @@ _MULTIMODAL_MODELS = {
    "PaliGemmaForConditionalGeneration": ("paligemma",
                                          "PaliGemmaForConditionalGeneration"),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "PixtralForConditionalGeneration": ("pixtral",
                                        "PixtralForConditionalGeneration"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                        "Qwen2VLForConditionalGeneration"),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
}
_CONDITIONAL_GENERATION_MODELS = {
    "BartModel": ("bart", "BartForConditionalGeneration"),
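
Each registry entry above maps an architecture name to a (module_name, class_name) pair used to locate the implementation; the change adds PixtralForConditionalGeneration and Qwen2VLForConditionalGeneration while keeping the entries in alphabetical order. A minimal sketch of how such an entry could be resolved; the package prefix below is an assumption for illustration, not the actual loader code:

_MULTIMODAL_MODELS = {
    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                        "Qwen2VLForConditionalGeneration"),
}

module_name, class_name = _MULTIMODAL_MODELS["Qwen2VLForConditionalGeneration"]
# Assumed import location; the real lookup lives inside vLLM's model registry.
print(f"vllm.model_executor.models.{module_name}.{class_name}")
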
Some files were not shown because too many files have changed in this diff.