Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
37e3806132 | ||
|
|
c0efdd655b | ||
|
|
aaaec52ad9 | ||
|
|
e1eb45d397 | ||
|
|
89fca671fb | ||
|
|
d20b0c139c | ||
|
|
166a168b0f | ||
|
|
2bb0e1a799 | ||
|
|
6eaf1e5c52 | ||
|
|
868a8c5b2c | ||
|
|
b4ad56c1bd | ||
|
|
69698f257e | ||
|
|
cd0cd85102 | ||
|
|
0a74bfce9c | ||
|
|
dd3b865854 | ||
|
|
9b87a579aa | ||
|
|
b539222d4e |
@@ -200,6 +200,7 @@ steps:
|
|||||||
- pytest -v -s v1/core
|
- pytest -v -s v1/core
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
- pytest -v -s v1/engine
|
- pytest -v -s v1/engine
|
||||||
|
- pytest -v -s v1/entrypoints
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/structured_output
|
- pytest -v -s v1/structured_output
|
||||||
@@ -226,10 +227,13 @@ steps:
|
|||||||
- python3 offline_inference/basic/chat.py
|
- python3 offline_inference/basic/chat.py
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/prefix_caching.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
- python3 offline_inference/vision_language.py
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference/encoder_decoder.py
|
||||||
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 offline_inference/basic/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 offline_inference/basic/score.py
|
||||||
|
|||||||
11
README.md
11
README.md
@@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
|
|
||||||
|
|
||||||
Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
|
|
||||||
|
|
||||||
👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
|
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
|
||||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
|
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
|
||||||
union tmpcvt {
|
[[maybe_unused]] union tmpcvt {
|
||||||
uint16_t u;
|
uint16_t u;
|
||||||
_Float16 f;
|
_Float16 f;
|
||||||
__hip_bfloat16 b;
|
__hip_bfloat16 b;
|
||||||
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
|
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
|
||||||
const _B16x4& inp2) {
|
const _B16x4& inp2) {
|
||||||
union tmpcvt {
|
[[maybe_unused]] union tmpcvt {
|
||||||
uint16_t u;
|
uint16_t u;
|
||||||
_Float16 f;
|
_Float16 f;
|
||||||
__hip_bfloat16 b;
|
__hip_bfloat16 b;
|
||||||
@@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
|||||||
const int seq_idx = blockIdx.y;
|
const int seq_idx = blockIdx.y;
|
||||||
const int context_len = context_lens[seq_idx];
|
const int context_len = context_lens[seq_idx];
|
||||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
const int warpid = threadIdx.x / WARP_SIZE;
|
const int warpid = threadIdx.x / WARP_SIZE;
|
||||||
const int laneid = threadIdx.x % WARP_SIZE;
|
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
|
||||||
|
|
||||||
__shared__ float shared_global_exp_sum;
|
__shared__ float shared_global_exp_sum;
|
||||||
// max num partitions supported is warp_size * NPAR_LOOPS
|
// max num partitions supported is warp_size * NPAR_LOOPS
|
||||||
|
|||||||
@@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
|||||||
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
||||||
* ✅︎
|
* ✅︎
|
||||||
* ✅︎
|
* ✅︎
|
||||||
* ⚠️
|
*
|
||||||
- * `GLM4VForCausalLM`<sup>^</sup>
|
- * `GLM4VForCausalLM`<sup>^</sup>
|
||||||
* GLM-4V
|
* GLM-4V
|
||||||
* T + I
|
* T + I
|
||||||
@@ -948,8 +948,11 @@ V1 currently uses a simplified attention pattern:
|
|||||||
- Uses causal attention for all tokens, including image tokens
|
- Uses causal attention for all tokens, including image tokens
|
||||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
|
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
|
||||||
- Will be updated in the future to support the correct behavior
|
- Will be updated in the future to support the correct behavior
|
||||||
|
- Does not support `"do_pan_and_scan": True`
|
||||||
|
|
||||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||||
|
|
||||||
|
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
:::{note}
|
:::{note}
|
||||||
|
|||||||
@@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
|
|||||||
on HuggingFace model repository.
|
on HuggingFace model repository.
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
from dataclasses import asdict
|
||||||
|
from typing import NamedTuple, Optional
|
||||||
|
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@@ -23,21 +25,31 @@ question_per_audio_count = {
|
|||||||
2: "What sport and what nursery rhyme are referenced?"
|
2: "What sport and what nursery rhyme are referenced?"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ModelRequestData(NamedTuple):
|
||||||
|
engine_args: EngineArgs
|
||||||
|
prompt: str
|
||||||
|
stop_token_ids: Optional[list[int]] = None
|
||||||
|
lora_requests: Optional[list[LoRARequest]] = None
|
||||||
|
|
||||||
|
|
||||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||||
# lower-end GPUs.
|
# lower-end GPUs.
|
||||||
# Unless specified, these settings have been tested to work on a single L4.
|
# Unless specified, these settings have been tested to work on a single L4.
|
||||||
|
|
||||||
|
|
||||||
# MiniCPM-O
|
# MiniCPM-O
|
||||||
def run_minicpmo(question: str, audio_count: int):
|
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
|
||||||
model_name = "openbmb/MiniCPM-o-2_6"
|
model_name = "openbmb/MiniCPM-o-2_6"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||||
trust_remote_code=True)
|
trust_remote_code=True)
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
trust_remote_code=True,
|
model=model_name,
|
||||||
max_model_len=4096,
|
trust_remote_code=True,
|
||||||
max_num_seqs=5,
|
max_model_len=4096,
|
||||||
limit_mm_per_prompt={"audio": audio_count})
|
max_num_seqs=5,
|
||||||
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
|
)
|
||||||
|
|
||||||
stop_tokens = ['<|im_end|>', '<|endoftext|>']
|
stop_tokens = ['<|im_end|>', '<|endoftext|>']
|
||||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||||
@@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
chat_template=audio_chat_template)
|
chat_template=audio_chat_template)
|
||||||
return llm, prompt, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompt=prompt,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Phi-4-multimodal-instruct
|
# Phi-4-multimodal-instruct
|
||||||
def run_phi4mm(questions: str, audio_count: int):
|
def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
|
||||||
"""
|
"""
|
||||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||||
show how to process audio inputs.
|
show how to process audio inputs.
|
||||||
@@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int):
|
|||||||
speech_lora_path = os.path.join(model_path, "speech-lora")
|
speech_lora_path = os.path.join(model_path, "speech-lora")
|
||||||
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
|
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
|
||||||
|
|
||||||
prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
|
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_path,
|
model=model_path,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int):
|
|||||||
lora_extra_vocab_size=0,
|
lora_extra_vocab_size=0,
|
||||||
limit_mm_per_prompt={"audio": audio_count},
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
)
|
)
|
||||||
lora_request = LoRARequest("speech", 1, speech_lora_path)
|
|
||||||
# To maintain code compatibility in this script, we add LoRA here.
|
|
||||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
|
||||||
# You can also add LoRA using:
|
|
||||||
# llm.generate(prompts, lora_request=lora_request,...)
|
|
||||||
|
|
||||||
stop_token_ids = None
|
return ModelRequestData(
|
||||||
return llm, prompts, stop_token_ids
|
engine_args=engine_args,
|
||||||
|
prompt=prompts,
|
||||||
|
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Qwen2-Audio
|
# Qwen2-Audio
|
||||||
def run_qwen2_audio(question: str, audio_count: int):
|
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
|
||||||
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model=model_name,
|
||||||
max_num_seqs=5,
|
max_model_len=4096,
|
||||||
limit_mm_per_prompt={"audio": audio_count})
|
max_num_seqs=5,
|
||||||
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
|
)
|
||||||
|
|
||||||
audio_in_prompt = "".join([
|
audio_in_prompt = "".join([
|
||||||
f"Audio {idx+1}: "
|
f"Audio {idx+1}: "
|
||||||
@@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int):
|
|||||||
"<|im_start|>user\n"
|
"<|im_start|>user\n"
|
||||||
f"{audio_in_prompt}{question}<|im_end|>\n"
|
f"{audio_in_prompt}{question}<|im_end|>\n"
|
||||||
"<|im_start|>assistant\n")
|
"<|im_start|>assistant\n")
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompt, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompt=prompt,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Ultravox 0.5-1B
|
# Ultravox 0.5-1B
|
||||||
def run_ultravox(question: str, audio_count: int):
|
def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
|
||||||
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
@@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int):
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model=model_name,
|
||||||
max_num_seqs=5,
|
max_model_len=4096,
|
||||||
trust_remote_code=True,
|
max_num_seqs=5,
|
||||||
limit_mm_per_prompt={"audio": audio_count})
|
trust_remote_code=True,
|
||||||
stop_token_ids = None
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
return llm, prompt, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompt=prompt,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Whisper
|
# Whisper
|
||||||
def run_whisper(question: str, audio_count: int):
|
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||||
assert audio_count == 1, (
|
assert audio_count == 1, (
|
||||||
"Whisper only support single audio input per prompt")
|
"Whisper only support single audio input per prompt")
|
||||||
model_name = "openai/whisper-large-v3-turbo"
|
model_name = "openai/whisper-large-v3-turbo"
|
||||||
|
|
||||||
prompt = "<|startoftranscript|>"
|
prompt = "<|startoftranscript|>"
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=448,
|
model=model_name,
|
||||||
max_num_seqs=5,
|
max_model_len=448,
|
||||||
limit_mm_per_prompt={"audio": audio_count})
|
max_num_seqs=5,
|
||||||
stop_token_ids = None
|
limit_mm_per_prompt={"audio": audio_count},
|
||||||
return llm, prompt, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompt=prompt,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
@@ -164,14 +194,24 @@ def main(args):
|
|||||||
raise ValueError(f"Model type {model} is not supported.")
|
raise ValueError(f"Model type {model} is not supported.")
|
||||||
|
|
||||||
audio_count = args.num_audios
|
audio_count = args.num_audios
|
||||||
llm, prompt, stop_token_ids = model_example_map[model](
|
req_data = model_example_map[model](question_per_audio_count[audio_count],
|
||||||
question_per_audio_count[audio_count], audio_count)
|
audio_count)
|
||||||
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
|
# To maintain code compatibility in this script, we add LoRA here.
|
||||||
|
# You can also add LoRA using:
|
||||||
|
# llm.generate(prompts, lora_request=lora_request,...)
|
||||||
|
if req_data.lora_requests:
|
||||||
|
for lora_request in req_data.lora_requests:
|
||||||
|
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||||
|
|
||||||
# We set temperature to 0.2 so that outputs can be different
|
# We set temperature to 0.2 so that outputs can be different
|
||||||
# even when all prompts are identical when running batch inference.
|
# even when all prompts are identical when running batch inference.
|
||||||
sampling_params = SamplingParams(temperature=0.2,
|
sampling_params = SamplingParams(temperature=0.2,
|
||||||
max_tokens=64,
|
max_tokens=64,
|
||||||
stop_token_ids=stop_token_ids)
|
stop_token_ids=req_data.stop_token_ids)
|
||||||
|
|
||||||
mm_data = {}
|
mm_data = {}
|
||||||
if audio_count > 0:
|
if audio_count > 0:
|
||||||
@@ -183,7 +223,7 @@ def main(args):
|
|||||||
}
|
}
|
||||||
|
|
||||||
assert args.num_prompts > 0
|
assert args.num_prompts > 0
|
||||||
inputs = {"prompt": prompt, "multi_modal_data": mm_data}
|
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
|
||||||
if args.num_prompts > 1:
|
if args.num_prompts > 1:
|
||||||
# Batch inference
|
# Batch inference
|
||||||
inputs = [inputs] * args.num_prompts
|
inputs = [inputs] * args.num_prompts
|
||||||
@@ -214,6 +254,10 @@ if __name__ == "__main__":
|
|||||||
default=1,
|
default=1,
|
||||||
choices=[0, 1, 2],
|
choices=[0, 1, 2],
|
||||||
help="Number of audio items per prompt.")
|
help="Number of audio items per prompt.")
|
||||||
|
parser.add_argument("--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Set the seed when initializing `vllm.LLM`.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
|
|||||||
the explicit/implicit prompt format on enc-dec LMMs for text generation.
|
the explicit/implicit prompt format on enc-dec LMMs for text generation.
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Sequence
|
||||||
|
from dataclasses import asdict
|
||||||
|
from typing import NamedTuple
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, EngineArgs, PromptType, SamplingParams
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class ModelRequestData(NamedTuple):
|
||||||
|
engine_args: EngineArgs
|
||||||
|
prompts: Sequence[PromptType]
|
||||||
|
|
||||||
|
|
||||||
def run_florence2():
|
def run_florence2():
|
||||||
# Create a Florence-2 encoder/decoder model instance
|
engine_args = EngineArgs(
|
||||||
llm = LLM(
|
|
||||||
model="microsoft/Florence-2-large",
|
model="microsoft/Florence-2-large",
|
||||||
tokenizer="facebook/bart-large",
|
tokenizer="facebook/bart-large",
|
||||||
max_num_seqs=8,
|
max_num_seqs=8,
|
||||||
@@ -39,12 +46,15 @@ def run_florence2():
|
|||||||
"decoder_prompt": "",
|
"decoder_prompt": "",
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
return llm, prompts
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_mllama():
|
def run_mllama():
|
||||||
# Create a Mllama encoder/decoder model instance
|
engine_args = EngineArgs(
|
||||||
llm = LLM(
|
|
||||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -69,12 +79,15 @@ def run_mllama():
|
|||||||
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
|
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
return llm, prompts
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_whisper():
|
def run_whisper():
|
||||||
# Create a Whisper encoder/decoder model instance
|
engine_args = EngineArgs(
|
||||||
llm = LLM(
|
|
||||||
model="openai/whisper-large-v3-turbo",
|
model="openai/whisper-large-v3-turbo",
|
||||||
max_model_len=448,
|
max_model_len=448,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
@@ -99,7 +112,11 @@ def run_whisper():
|
|||||||
"decoder_prompt": "<|startoftranscript|>",
|
"decoder_prompt": "<|startoftranscript|>",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
return llm, prompts
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
@@ -114,7 +131,12 @@ def main(args):
|
|||||||
if model not in model_example_map:
|
if model not in model_example_map:
|
||||||
raise ValueError(f"Model type {model} is not supported.")
|
raise ValueError(f"Model type {model} is not supported.")
|
||||||
|
|
||||||
llm, prompts = model_example_map[model]()
|
req_data = model_example_map[model]()
|
||||||
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
|
prompts = req_data.prompts
|
||||||
|
|
||||||
# Create a sampling params object.
|
# Create a sampling params object.
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
@@ -153,6 +175,10 @@ if __name__ == "__main__":
|
|||||||
default="mllama",
|
default="mllama",
|
||||||
choices=model_example_map.keys(),
|
choices=model_example_map.keys(),
|
||||||
help='Huggingface "model_type".')
|
help='Huggingface "model_type".')
|
||||||
|
parser.add_argument("--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Set the seed when initializing `vllm.LLM`.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -8,122 +8,164 @@ on HuggingFace model repository.
|
|||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
from dataclasses import asdict
|
||||||
|
from typing import NamedTuple, Optional
|
||||||
|
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class ModelRequestData(NamedTuple):
|
||||||
|
engine_args: EngineArgs
|
||||||
|
prompts: list[str]
|
||||||
|
stop_token_ids: Optional[list[int]] = None
|
||||||
|
lora_requests: Optional[list[LoRARequest]] = None
|
||||||
|
|
||||||
|
|
||||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||||
# lower-end GPUs.
|
# lower-end GPUs.
|
||||||
# Unless specified, these settings have been tested to work on a single L4.
|
# Unless specified, these settings have been tested to work on a single L4.
|
||||||
|
|
||||||
|
|
||||||
# Aria
|
# Aria
|
||||||
def run_aria(questions: list[str], modality: str):
|
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
model_name = "rhymes-ai/Aria"
|
model_name = "rhymes-ai/Aria"
|
||||||
|
|
||||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model=model_name,
|
||||||
max_num_seqs=2,
|
max_model_len=4096,
|
||||||
dtype="bfloat16",
|
max_num_seqs=2,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
dtype="bfloat16",
|
||||||
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
|
)
|
||||||
|
|
||||||
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
|
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
|
||||||
"<|im_end|>\n<|im_start|>assistant\n")
|
"<|im_end|>\n<|im_start|>assistant\n")
|
||||||
for question in questions]
|
for question in questions]
|
||||||
|
|
||||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# BLIP-2
|
# BLIP-2
|
||||||
def run_blip2(questions: list[str], modality: str):
|
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
|
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
|
||||||
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
||||||
prompts = [f"Question: {question} Answer:" for question in questions]
|
prompts = [f"Question: {question} Answer:" for question in questions]
|
||||||
llm = LLM(model="Salesforce/blip2-opt-2.7b",
|
engine_args = EngineArgs(
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
model="Salesforce/blip2-opt-2.7b",
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Chameleon
|
# Chameleon
|
||||||
def run_chameleon(questions: list[str], modality: str):
|
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
prompts = [f"{question}<image>" for question in questions]
|
prompts = [f"{question}<image>" for question in questions]
|
||||||
llm = LLM(model="facebook/chameleon-7b",
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model="facebook/chameleon-7b",
|
||||||
max_num_seqs=2,
|
max_model_len=4096,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_num_seqs=2,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Deepseek-VL2
|
# Deepseek-VL2
|
||||||
def run_deepseek_vl2(questions: list[str], modality: str):
|
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model=model_name,
|
||||||
max_num_seqs=2,
|
max_model_len=4096,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
max_num_seqs=2,
|
||||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
|
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||||
|
)
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
|
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
|
||||||
for question in questions
|
for question in questions
|
||||||
]
|
]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Florence2
|
# Florence2
|
||||||
def run_florence2(question: str, modality: str):
|
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
llm = LLM(model="microsoft/Florence-2-large",
|
engine_args = EngineArgs(
|
||||||
tokenizer="facebook/bart-large",
|
model="microsoft/Florence-2-large",
|
||||||
max_num_seqs=8,
|
tokenizer="facebook/bart-large",
|
||||||
trust_remote_code=True,
|
max_num_seqs=8,
|
||||||
dtype="bfloat16",
|
trust_remote_code=True,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
dtype="bfloat16",
|
||||||
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
|
)
|
||||||
|
|
||||||
prompt = "<MORE_DETAILED_CAPTION>"
|
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompt, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Fuyu
|
# Fuyu
|
||||||
def run_fuyu(questions: list[str], modality: str):
|
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
prompts = [f"{question}\n" for question in questions]
|
prompts = [f"{question}\n" for question in questions]
|
||||||
llm = LLM(model="adept/fuyu-8b",
|
engine_args = EngineArgs(
|
||||||
max_model_len=2048,
|
model="adept/fuyu-8b",
|
||||||
max_num_seqs=2,
|
max_model_len=2048,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_num_seqs=2,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Gemma 3
|
# Gemma 3
|
||||||
def run_gemma3(questions: list[str], modality: str):
|
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
model_name = "google/gemma-3-4b-it"
|
model_name = "google/gemma-3-4b-it"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str):
|
|||||||
prompts = [("<bos><start_of_turn>user\n"
|
prompts = [("<bos><start_of_turn>user\n"
|
||||||
f"<start_of_image>{question}<end_of_turn>\n"
|
f"<start_of_image>{question}<end_of_turn>\n"
|
||||||
"<start_of_turn>model\n") for question in questions]
|
"<start_of_turn>model\n") for question in questions]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# GLM-4v
|
# GLM-4v
|
||||||
def run_glm4v(questions: list[str], modality: str):
|
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
model_name = "THUDM/glm-4v-9b"
|
model_name = "THUDM/glm-4v-9b"
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=2048,
|
model=model_name,
|
||||||
max_num_seqs=2,
|
max_model_len=2048,
|
||||||
trust_remote_code=True,
|
max_num_seqs=2,
|
||||||
enforce_eager=True,
|
trust_remote_code=True,
|
||||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
enforce_eager=True,
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
||||||
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
|
)
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
|
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
|
||||||
@@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str):
|
|||||||
]
|
]
|
||||||
|
|
||||||
stop_token_ids = [151329, 151336, 151338]
|
stop_token_ids = [151329, 151336, 151338]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# H2OVL-Mississippi
|
# H2OVL-Mississippi
|
||||||
def run_h2ovl(questions: list[str], modality: str):
|
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "h2oai/h2ovl-mississippi-800m"
|
model_name = "h2oai/h2ovl-mississippi-800m"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
@@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str):
|
|||||||
# Stop tokens for H2OVL-Mississippi
|
# Stop tokens for H2OVL-Mississippi
|
||||||
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
|
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
|
||||||
stop_token_ids = [tokenizer.eos_token_id]
|
stop_token_ids = [tokenizer.eos_token_id]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Idefics3-8B-Llama3
|
# Idefics3-8B-Llama3
|
||||||
def run_idefics3(questions: list[str], modality: str):
|
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str):
|
|||||||
prompts = [(
|
prompts = [(
|
||||||
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
|
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
|
||||||
) for question in questions]
|
) for question in questions]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# InternVL
|
# InternVL
|
||||||
def run_internvl(questions: list[str], modality: str):
|
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "OpenGVLab/InternVL2-2B"
|
model_name = "OpenGVLab/InternVL2-2B"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str):
|
|||||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# LLaVA-1.5
|
# LLaVA-1.5
|
||||||
def run_llava(questions: list[str], modality: str):
|
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
f"USER: <image>\n{question}\nASSISTANT:" for question in questions
|
f"USER: <image>\n{question}\nASSISTANT:" for question in questions
|
||||||
]
|
]
|
||||||
|
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf",
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model="llava-hf/llava-1.5-7b-hf",
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_model_len=4096,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# LLaVA-1.6/LLaVA-NeXT
|
# LLaVA-1.6/LLaVA-NeXT
|
||||||
def run_llava_next(questions: list[str], modality: str):
|
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
|
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
|
||||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
|
engine_args = EngineArgs(
|
||||||
max_model_len=8192,
|
model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_model_len=8192,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# LlaVA-NeXT-Video
|
# LlaVA-NeXT-Video
|
||||||
# Currently only support for video input
|
# Currently only support for video input
|
||||||
def run_llava_next_video(questions: list[str], modality: str):
|
def run_llava_next_video(questions: list[str],
|
||||||
|
modality: str) -> ModelRequestData:
|
||||||
assert modality == "video"
|
assert modality == "video"
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
f"USER: <video>\n{question} ASSISTANT:" for question in questions
|
f"USER: <video>\n{question} ASSISTANT:" for question in questions
|
||||||
]
|
]
|
||||||
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
engine_args = EngineArgs(
|
||||||
max_model_len=8192,
|
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_model_len=8192,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# LLaVA-OneVision
|
# LLaVA-OneVision
|
||||||
def run_llava_onevision(questions: list[str], modality: str):
|
def run_llava_onevision(questions: list[str],
|
||||||
|
modality: str) -> ModelRequestData:
|
||||||
|
|
||||||
if modality == "video":
|
if modality == "video":
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -305,15 +387,20 @@ def run_llava_onevision(questions: list[str], modality: str):
|
|||||||
<|im_start|>assistant\n" for question in questions
|
<|im_start|>assistant\n" for question in questions
|
||||||
]
|
]
|
||||||
|
|
||||||
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
engine_args = EngineArgs(
|
||||||
max_model_len=16384,
|
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
max_model_len=16384,
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
return llm, prompts, stop_token_ids
|
)
|
||||||
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Mantis
|
# Mantis
|
||||||
def run_mantis(questions: list[str], modality: str):
|
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
|
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
|
||||||
@@ -322,14 +409,19 @@ def run_mantis(questions: list[str], modality: str):
|
|||||||
for question in questions
|
for question in questions
|
||||||
]
|
]
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="TIGER-Lab/Mantis-8B-siglip-llama3",
|
model="TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
|
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
)
|
)
|
||||||
stop_token_ids = [128009]
|
stop_token_ids = [128009]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# MiniCPM-V
|
# MiniCPM-V
|
||||||
@@ -357,7 +449,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
|||||||
# model_name = "openbmb/MiniCPM-o-2_6"
|
# model_name = "openbmb/MiniCPM-o-2_6"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||||
trust_remote_code=True)
|
trust_remote_code=True)
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -389,19 +481,24 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True) for question in questions
|
add_generation_prompt=True) for question in questions
|
||||||
]
|
]
|
||||||
return llm, prompts, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_minicpmo(questions: list[str], modality: str):
|
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
|
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
|
||||||
|
|
||||||
|
|
||||||
def run_minicpmv(questions: list[str], modality: str):
|
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
|
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
|
||||||
|
|
||||||
|
|
||||||
# LLama 3.2
|
# LLama 3.2
|
||||||
def run_mllama(questions: list[str], modality: str):
|
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||||
@@ -411,7 +508,7 @@ def run_mllama(questions: list[str], modality: str):
|
|||||||
# You may lower either to run this example on lower-end GPUs.
|
# You may lower either to run this example on lower-end GPUs.
|
||||||
|
|
||||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
@@ -432,17 +529,20 @@ def run_mllama(questions: list[str], modality: str):
|
|||||||
prompts = tokenizer.apply_chat_template(messages,
|
prompts = tokenizer.apply_chat_template(messages,
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
tokenize=False)
|
tokenize=False)
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Molmo
|
# Molmo
|
||||||
def run_molmo(questions: list[str], modality: str):
|
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "allenai/Molmo-7B-D-0924"
|
model_name = "allenai/Molmo-7B-D-0924"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
@@ -453,18 +553,21 @@ def run_molmo(questions: list[str], modality: str):
|
|||||||
f"<|im_start|>user <image>\n{question}<|im_end|> \
|
f"<|im_start|>user <image>\n{question}<|im_end|> \
|
||||||
<|im_start|>assistant\n" for question in questions
|
<|im_start|>assistant\n" for question in questions
|
||||||
]
|
]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# NVLM-D
|
# NVLM-D
|
||||||
def run_nvlm_d(questions: list[str], modality: str):
|
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "nvidia/NVLM-D-72B"
|
model_name = "nvidia/NVLM-D-72B"
|
||||||
|
|
||||||
# Adjust this as necessary to fit in GPU
|
# Adjust this as necessary to fit in GPU
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -481,36 +584,47 @@ def run_nvlm_d(questions: list[str], modality: str):
|
|||||||
prompts = tokenizer.apply_chat_template(messages,
|
prompts = tokenizer.apply_chat_template(messages,
|
||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# PaliGemma
|
# PaliGemma
|
||||||
def run_paligemma(question: str, modality: str):
|
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
# PaliGemma has special prompt format for VQA
|
# PaliGemma has special prompt format for VQA
|
||||||
prompt = ["caption en"]
|
prompts = ["caption en" for _ in questions]
|
||||||
llm = LLM(model="google/paligemma-3b-mix-224",
|
engine_args = EngineArgs(
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
model="google/paligemma-3b-mix-224",
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||||
return llm, prompt, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# PaliGemma 2
|
# PaliGemma 2
|
||||||
def run_paligemma2(question: str, modality: str):
|
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
# PaliGemma 2 has special prompt format for VQA
|
# PaliGemma 2 has special prompt format for VQA
|
||||||
prompt = ["caption en"]
|
prompts = ["caption en" for _ in questions]
|
||||||
llm = LLM(model="google/paligemma2-3b-ft-docci-448",
|
engine_args = EngineArgs(
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
model="google/paligemma2-3b-ft-docci-448",
|
||||||
stop_token_ids = None
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||||
return llm, prompt, stop_token_ids
|
|
||||||
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Phi-3-Vision
|
# Phi-3-Vision
|
||||||
def run_phi3v(questions: list[str], modality: str):
|
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
@@ -530,7 +644,7 @@ def run_phi3v(questions: list[str], modality: str):
|
|||||||
#
|
#
|
||||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -539,12 +653,15 @@ def run_phi3v(questions: list[str], modality: str):
|
|||||||
mm_processor_kwargs={"num_crops": 16},
|
mm_processor_kwargs={"num_crops": 16},
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||||
)
|
)
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Phi-4-multimodal-instruct
|
# Phi-4-multimodal-instruct
|
||||||
def run_phi4mm(questions: list[str], modality: str):
|
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
"""
|
"""
|
||||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||||
show how to process image inputs.
|
show how to process image inputs.
|
||||||
@@ -558,7 +675,7 @@ def run_phi4mm(questions: list[str], modality: str):
|
|||||||
f"<|user|><|image_1|>{question}<|end|><|assistant|>"
|
f"<|user|><|image_1|>{question}<|end|><|assistant|>"
|
||||||
for question in questions
|
for question in questions
|
||||||
]
|
]
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_path,
|
model=model_path,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -567,24 +684,22 @@ def run_phi4mm(questions: list[str], modality: str):
|
|||||||
max_lora_rank=320,
|
max_lora_rank=320,
|
||||||
lora_extra_vocab_size=0,
|
lora_extra_vocab_size=0,
|
||||||
)
|
)
|
||||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
|
||||||
# To maintain code compatibility in this script, we add LoRA here.
|
|
||||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
|
||||||
# You can also add LoRA using:
|
|
||||||
# llm.generate(prompts, lora_request=lora_request,...)
|
|
||||||
|
|
||||||
stop_token_ids = None
|
return ModelRequestData(
|
||||||
return llm, prompts, stop_token_ids
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Pixtral HF-format
|
# Pixtral HF-format
|
||||||
def run_pixtral_hf(questions: list[str], modality: str):
|
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
model_name = "mistral-community/pixtral-12b"
|
model_name = "mistral-community/pixtral-12b"
|
||||||
|
|
||||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -592,15 +707,18 @@ def run_pixtral_hf(questions: list[str], modality: str):
|
|||||||
)
|
)
|
||||||
|
|
||||||
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Qwen
|
# Qwen
|
||||||
def run_qwen_vl(questions: list[str], modality: str):
|
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="Qwen/Qwen-VL",
|
model="Qwen/Qwen-VL",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -610,16 +728,19 @@ def run_qwen_vl(questions: list[str], modality: str):
|
|||||||
)
|
)
|
||||||
|
|
||||||
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
|
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Qwen2-VL
|
# Qwen2-VL
|
||||||
def run_qwen2_vl(questions: list[str], modality: str):
|
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
|
|
||||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=5,
|
max_num_seqs=5,
|
||||||
@@ -642,16 +763,19 @@ def run_qwen2_vl(questions: list[str], modality: str):
|
|||||||
f"{question}<|im_end|>\n"
|
f"{question}<|im_end|>\n"
|
||||||
"<|im_start|>assistant\n") for question in questions
|
"<|im_start|>assistant\n") for question in questions
|
||||||
]
|
]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Qwen2.5-VL
|
# Qwen2.5-VL
|
||||||
def run_qwen2_5_vl(questions: list[str], modality: str):
|
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
|
|
||||||
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=5,
|
max_num_seqs=5,
|
||||||
@@ -674,8 +798,11 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
|
|||||||
f"{question}<|im_end|>\n"
|
f"{question}<|im_end|>\n"
|
||||||
"<|im_start|>assistant\n") for question in questions
|
"<|im_start|>assistant\n") for question in questions
|
||||||
]
|
]
|
||||||
stop_token_ids = None
|
|
||||||
return llm, prompts, stop_token_ids
|
return ModelRequestData(
|
||||||
|
engine_args=engine_args,
|
||||||
|
prompts=prompts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
@@ -789,18 +916,28 @@ def main(args):
|
|||||||
data = mm_input["data"]
|
data = mm_input["data"]
|
||||||
questions = mm_input["questions"]
|
questions = mm_input["questions"]
|
||||||
|
|
||||||
llm, prompts, stop_token_ids = model_example_map[model](questions,
|
req_data = model_example_map[model](questions, modality)
|
||||||
modality)
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
|
# To maintain code compatibility in this script, we add LoRA here.
|
||||||
|
# You can also add LoRA using:
|
||||||
|
# llm.generate(prompts, lora_request=lora_request,...)
|
||||||
|
if req_data.lora_requests:
|
||||||
|
for lora_request in req_data.lora_requests:
|
||||||
|
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||||
|
|
||||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||||
prompts = prompts if args.use_different_prompt_per_request else [
|
prompts = req_data.prompts if args.use_different_prompt_per_request else [
|
||||||
prompts[0]
|
req_data.prompts[0]
|
||||||
]
|
]
|
||||||
|
|
||||||
# We set temperature to 0.2 so that outputs can be different
|
# We set temperature to 0.2 so that outputs can be different
|
||||||
# even when all prompts are identical when running batch inference.
|
# even when all prompts are identical when running batch inference.
|
||||||
sampling_params = SamplingParams(temperature=0.2,
|
sampling_params = SamplingParams(temperature=0.2,
|
||||||
max_tokens=64,
|
max_tokens=64,
|
||||||
stop_token_ids=stop_token_ids)
|
stop_token_ids=req_data.stop_token_ids)
|
||||||
|
|
||||||
assert args.num_prompts > 0
|
assert args.num_prompts > 0
|
||||||
if args.num_prompts == 1:
|
if args.num_prompts == 1:
|
||||||
@@ -865,6 +1002,10 @@ if __name__ == "__main__":
|
|||||||
type=int,
|
type=int,
|
||||||
default=16,
|
default=16,
|
||||||
help='Number of frames to extract from the video.')
|
help='Number of frames to extract from the video.')
|
||||||
|
parser.add_argument("--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Set the seed when initializing `vllm.LLM`.")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--image-repeat-prob',
|
'--image-repeat-prob',
|
||||||
|
|||||||
@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
|
|||||||
on HuggingFace model repository.
|
on HuggingFace model repository.
|
||||||
"""
|
"""
|
||||||
from argparse import Namespace
|
from argparse import Namespace
|
||||||
|
from dataclasses import asdict
|
||||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||||
|
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM, EngineArgs
|
||||||
from vllm.multimodal.utils import fetch_image
|
from vllm.multimodal.utils import fetch_image
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
|||||||
|
|
||||||
|
|
||||||
class ModelRequestData(NamedTuple):
|
class ModelRequestData(NamedTuple):
|
||||||
llm: LLM
|
engine_args: EngineArgs
|
||||||
prompt: str
|
prompt: str
|
||||||
image: Optional[Image]
|
image: Optional[Image]
|
||||||
|
|
||||||
|
|
||||||
def run_e5_v(query: Query):
|
def run_e5_v(query: Query) -> ModelRequestData:
|
||||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||||
|
|
||||||
if query["modality"] == "text":
|
if query["modality"] == "text":
|
||||||
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
|
|||||||
modality = query['modality']
|
modality = query['modality']
|
||||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="royokong/e5-v",
|
model="royokong/e5-v",
|
||||||
task="embed",
|
task="embed",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
)
|
)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
image=image,
|
image=image,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_vlm2vec(query: Query):
|
def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||||
if query["modality"] == "text":
|
if query["modality"] == "text":
|
||||||
text = query["text"]
|
text = query["text"]
|
||||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||||
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
|
|||||||
modality = query['modality']
|
modality = query['modality']
|
||||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="TIGER-Lab/VLM2Vec-Full",
|
model="TIGER-Lab/VLM2Vec-Full",
|
||||||
task="embed",
|
task="embed",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
image=image,
|
image=image,
|
||||||
)
|
)
|
||||||
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
|
|||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
def run_encode(model: str, modality: QueryModality):
|
def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
|
||||||
query = get_query(modality)
|
query = get_query(modality)
|
||||||
req_data = model_example_map[model](query)
|
req_data = model_example_map[model](query)
|
||||||
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
mm_data = {}
|
mm_data = {}
|
||||||
if req_data.image is not None:
|
if req_data.image is not None:
|
||||||
mm_data["image"] = req_data.image
|
mm_data["image"] = req_data.image
|
||||||
|
|
||||||
outputs = req_data.llm.embed({
|
outputs = llm.embed({
|
||||||
"prompt": req_data.prompt,
|
"prompt": req_data.prompt,
|
||||||
"multi_modal_data": mm_data,
|
"multi_modal_data": mm_data,
|
||||||
})
|
})
|
||||||
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
|
|||||||
|
|
||||||
|
|
||||||
def main(args: Namespace):
|
def main(args: Namespace):
|
||||||
run_encode(args.model_name, args.modality)
|
run_encode(args.model_name, args.modality, args.seed)
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
@@ -167,5 +171,10 @@ if __name__ == "__main__":
|
|||||||
default="image",
|
default="image",
|
||||||
choices=get_args(QueryModality),
|
choices=get_args(QueryModality),
|
||||||
help='Modality of the input.')
|
help='Modality of the input.')
|
||||||
|
parser.add_argument("--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Set the seed when initializing `vllm.LLM`.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -6,13 +6,14 @@ using the chat template defined by the model.
|
|||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
from argparse import Namespace
|
from argparse import Namespace
|
||||||
|
from dataclasses import asdict
|
||||||
from typing import NamedTuple, Optional
|
from typing import NamedTuple, Optional
|
||||||
|
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from transformers import AutoProcessor, AutoTokenizer
|
from transformers import AutoProcessor, AutoTokenizer
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.multimodal.utils import fetch_image
|
from vllm.multimodal.utils import fetch_image
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
@@ -25,11 +26,12 @@ IMAGE_URLS = [
|
|||||||
|
|
||||||
|
|
||||||
class ModelRequestData(NamedTuple):
|
class ModelRequestData(NamedTuple):
|
||||||
llm: LLM
|
engine_args: EngineArgs
|
||||||
prompt: str
|
prompt: str
|
||||||
stop_token_ids: Optional[list[int]]
|
|
||||||
image_data: list[Image]
|
image_data: list[Image]
|
||||||
chat_template: Optional[str]
|
stop_token_ids: Optional[list[int]] = None
|
||||||
|
chat_template: Optional[str] = None
|
||||||
|
lora_requests: Optional[list[LoRARequest]] = None
|
||||||
|
|
||||||
|
|
||||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||||
@@ -37,53 +39,55 @@ class ModelRequestData(NamedTuple):
|
|||||||
# Unless specified, these settings have been tested to work on a single L4.
|
# Unless specified, these settings have been tested to work on a single L4.
|
||||||
|
|
||||||
|
|
||||||
def load_aria(question, image_urls: list[str]) -> ModelRequestData:
|
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "rhymes-ai/Aria"
|
model_name = "rhymes-ai/Aria"
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
tokenizer_mode="slow",
|
model=model_name,
|
||||||
trust_remote_code=True,
|
tokenizer_mode="slow",
|
||||||
dtype="bfloat16",
|
trust_remote_code=True,
|
||||||
limit_mm_per_prompt={"image": len(image_urls)})
|
dtype="bfloat16",
|
||||||
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
)
|
||||||
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
|
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
|
||||||
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
|
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
|
||||||
"<|im_start|>assistant\n")
|
"<|im_start|>assistant\n")
|
||||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
stop_token_ids=stop_token_ids,
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_deepseek_vl2(question: str, image_urls: list[str]):
|
def load_deepseek_vl2(question: str,
|
||||||
|
image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||||
|
|
||||||
llm = LLM(model=model_name,
|
engine_args = EngineArgs(
|
||||||
max_model_len=4096,
|
model=model_name,
|
||||||
max_num_seqs=2,
|
max_model_len=4096,
|
||||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
max_num_seqs=2,
|
||||||
limit_mm_per_prompt={"image": len(image_urls)})
|
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||||
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
|
)
|
||||||
|
|
||||||
placeholder = "".join(f"image_{i}:<image>\n"
|
placeholder = "".join(f"image_{i}:<image>\n"
|
||||||
for i, _ in enumerate(image_urls, start=1))
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
|
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=None,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
|
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "google/gemma-3-4b-it"
|
model_name = "google/gemma-3-4b-it"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -112,18 +116,16 @@ def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=None,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "h2oai/h2ovl-mississippi-800m"
|
model_name = "h2oai/h2ovl-mississippi-800m"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
@@ -146,19 +148,18 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
stop_token_ids = [tokenizer.eos_token_id]
|
stop_token_ids = [tokenizer.eos_token_id]
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
stop_token_ids=stop_token_ids,
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
|
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||||
|
|
||||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
@@ -177,18 +178,16 @@ def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
for i, _ in enumerate(image_urls, start=1))
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=None,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "OpenGVLab/InternVL2-2B"
|
model_name = "OpenGVLab/InternVL2-2B"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -214,19 +213,18 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
stop_token_ids=stop_token_ids,
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
|
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||||
|
|
||||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=16,
|
max_num_seqs=16,
|
||||||
@@ -236,19 +234,17 @@ def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
placeholders = "<|image|>" * len(image_urls)
|
placeholders = "<|image|>" * len(image_urls)
|
||||||
prompt = f"{placeholders}<|begin_of_text|>{question}"
|
prompt = f"{placeholders}<|begin_of_text|>{question}"
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=None,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_nvlm_d(question: str, image_urls: list[str]):
|
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "nvidia/NVLM-D-72B"
|
model_name = "nvidia/NVLM-D-72B"
|
||||||
|
|
||||||
# Adjust this as necessary to fit in GPU
|
# Adjust this as necessary to fit in GPU
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
@@ -266,14 +262,11 @@ def load_nvlm_d(question: str, image_urls: list[str]):
|
|||||||
prompt = tokenizer.apply_chat_template(messages,
|
prompt = tokenizer.apply_chat_template(messages,
|
||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -281,7 +274,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
model_name = "mistral-community/pixtral-12b"
|
model_name = "mistral-community/pixtral-12b"
|
||||||
|
|
||||||
# Adjust this as necessary to fit in GPU
|
# Adjust this as necessary to fit in GPU
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
@@ -291,14 +284,11 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
|
|
||||||
placeholders = "[IMG]" * len(image_urls)
|
placeholders = "[IMG]" * len(image_urls)
|
||||||
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
|
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -315,7 +305,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
#
|
#
|
||||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
@@ -326,14 +316,11 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
placeholders = "\n".join(f"<|image_{i}|>"
|
placeholders = "\n".join(f"<|image_{i}|>"
|
||||||
for i, _ in enumerate(image_urls, start=1))
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -347,7 +334,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||||
# we have to manually specify the path of the lora weights.
|
# we have to manually specify the path of the lora weights.
|
||||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_path,
|
model=model_path,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=10000,
|
max_model_len=10000,
|
||||||
@@ -357,30 +344,23 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
max_lora_rank=320,
|
max_lora_rank=320,
|
||||||
lora_extra_vocab_size=0,
|
lora_extra_vocab_size=0,
|
||||||
)
|
)
|
||||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
|
||||||
# To maintain code compatibility in this script, we add LoRA here.
|
|
||||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
|
||||||
# You can also add LoRA using:
|
|
||||||
# llm.generate(prompts, lora_request=lora_request,...)
|
|
||||||
|
|
||||||
placeholders = "".join(f"<|image_{i}|>"
|
placeholders = "".join(f"<|image_{i}|>"
|
||||||
for i, _ in enumerate(image_urls, start=1))
|
for i, _ in enumerate(image_urls, start=1))
|
||||||
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
chat_template=None,
|
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_qwen_vl_chat(question: str,
|
def load_qwen_vl_chat(question: str,
|
||||||
image_urls: list[str]) -> ModelRequestData:
|
image_urls: list[str]) -> ModelRequestData:
|
||||||
model_name = "Qwen/Qwen-VL-Chat"
|
model_name = "Qwen/Qwen-VL-Chat"
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -411,7 +391,7 @@ def load_qwen_vl_chat(question: str,
|
|||||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
stop_token_ids=stop_token_ids,
|
||||||
image_data=[fetch_image(url) for url in image_urls],
|
image_data=[fetch_image(url) for url in image_urls],
|
||||||
@@ -419,7 +399,7 @@ def load_qwen_vl_chat(question: str,
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
try:
|
try:
|
||||||
from qwen_vl_utils import process_vision_info
|
from qwen_vl_utils import process_vision_info
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
@@ -431,7 +411,7 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||||
|
|
||||||
# Tested on L40
|
# Tested on L40
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=32768 if process_vision_info is None else 4096,
|
max_model_len=32768 if process_vision_info is None else 4096,
|
||||||
max_num_seqs=5,
|
max_num_seqs=5,
|
||||||
@@ -460,23 +440,19 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
|
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
if process_vision_info is None:
|
if process_vision_info is None:
|
||||||
image_data = [fetch_image(url) for url in image_urls]
|
image_data = [fetch_image(url) for url in image_urls]
|
||||||
else:
|
else:
|
||||||
image_data, _ = process_vision_info(messages)
|
image_data, _ = process_vision_info(messages)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=image_data,
|
image_data=image_data,
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||||
try:
|
try:
|
||||||
from qwen_vl_utils import process_vision_info
|
from qwen_vl_utils import process_vision_info
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
@@ -487,7 +463,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
|
|
||||||
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||||
|
|
||||||
llm = LLM(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=32768 if process_vision_info is None else 4096,
|
max_model_len=32768 if process_vision_info is None else 4096,
|
||||||
max_num_seqs=5,
|
max_num_seqs=5,
|
||||||
@@ -516,8 +492,6 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
tokenize=False,
|
tokenize=False,
|
||||||
add_generation_prompt=True)
|
add_generation_prompt=True)
|
||||||
|
|
||||||
stop_token_ids = None
|
|
||||||
|
|
||||||
if process_vision_info is None:
|
if process_vision_info is None:
|
||||||
image_data = [fetch_image(url) for url in image_urls]
|
image_data = [fetch_image(url) for url in image_urls]
|
||||||
else:
|
else:
|
||||||
@@ -525,11 +499,9 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
|||||||
return_video_kwargs=False)
|
return_video_kwargs=False)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
llm=llm,
|
engine_args=engine_args,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
image_data=image_data,
|
image_data=image_data,
|
||||||
chat_template=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -551,14 +523,25 @@ model_example_map = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def run_generate(model, question: str, image_urls: list[str]):
|
def run_generate(model, question: str, image_urls: list[str],
|
||||||
|
seed: Optional[int]):
|
||||||
req_data = model_example_map[model](question, image_urls)
|
req_data = model_example_map[model](question, image_urls)
|
||||||
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": seed}  # use the `seed` parameter, not the script-level `args` global
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
|
# To maintain code compatibility in this script, we add LoRA here.
|
||||||
|
# You can also add LoRA using:
|
||||||
|
# llm.generate(prompts, lora_request=lora_request,...)
|
||||||
|
if req_data.lora_requests:
|
||||||
|
for lora_request in req_data.lora_requests:
|
||||||
|
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.0,
|
sampling_params = SamplingParams(temperature=0.0,
|
||||||
max_tokens=128,
|
max_tokens=128,
|
||||||
stop_token_ids=req_data.stop_token_ids)
|
stop_token_ids=req_data.stop_token_ids)
|
||||||
|
|
||||||
outputs = req_data.llm.generate(
|
outputs = llm.generate(
|
||||||
{
|
{
|
||||||
"prompt": req_data.prompt,
|
"prompt": req_data.prompt,
|
||||||
"multi_modal_data": {
|
"multi_modal_data": {
|
||||||
@@ -572,13 +555,24 @@ def run_generate(model, question: str, image_urls: list[str]):
|
|||||||
print(generated_text)
|
print(generated_text)
|
||||||
|
|
||||||
|
|
||||||
def run_chat(model: str, question: str, image_urls: list[str]):
|
def run_chat(model: str, question: str, image_urls: list[str],
|
||||||
|
seed: Optional[int]):
|
||||||
req_data = model_example_map[model](question, image_urls)
|
req_data = model_example_map[model](question, image_urls)
|
||||||
|
|
||||||
|
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||||
|
llm = LLM(**engine_args)
|
||||||
|
|
||||||
|
# To maintain code compatibility in this script, we add LoRA here.
|
||||||
|
# You can also add LoRA using:
|
||||||
|
# llm.generate(prompts, lora_request=lora_request,...)
|
||||||
|
if req_data.lora_requests:
|
||||||
|
for lora_request in req_data.lora_requests:
|
||||||
|
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.0,
|
sampling_params = SamplingParams(temperature=0.0,
|
||||||
max_tokens=128,
|
max_tokens=128,
|
||||||
stop_token_ids=req_data.stop_token_ids)
|
stop_token_ids=req_data.stop_token_ids)
|
||||||
outputs = req_data.llm.chat(
|
outputs = llm.chat(
|
||||||
[{
|
[{
|
||||||
"role":
|
"role":
|
||||||
"user",
|
"user",
|
||||||
@@ -607,11 +601,12 @@ def run_chat(model: str, question: str, image_urls: list[str]):
|
|||||||
def main(args: Namespace):
|
def main(args: Namespace):
|
||||||
model = args.model_type
|
model = args.model_type
|
||||||
method = args.method
|
method = args.method
|
||||||
|
seed = args.seed
|
||||||
|
|
||||||
if method == "generate":
|
if method == "generate":
|
||||||
run_generate(model, QUESTION, IMAGE_URLS)
|
run_generate(model, QUESTION, IMAGE_URLS, seed)
|
||||||
elif method == "chat":
|
elif method == "chat":
|
||||||
run_chat(model, QUESTION, IMAGE_URLS)
|
run_chat(model, QUESTION, IMAGE_URLS, seed)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid method: {method}")
|
raise ValueError(f"Invalid method: {method}")
|
||||||
|
|
||||||
@@ -632,6 +627,10 @@ if __name__ == "__main__":
|
|||||||
default="generate",
|
default="generate",
|
||||||
choices=["generate", "chat"],
|
choices=["generate", "chat"],
|
||||||
help="The method to run in `vllm.LLM`.")
|
help="The method to run in `vllm.LLM`.")
|
||||||
|
parser.add_argument("--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Set the seed when initializing `vllm.LLM`.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
|||||||
lm-format-enforcer >= 0.10.11, < 0.11
|
lm-format-enforcer >= 0.10.11, < 0.11
|
||||||
outlines == 0.1.11
|
outlines == 0.1.11
|
||||||
lark == 1.2.2
|
lark == 1.2.2
|
||||||
xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
|
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||||
typing_extensions >= 4.10
|
typing_extensions >= 4.10
|
||||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||||
partial-json-parser # used for parsing partial JSON outputs
|
partial-json-parser # used for parsing partial JSON outputs
|
||||||
@@ -28,7 +28,7 @@ pyzmq
|
|||||||
msgspec
|
msgspec
|
||||||
gguf == 0.10.0
|
gguf == 0.10.0
|
||||||
importlib_metadata
|
importlib_metadata
|
||||||
mistral_common[opencv] >= 1.5.0
|
mistral_common[opencv] >= 1.5.4
|
||||||
pyyaml
|
pyyaml
|
||||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||||
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ pydantic >= 2.8
|
|||||||
torch
|
torch
|
||||||
py-cpuinfo
|
py-cpuinfo
|
||||||
transformers
|
transformers
|
||||||
mistral_common >= 1.5.0
|
mistral_common >= 1.5.4
|
||||||
aiohttp
|
aiohttp
|
||||||
starlette
|
starlette
|
||||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ torchaudio==2.6.0
|
|||||||
torchvision==0.21.0
|
torchvision==0.21.0
|
||||||
transformers_stream_generator # required for qwen-vl test
|
transformers_stream_generator # required for qwen-vl test
|
||||||
matplotlib # required for qwen-vl test
|
matplotlib # required for qwen-vl test
|
||||||
mistral_common[opencv] >= 1.5.0 # required for pixtral test
|
mistral_common[opencv] >= 1.5.4 # required for pixtral test
|
||||||
datamodel_code_generator # required for minicpm3 test
|
datamodel_code_generator # required for minicpm3 test
|
||||||
lm-eval[api]==0.4.4 # required for model evaluation test
|
lm-eval[api]==0.4.4 # required for model evaluation test
|
||||||
transformers==4.48.2
|
transformers==4.48.2
|
||||||
|
|||||||
@@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
|
|||||||
# typepy
|
# typepy
|
||||||
mdurl==0.1.2
|
mdurl==0.1.2
|
||||||
# via markdown-it-py
|
# via markdown-it-py
|
||||||
mistral-common==1.5.1
|
mistral-common==1.5.4
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
more-itertools==10.5.0
|
more-itertools==10.5.0
|
||||||
# via lm-eval
|
# via lm-eval
|
||||||
|
|||||||
@@ -17,9 +17,9 @@ ray[data]
|
|||||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||||
|
|||||||
30
setup.py
30
setup.py
@@ -294,26 +294,28 @@ class repackage_wheel(build_ext):
|
|||||||
]).decode("utf-8")
|
]).decode("utf-8")
|
||||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||||
|
|
||||||
# Check if the local main branch is up-to-date. This is to ensure
|
# Check if the upstream_main_commit exists in the local repo
|
||||||
# the base commit we found is the most recent commit on the main
|
try:
|
||||||
# branch.
|
subprocess.check_output(
|
||||||
local_main_commit = subprocess.check_output(
|
["git", "cat-file", "-e", f"{upstream_main_commit}"])
|
||||||
["git", "rev-parse", "main"]).decode("utf-8").strip()
|
except subprocess.CalledProcessError:
|
||||||
if local_main_commit != upstream_main_commit:
|
# If not present, fetch it from the remote repository.
|
||||||
raise ValueError(
|
# Note that this does not update any local branches,
|
||||||
f"Local main branch ({local_main_commit}) is not "
|
# but ensures that this commit ref and its history are
|
||||||
"up-to-date with upstream main branch "
|
# available in our local repo.
|
||||||
f"({upstream_main_commit}). Please pull the latest "
|
subprocess.check_call([
|
||||||
"changes from upstream main branch first.")
|
"git", "fetch", "https://github.com/vllm-project/vllm",
|
||||||
|
"main"
|
||||||
|
])
|
||||||
|
|
||||||
# Then get the commit hash of the current branch that is the same as
|
# Then get the commit hash of the current branch that is the same as
|
||||||
# the upstream main commit.
|
# the upstream main commit.
|
||||||
current_branch = subprocess.check_output(
|
current_branch = subprocess.check_output(
|
||||||
["git", "branch", "--show-current"]).decode("utf-8").strip()
|
["git", "branch", "--show-current"]).decode("utf-8").strip()
|
||||||
|
|
||||||
base_commit = subprocess.check_output(
|
base_commit = subprocess.check_output([
|
||||||
["git", "merge-base", "main",
|
"git", "merge-base", f"{upstream_main_commit}", current_branch
|
||||||
current_branch]).decode("utf-8").strip()
|
]).decode("utf-8").strip()
|
||||||
return base_commit
|
return base_commit
|
||||||
except ValueError as err:
|
except ValueError as err:
|
||||||
raise ValueError(err) from None
|
raise ValueError(err) from None
|
||||||
|
|||||||
@@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
|
|||||||
from vllm.device_allocator.cumem import CuMemAllocator
|
from vllm.device_allocator.cumem import CuMemAllocator
|
||||||
from vllm.utils import GiB_bytes
|
from vllm.utils import GiB_bytes
|
||||||
|
|
||||||
from ..utils import fork_new_process_for_each_test
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_python_error():
|
def test_python_error():
|
||||||
"""
|
"""
|
||||||
Test if Python error occurs when there's low-level
|
Test if Python error occurs when there's low-level
|
||||||
@@ -36,7 +36,7 @@ def test_python_error():
|
|||||||
allocator.wake_up()
|
allocator.wake_up()
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_basic_cumem():
|
def test_basic_cumem():
|
||||||
# some tensors from default memory pool
|
# some tensors from default memory pool
|
||||||
shape = (1024, 1024)
|
shape = (1024, 1024)
|
||||||
@@ -69,7 +69,7 @@ def test_basic_cumem():
|
|||||||
assert torch.allclose(output, torch.ones_like(output) * 3)
|
assert torch.allclose(output, torch.ones_like(output) * 3)
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_cumem_with_cudagraph():
|
def test_cumem_with_cudagraph():
|
||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
with allocator.use_memory_pool():
|
with allocator.use_memory_pool():
|
||||||
@@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
|
|||||||
assert torch.allclose(y, x + 1)
|
assert torch.allclose(y, x + 1)
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model, use_v1",
|
"model, use_v1",
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from vllm import LLM, SamplingParams
|
|||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ..utils import fork_new_process_for_each_test
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=None, name="model_info")
|
@pytest.fixture(params=None, name="model_info")
|
||||||
@@ -78,7 +78,7 @@ def models_list_fixture(request):
|
|||||||
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("model_info", "", indirect=True)
|
@pytest.mark.parametrize("model_info", "", indirect=True)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_full_graph(
|
def test_full_graph(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
model_info: tuple[str, dict[str, Any]],
|
model_info: tuple[str, dict[str, Any]],
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import pytest
|
|||||||
from vllm.config import TaskOption
|
from vllm.config import TaskOption
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||||
|
|
||||||
logger = init_logger("test_expert_parallel")
|
logger = init_logger("test_expert_parallel")
|
||||||
|
|
||||||
@@ -209,7 +209,7 @@ def _compare_tp(
|
|||||||
for params in settings.iter_params(model_name)
|
for params in settings.iter_params(model_name)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_ep(
|
def test_ep(
|
||||||
model_name: str,
|
model_name: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.config import TaskOption
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
from ..models.registry import HF_EXAMPLE_MODELS
|
from ..models.registry import HF_EXAMPLE_MODELS
|
||||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||||
|
|
||||||
logger = init_logger("test_pipeline_parallel")
|
logger = init_logger("test_pipeline_parallel")
|
||||||
|
|
||||||
@@ -402,7 +402,7 @@ def _compare_tp(
|
|||||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_tp_language_generation(
|
def test_tp_language_generation(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
@@ -431,7 +431,7 @@ def test_tp_language_generation(
|
|||||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_tp_language_embedding(
|
def test_tp_language_embedding(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
@@ -460,7 +460,7 @@ def test_tp_language_embedding(
|
|||||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_tp_multimodal_generation(
|
def test_tp_multimodal_generation(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from typing_extensions import LiteralString
|
from typing_extensions import LiteralString
|
||||||
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
|||||||
"FLASH_ATTN",
|
"FLASH_ATTN",
|
||||||
"FLASHINFER",
|
"FLASHINFER",
|
||||||
])
|
])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_pp_cudagraph(
|
def test_pp_cudagraph(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
PP_SIZE: int,
|
PP_SIZE: int,
|
||||||
|
|||||||
@@ -4,12 +4,12 @@ import pytest
|
|||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
from ...utils import fork_new_process_for_each_test
|
from ...utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||||
@pytest.mark.parametrize("backend", ["mp", "ray"])
|
@pytest.mark.parametrize("backend", ["mp", "ray"])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_collective_rpc(tp_size, backend):
|
def test_collective_rpc(tp_size, backend):
|
||||||
if tp_size == 1 and backend == "ray":
|
if tp_size == 1 and backend == "ray":
|
||||||
pytest.skip("Skip duplicate test case")
|
pytest.skip("Skip duplicate test case")
|
||||||
|
|||||||
@@ -3,10 +3,9 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|
||||||
from ..utils import multi_gpu_test
|
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||||
|
|
||||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||||
|
|
||||||
@@ -55,7 +54,7 @@ def v1(run_with_both_engines_lora):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_chatglm3_lora(chatglm3_lora_files):
|
def test_chatglm3_lora(chatglm3_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -75,7 +74,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -96,7 +95,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
|
|||||||
@@ -4,10 +4,9 @@ import pytest
|
|||||||
import ray
|
import ray
|
||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|
||||||
from ..utils import multi_gpu_test
|
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||||
|
|
||||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||||
|
|
||||||
@@ -82,7 +81,7 @@ def v1(run_with_both_engines_lora):
|
|||||||
|
|
||||||
# V1 Test: Failing due to numerics on V1.
|
# V1 Test: Failing due to numerics on V1.
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_llama_lora(sql_lora_files):
|
def test_llama_lora(sql_lora_files):
|
||||||
|
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
@@ -97,7 +96,7 @@ def test_llama_lora(sql_lora_files):
|
|||||||
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
|
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
|
||||||
# used by the engine yet.
|
# used by the engine yet.
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_llama_lora_warmup(sql_lora_files):
|
def test_llama_lora_warmup(sql_lora_files):
|
||||||
"""Test that the LLM initialization works with a warmup LORA path and
|
"""Test that the LLM initialization works with a warmup LORA path and
|
||||||
is more conservative"""
|
is more conservative"""
|
||||||
@@ -128,7 +127,7 @@ def test_llama_lora_warmup(sql_lora_files):
|
|||||||
# V1 Test: Failing due to numerics on V1.
|
# V1 Test: Failing due to numerics on V1.
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_llama_lora_tp4(sql_lora_files):
|
def test_llama_lora_tp4(sql_lora_files):
|
||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
@@ -143,7 +142,7 @@ def test_llama_lora_tp4(sql_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
@@ -159,7 +158,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
|||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
|
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
|
||||||
|
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
|
|||||||
@@ -3,11 +3,12 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||||
|
|
||||||
PROMPT_TEMPLATE = (
|
PROMPT_TEMPLATE = (
|
||||||
@@ -57,7 +58,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
|||||||
@pytest.mark.xfail(
|
@pytest.mark.xfail(
|
||||||
current_platform.is_rocm(),
|
current_platform.is_rocm(),
|
||||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_minicpmv_lora(minicpmv_lora_files):
|
def test_minicpmv_lora(minicpmv_lora_files):
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
@@ -80,7 +81,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
|||||||
@pytest.mark.xfail(
|
@pytest.mark.xfail(
|
||||||
current_platform.is_rocm(),
|
current_platform.is_rocm(),
|
||||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
@@ -101,7 +102,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
|||||||
@pytest.mark.xfail(
|
@pytest.mark.xfail(
|
||||||
current_platform.is_rocm(),
|
current_platform.is_rocm(),
|
||||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||||
llm = vllm.LLM(
|
llm = vllm.LLM(
|
||||||
MODEL_PATH,
|
MODEL_PATH,
|
||||||
|
|||||||
@@ -3,10 +3,9 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|
||||||
from ..utils import multi_gpu_test
|
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||||
|
|
||||||
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
|
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
|
||||||
|
|
||||||
@@ -56,7 +55,7 @@ def v1(run_with_both_engines_lora):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_ilama_lora(ilama_lora_files):
|
def test_ilama_lora(ilama_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -77,7 +76,7 @@ def test_ilama_lora(ilama_lora_files):
|
|||||||
|
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_ilama_lora_tp4(ilama_lora_files):
|
def test_ilama_lora_tp4(ilama_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -99,7 +98,7 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
|||||||
|
|
||||||
@pytest.mark.skip_v1
|
@pytest.mark.skip_v1
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||||
llm = vllm.LLM(MODEL_PATH,
|
llm = vllm.LLM(MODEL_PATH,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
import json
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -208,8 +209,6 @@ def test_guided_decoding_backend_options():
|
|||||||
|
|
||||||
|
|
||||||
def test_pickle_xgrammar_tokenizer_data():
|
def test_pickle_xgrammar_tokenizer_data():
|
||||||
|
|
||||||
# TODO: move to another test file for xgrammar
|
|
||||||
try:
|
try:
|
||||||
import xgrammar as xgr
|
import xgrammar as xgr
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -217,7 +216,11 @@ def test_pickle_xgrammar_tokenizer_data():
|
|||||||
|
|
||||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||||
TokenizerData)
|
TokenizerData)
|
||||||
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
|
tokenizer_data = TokenizerData(
|
||||||
|
metadata=
|
||||||
|
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
|
||||||
|
encoded_vocab=['!', '"', '#', '$', '%'],
|
||||||
|
)
|
||||||
pickled = pickle.dumps(tokenizer_data)
|
pickled = pickle.dumps(tokenizer_data)
|
||||||
|
|
||||||
assert pickled is not None
|
assert pickled is not None
|
||||||
@@ -225,4 +228,5 @@ def test_pickle_xgrammar_tokenizer_data():
|
|||||||
depickled: TokenizerData = pickle.loads(pickled)
|
depickled: TokenizerData = pickle.loads(pickled)
|
||||||
|
|
||||||
assert depickled is not None
|
assert depickled is not None
|
||||||
assert depickled.vocab_type == xgr.VocabType.RAW
|
assert json.loads(
|
||||||
|
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.utils import identity
|
|||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
||||||
_VideoAssets)
|
_VideoAssets)
|
||||||
from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
|
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
|
||||||
multi_gpu_marks)
|
multi_gpu_marks)
|
||||||
from ...utils import check_outputs_equal
|
from ...utils import check_outputs_equal
|
||||||
from .vlm_utils import custom_inputs, model_utils, runners
|
from .vlm_utils import custom_inputs, model_utils, runners
|
||||||
@@ -592,7 +592,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
fork_new_process_for_each_test=False,
|
create_new_process_for_each_test=False,
|
||||||
))
|
))
|
||||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
@@ -617,7 +617,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.MULTI_IMAGE,
|
test_type=VLMTestType.MULTI_IMAGE,
|
||||||
fork_new_process_for_each_test=False,
|
create_new_process_for_each_test=False,
|
||||||
))
|
))
|
||||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
@@ -642,7 +642,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.EMBEDDING,
|
test_type=VLMTestType.EMBEDDING,
|
||||||
fork_new_process_for_each_test=False,
|
create_new_process_for_each_test=False,
|
||||||
))
|
))
|
||||||
def test_image_embedding_models(model_type: str,
|
def test_image_embedding_models(model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
@@ -666,7 +666,7 @@ def test_image_embedding_models(model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.VIDEO,
|
test_type=VLMTestType.VIDEO,
|
||||||
fork_new_process_for_each_test=False,
|
create_new_process_for_each_test=False,
|
||||||
))
|
))
|
||||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||||
@@ -688,7 +688,7 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
fork_new_process_for_each_test=False,
|
create_new_process_for_each_test=False,
|
||||||
))
|
))
|
||||||
def test_custom_inputs_models(
|
def test_custom_inputs_models(
|
||||||
model_type: str,
|
model_type: str,
|
||||||
@@ -714,9 +714,9 @@ def test_custom_inputs_models(
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
fork_new_process_for_each_test=True,
|
create_new_process_for_each_test=True,
|
||||||
))
|
))
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
hf_runner: type[HfRunner],
|
hf_runner: type[HfRunner],
|
||||||
@@ -740,9 +740,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.MULTI_IMAGE,
|
test_type=VLMTestType.MULTI_IMAGE,
|
||||||
fork_new_process_for_each_test=True,
|
create_new_process_for_each_test=True,
|
||||||
))
|
))
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
hf_runner: type[HfRunner],
|
hf_runner: type[HfRunner],
|
||||||
@@ -766,9 +766,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.EMBEDDING,
|
test_type=VLMTestType.EMBEDDING,
|
||||||
fork_new_process_for_each_test=True,
|
create_new_process_for_each_test=True,
|
||||||
))
|
))
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_image_embedding_models_heavy(model_type: str,
|
def test_image_embedding_models_heavy(model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
hf_runner: type[HfRunner],
|
hf_runner: type[HfRunner],
|
||||||
@@ -791,7 +791,7 @@ def test_image_embedding_models_heavy(model_type: str,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.VIDEO,
|
test_type=VLMTestType.VIDEO,
|
||||||
fork_new_process_for_each_test=True,
|
create_new_process_for_each_test=True,
|
||||||
))
|
))
|
||||||
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||||
hf_runner: type[HfRunner],
|
hf_runner: type[HfRunner],
|
||||||
@@ -814,9 +814,9 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
|||||||
get_parametrized_options(
|
get_parametrized_options(
|
||||||
VLM_TEST_SETTINGS,
|
VLM_TEST_SETTINGS,
|
||||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
fork_new_process_for_each_test=True,
|
create_new_process_for_each_test=True,
|
||||||
))
|
))
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_custom_inputs_models_heavy(
|
def test_custom_inputs_models_heavy(
|
||||||
model_type: str,
|
model_type: str,
|
||||||
test_case: ExpandableVLMTestArgs,
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
|||||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||||
|
|
||||||
|
|
||||||
def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
def get_filtered_test_settings(
|
||||||
test_type: VLMTestType,
|
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||||
fork_per_test: bool) -> dict[str, VLMTestInfo]:
|
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||||
"""Given the dict of potential test settings to run, return a subdict
|
"""Given the dict of potential test settings to run, return a subdict
|
||||||
of tests who have the current test type enabled with the matching val for
|
of tests who have the current test type enabled with the matching val for
|
||||||
fork_per_test.
|
fork_per_test.
|
||||||
@@ -43,7 +43,7 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
|||||||
|
|
||||||
# Everything looks okay; keep if this is has correct proc handling
|
# Everything looks okay; keep if this is has correct proc handling
|
||||||
if (test_info.distributed_executor_backend
|
if (test_info.distributed_executor_backend
|
||||||
is not None) == fork_per_test:
|
is not None) == new_proc_per_test:
|
||||||
matching_tests[test_name] = test_info
|
matching_tests[test_name] = test_info
|
||||||
|
|
||||||
return matching_tests
|
return matching_tests
|
||||||
@@ -51,14 +51,14 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
|||||||
|
|
||||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||||
test_type: VLMTestType,
|
test_type: VLMTestType,
|
||||||
fork_new_process_for_each_test: bool):
|
create_new_process_for_each_test: bool):
|
||||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||||
This is similar to nesting pytest parametrize calls, but done directly
|
This is similar to nesting pytest parametrize calls, but done directly
|
||||||
through an itertools product so that each test can set things like
|
through an itertools product so that each test can set things like
|
||||||
size factors etc, while still running in isolated test cases.
|
size factors etc, while still running in isolated test cases.
|
||||||
"""
|
"""
|
||||||
matching_tests = get_filtered_test_settings(
|
matching_tests = get_filtered_test_settings(
|
||||||
test_settings, test_type, fork_new_process_for_each_test)
|
test_settings, test_type, create_new_process_for_each_test)
|
||||||
|
|
||||||
# Ensure that something is wrapped as an iterable it's not already
|
# Ensure that something is wrapped as an iterable it's not already
|
||||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import pytest
|
|||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
from ....utils import fork_new_process_for_each_test, multi_gpu_test
|
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||||
|
|
||||||
PROMPTS = [
|
PROMPTS = [
|
||||||
{
|
{
|
||||||
@@ -119,7 +119,7 @@ def run_test(
|
|||||||
assert output.outputs[0].text == expected
|
assert output.outputs[0].text == expected
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.core_model
|
@pytest.mark.core_model
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
|
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ import pytest
|
|||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
|
|
||||||
from ..utils import fork_new_process_for_each_test
|
from ..utils import create_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_plugin(
|
def test_plugin(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
dummy_opt_path: str,
|
dummy_opt_path: str,
|
||||||
@@ -24,7 +24,7 @@ def test_plugin(
|
|||||||
assert (error_msg in str(excinfo.value))
|
assert (error_msg in str(excinfo.value))
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_oot_registration_text_generation(
|
def test_oot_registration_text_generation(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
dummy_opt_path: str,
|
dummy_opt_path: str,
|
||||||
@@ -44,7 +44,7 @@ def test_oot_registration_text_generation(
|
|||||||
assert rest == ""
|
assert rest == ""
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_oot_registration_embedding(
|
def test_oot_registration_embedding(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
dummy_gemma2_embedding_path: str,
|
dummy_gemma2_embedding_path: str,
|
||||||
@@ -62,7 +62,7 @@ def test_oot_registration_embedding(
|
|||||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_oot_registration_multimodal(
|
def test_oot_registration_multimodal(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
dummy_llava_path: str,
|
dummy_llava_path: str,
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
|
|||||||
ModelRegistry)
|
ModelRegistry)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ..utils import fork_new_process_for_each_test
|
from ..utils import create_new_process_for_each_test
|
||||||
from .registry import HF_EXAMPLE_MODELS
|
from .registry import HF_EXAMPLE_MODELS
|
||||||
|
|
||||||
|
|
||||||
@@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
|
|||||||
assert supports_multimodal(model_cls)
|
assert supports_multimodal(model_cls)
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
||||||
("LlamaForCausalLM", False, False, False),
|
("LlamaForCausalLM", False, False, False),
|
||||||
("MllamaForConditionalGeneration", True, False, False),
|
("MllamaForConditionalGeneration", True, False, False),
|
||||||
@@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
|
|||||||
stacklevel=2)
|
stacklevel=2)
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
|
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
|
||||||
("MLPSpeculatorPreTrainedModel", False, False),
|
("MLPSpeculatorPreTrainedModel", False, False),
|
||||||
("DeepseekV2ForCausalLM", True, False),
|
("DeepseekV2ForCausalLM", True, False),
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from tests.utils import compare_two_settings, fork_new_process_for_each_test
|
|
||||||
|
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||||
|
|
||||||
models_4bit_to_test = [
|
models_4bit_to_test = [
|
||||||
("facebook/opt-125m", "quantize opt model inflight"),
|
("facebook/opt-125m", "quantize opt model inflight"),
|
||||||
@@ -32,7 +33,7 @@ models_pre_quant_8bit_to_test = [
|
|||||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -45,7 +46,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description",
|
@pytest.mark.parametrize("model_name, description",
|
||||||
models_pre_qaunt_4bit_to_test)
|
models_pre_qaunt_4bit_to_test)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -57,7 +58,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description",
|
@pytest.mark.parametrize("model_name, description",
|
||||||
models_pre_quant_8bit_to_test)
|
models_pre_quant_8bit_to_test)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -70,7 +71,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||||
model_name, description) -> None:
|
model_name, description) -> None:
|
||||||
|
|
||||||
@@ -88,7 +89,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
|||||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||||
reason='bitsandbytes is not supported on this GPU type.')
|
reason='bitsandbytes is not supported on this GPU type.')
|
||||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||||
common_args = [
|
common_args = [
|
||||||
"--disable-log-stats",
|
"--disable-log-stats",
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ from transformers import AutoTokenizer
|
|||||||
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
|
|
||||||
from ...utils import fork_new_process_for_each_test
|
from ...utils import create_new_process_for_each_test
|
||||||
from .conftest import (get_output_from_llm_generator,
|
from .conftest import (get_output_from_llm_generator,
|
||||||
run_equality_correctness_test)
|
run_equality_correctness_test)
|
||||||
|
|
||||||
@@ -82,7 +82,7 @@ from .conftest import (get_output_from_llm_generator,
|
|||||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||||
batch_size: int):
|
batch_size: int):
|
||||||
"""Run generation with speculative decoding on a batch. Verify the engine
|
"""Run generation with speculative decoding on a batch. Verify the engine
|
||||||
@@ -170,7 +170,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [1])
|
@pytest.mark.parametrize("batch_size", [1])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -244,7 +244,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [64])
|
@pytest.mark.parametrize("batch_size", [64])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -300,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [32])
|
@pytest.mark.parametrize("batch_size", [32])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||||
@@ -356,7 +356,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
|||||||
256,
|
256,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -411,7 +411,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
|||||||
64,
|
64,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -469,7 +469,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
|||||||
])
|
])
|
||||||
@pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -534,7 +534,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||||
per_test_common_llm_kwargs,
|
per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
@@ -594,7 +594,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
|||||||
64,
|
64,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
test_llm_kwargs, batch_size: int, output_len: int,
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -644,7 +644,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
|||||||
@pytest.mark.parametrize("batch_size", [8])
|
@pytest.mark.parametrize("batch_size", [8])
|
||||||
@pytest.mark.parametrize("output_len", [10])
|
@pytest.mark.parametrize("output_len", [10])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||||
test_llm_kwargs, batch_size: int, output_len: int,
|
test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
@@ -697,7 +697,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||||
output_len: int, seed: int):
|
output_len: int, seed: int):
|
||||||
@@ -752,7 +752,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
|||||||
32,
|
32,
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
|
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
|
||||||
per_test_common_llm_kwargs,
|
per_test_common_llm_kwargs,
|
||||||
baseline_llm_kwargs, test_llm_kwargs,
|
baseline_llm_kwargs, test_llm_kwargs,
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
|
|||||||
deprecate_kwargs, get_open_port, memory_profiling,
|
deprecate_kwargs, get_open_port, memory_profiling,
|
||||||
merge_async_iterators, supports_kw, swap_dict_values)
|
merge_async_iterators, supports_kw, swap_dict_values)
|
||||||
|
|
||||||
from .utils import error_on_warning, fork_new_process_for_each_test
|
from .utils import create_new_process_for_each_test, error_on_warning
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@@ -276,7 +276,7 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
|
|||||||
) == is_supported
|
) == is_supported
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_memory_profiling():
|
def test_memory_profiling():
|
||||||
# Fake out some model loading + inference memory usage to test profiling
|
# Fake out some model loading + inference memory usage to test profiling
|
||||||
# Memory used by other processes will show up as cuda usage outside of torch
|
# Memory used by other processes will show up as cuda usage outside of torch
|
||||||
|
|||||||
@@ -7,12 +7,14 @@ import os
|
|||||||
import signal
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager, suppress
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Optional, Union
|
from typing import Any, Callable, Literal, Optional, Union
|
||||||
|
|
||||||
|
import cloudpickle
|
||||||
import openai
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
@@ -703,6 +705,78 @@ def fork_new_process_for_each_test(
|
|||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def spawn_new_process_for_each_test(
|
||||||
|
f: Callable[_P, None]) -> Callable[_P, None]:
|
||||||
|
"""Decorator to spawn a new process for each test function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@functools.wraps(f)
|
||||||
|
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
|
||||||
|
# Check if we're already in a subprocess
|
||||||
|
if os.environ.get('RUNNING_IN_SUBPROCESS') == '1':
|
||||||
|
# If we are, just run the function directly
|
||||||
|
return f(*args, **kwargs)
|
||||||
|
|
||||||
|
import torch.multiprocessing as mp
|
||||||
|
with suppress(RuntimeError):
|
||||||
|
mp.set_start_method('spawn')
|
||||||
|
|
||||||
|
# Get the module
|
||||||
|
module_name = f.__module__
|
||||||
|
|
||||||
|
# Create a process with environment variable set
|
||||||
|
env = os.environ.copy()
|
||||||
|
env['RUNNING_IN_SUBPROCESS'] = '1'
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
|
output_filepath = os.path.join(tempdir, "new_process.tmp")
|
||||||
|
|
||||||
|
# `cloudpickle` allows pickling complex functions directly
|
||||||
|
input_bytes = cloudpickle.dumps((f, output_filepath))
|
||||||
|
|
||||||
|
cmd = [sys.executable, "-m", f"{module_name}"]
|
||||||
|
|
||||||
|
returned = subprocess.run(cmd,
|
||||||
|
input=input_bytes,
|
||||||
|
capture_output=True,
|
||||||
|
env=env)
|
||||||
|
|
||||||
|
# check if the subprocess is successful
|
||||||
|
try:
|
||||||
|
returned.check_returncode()
|
||||||
|
except Exception as e:
|
||||||
|
# wrap raised exception to provide more information
|
||||||
|
raise RuntimeError(f"Error raised in subprocess:\n"
|
||||||
|
f"{returned.stderr.decode()}") from e
|
||||||
|
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def create_new_process_for_each_test(
|
||||||
|
method: Optional[Literal["spawn", "fork"]] = None
|
||||||
|
) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
|
||||||
|
"""Creates a decorator that runs each test function in a new process.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
method: The process creation method. Can be either "spawn" or "fork".
|
||||||
|
If not specified,
|
||||||
|
it defaults to "spawn" on ROCm platforms and "fork" otherwise.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A decorator to run test functions in separate processes.
|
||||||
|
"""
|
||||||
|
if method is None:
|
||||||
|
method = "spawn" if current_platform.is_rocm() else "fork"
|
||||||
|
|
||||||
|
assert method in ["spawn",
|
||||||
|
"fork"], "Method must be either 'spawn' or 'fork'"
|
||||||
|
|
||||||
|
if method == "fork":
|
||||||
|
return fork_new_process_for_each_test
|
||||||
|
|
||||||
|
return spawn_new_process_for_each_test
|
||||||
|
|
||||||
|
|
||||||
def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||||
"""
|
"""
|
||||||
Get a pytest mark, which skips the test if the GPU doesn't meet
|
Get a pytest mark, which skips the test if the GPU doesn't meet
|
||||||
@@ -762,7 +836,7 @@ def multi_gpu_test(*, num_gpus: int):
|
|||||||
marks = multi_gpu_marks(num_gpus=num_gpus)
|
marks = multi_gpu_marks(num_gpus=num_gpus)
|
||||||
|
|
||||||
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
|
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
|
||||||
func = fork_new_process_for_each_test(f)
|
func = create_new_process_for_each_test()(f)
|
||||||
for mark in reversed(marks):
|
for mark in reversed(marks):
|
||||||
func = mark(func)
|
func = mark(func)
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from concurrent.futures import Future
|
|||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@@ -19,6 +18,8 @@ from vllm.v1.executor.abstract import Executor, UniProcExecutor
|
|||||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||||
from vllm.v1.outputs import ModelRunnerOutput
|
from vllm.v1.outputs import ModelRunnerOutput
|
||||||
|
|
||||||
|
from ...utils import create_new_process_for_each_test
|
||||||
|
|
||||||
if not current_platform.is_cuda():
|
if not current_platform.is_cuda():
|
||||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||||
allow_module_level=True)
|
allow_module_level=True)
|
||||||
@@ -44,7 +45,7 @@ def make_request() -> EngineCoreRequest:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
@@ -158,7 +159,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
|||||||
assert len(engine_core.scheduler.running) == 0
|
assert len(engine_core.scheduler.running) == 0
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
A basic end-to-end test to verify that the engine functions correctly
|
A basic end-to-end test to verify that the engine functions correctly
|
||||||
@@ -208,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
|||||||
_check_engine_state()
|
_check_engine_state()
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
Test that the engine can handle multiple concurrent batches.
|
Test that the engine can handle multiple concurrent batches.
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ from typing import Optional
|
|||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from tests.utils import fork_new_process_for_each_test
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@@ -19,6 +18,8 @@ from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
|
|||||||
SyncMPClient)
|
SyncMPClient)
|
||||||
from vllm.v1.executor.abstract import Executor
|
from vllm.v1.executor.abstract import Executor
|
||||||
|
|
||||||
|
from ...utils import create_new_process_for_each_test
|
||||||
|
|
||||||
if not current_platform.is_cuda():
|
if not current_platform.is_cuda():
|
||||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||||
allow_module_level=True)
|
allow_module_level=True)
|
||||||
@@ -88,7 +89,7 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
|
|||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
|
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
|
||||||
def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
|
def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
|
||||||
multiprocessing_mode: bool):
|
multiprocessing_mode: bool):
|
||||||
|
|||||||
@@ -18,9 +18,6 @@ MODELS_TO_TEST = [
|
|||||||
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Undo after https://github.com/vllm-project/vllm/pull/14868
|
|
||||||
pytest.skip(allow_module_level=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
@pytest.mark.parametrize("guided_decoding_backend",
|
@pytest.mark.parametrize("guided_decoding_backend",
|
||||||
|
|||||||
@@ -1576,10 +1576,6 @@ class EngineArgs:
|
|||||||
#############################################################
|
#############################################################
|
||||||
# Experimental Features - allow users to opt in.
|
# Experimental Features - allow users to opt in.
|
||||||
|
|
||||||
# MLA is is supported on V1, but off by default for now.
|
|
||||||
if model_config.use_mla and _warn_or_fallback("MLA"):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# LoRA is supported on V1, but off by default for now.
|
# LoRA is supported on V1, but off by default for now.
|
||||||
if self.enable_lora and _warn_or_fallback("LORA"):
|
if self.enable_lora and _warn_or_fallback("LORA"):
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -379,6 +379,7 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
mm_processor_kwargs,
|
mm_processor_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
|
return_mm_hashes=return_mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt_token_ids = self._tokenize_prompt(
|
prompt_token_ids = self._tokenize_prompt(
|
||||||
@@ -401,6 +402,7 @@ class InputPreprocessor:
|
|||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
request_id: str,
|
request_id: str,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
|
return_mm_hashes: bool = False,
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
"""Async version of :meth:`_extract_prompt_components`."""
|
"""Async version of :meth:`_extract_prompt_components`."""
|
||||||
parsed = parse_singleton_prompt(prompt)
|
parsed = parse_singleton_prompt(prompt)
|
||||||
@@ -431,6 +433,7 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
mm_processor_kwargs,
|
mm_processor_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
|
return_mm_hashes=return_mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
return token_inputs(
|
return token_inputs(
|
||||||
@@ -452,6 +455,7 @@ class InputPreprocessor:
|
|||||||
multi_modal_data,
|
multi_modal_data,
|
||||||
mm_processor_kwargs,
|
mm_processor_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
|
return_mm_hashes=return_mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt_token_ids = await self._tokenize_prompt_async(
|
prompt_token_ids = await self._tokenize_prompt_async(
|
||||||
@@ -726,6 +730,7 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
|
return_mm_hashes=return_mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
return self._build_decoder_only_llm_inputs(
|
return self._build_decoder_only_llm_inputs(
|
||||||
@@ -746,6 +751,7 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
|
return_mm_hashes=return_mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
return self._build_decoder_only_llm_inputs(
|
return self._build_decoder_only_llm_inputs(
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from vllm.model_executor.guided_decoding.reasoner import get_reasoner
|
|||||||
from vllm.model_executor.guided_decoding.utils import (
|
from vllm.model_executor.guided_decoding.utils import (
|
||||||
convert_lark_to_gbnf, grammar_is_likely_lark,
|
convert_lark_to_gbnf, grammar_is_likely_lark,
|
||||||
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
|
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
|
||||||
from vllm.platforms import CpuArchEnum
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from transformers import PreTrainedTokenizer
|
from transformers import PreTrainedTokenizer
|
||||||
@@ -53,19 +52,12 @@ def maybe_backend_fallback(
|
|||||||
if guided_params.backend_name == "xgrammar":
|
if guided_params.backend_name == "xgrammar":
|
||||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||||
xgr_installed)
|
xgr_installed)
|
||||||
# xgrammar only has x86 wheels for linux, fallback to outlines
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
|
|
||||||
fallback_or_error(guided_params,
|
|
||||||
"xgrammar is only supported on x86 CPUs.",
|
|
||||||
"outlines")
|
|
||||||
|
|
||||||
# xgrammar doesn't support regex, fallback to outlines
|
# xgrammar doesn't support regex, fallback to outlines
|
||||||
if guided_params.regex is not None:
|
if guided_params.regex is not None:
|
||||||
fallback_or_error(
|
fallback_or_error(
|
||||||
guided_params,
|
guided_params,
|
||||||
"xgrammar does not support regex guided decoding.", "outlines")
|
"xgrammar does not support regex guided decoding.", "outlines")
|
||||||
|
|
||||||
# xgrammar doesn't support some JSON schema features
|
# xgrammar doesn't support some JSON schema features
|
||||||
elif (guided_params.json is not None
|
elif (guided_params.json is not None
|
||||||
and has_xgrammar_unsupported_json_features(guided_params.json)):
|
and has_xgrammar_unsupported_json_features(guided_params.json)):
|
||||||
|
|||||||
@@ -9,13 +9,11 @@ from dataclasses import dataclass, field
|
|||||||
from typing import TYPE_CHECKING, Any, List
|
from typing import TYPE_CHECKING, Any, List
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from transformers import PreTrainedTokenizerFast
|
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import xgrammar as xgr
|
import xgrammar as xgr
|
||||||
from xgrammar.base import _core as xgr_core
|
|
||||||
xgr_installed = True
|
xgr_installed = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
xgr_installed = False
|
xgr_installed = False
|
||||||
@@ -35,7 +33,6 @@ if TYPE_CHECKING:
|
|||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# TODO: passing batch size to max threads here
|
|
||||||
def get_local_xgrammar_guided_decoding_logits_processor(
|
def get_local_xgrammar_guided_decoding_logits_processor(
|
||||||
guided_params: GuidedDecodingParams,
|
guided_params: GuidedDecodingParams,
|
||||||
tokenizer: PreTrainedTokenizer,
|
tokenizer: PreTrainedTokenizer,
|
||||||
@@ -52,18 +49,8 @@ def get_local_xgrammar_guided_decoding_logits_processor(
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class TokenizerData:
|
class TokenizerData:
|
||||||
"""Immutable container for cached tokenizer data."""
|
"""Immutable container for cached tokenizer data."""
|
||||||
|
metadata: str
|
||||||
encoded_vocab: list[str] = field(default_factory=list)
|
encoded_vocab: list[str] = field(default_factory=list)
|
||||||
stop_token_ids: list[int] | None = None
|
|
||||||
# These fields are mutually exclusive: `backend_str` is used to create a
|
|
||||||
# TokenizeInfo with `TokenizerInfo.from_huggingface` while `vocab_type` is
|
|
||||||
# used within the constructor of TokenizeInfo
|
|
||||||
backend_str: str | None = None
|
|
||||||
vocab_type: xgr.VocabType | None = None
|
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
# Check for mutual exclusive
|
|
||||||
assert not (self.backend_str and self.vocab_type), \
|
|
||||||
"backend_str and vocab_type are mutual exclusive"
|
|
||||||
|
|
||||||
|
|
||||||
class TokenizerDataCache:
|
class TokenizerDataCache:
|
||||||
@@ -71,46 +58,52 @@ class TokenizerDataCache:
|
|||||||
_cache: dict[int, TokenizerData] = {}
|
_cache: dict[int, TokenizerData] = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_tokenizer_data(cls,
|
def get_tokenizer_data(
|
||||||
tokenizer: PreTrainedTokenizer) -> TokenizerData:
|
cls,
|
||||||
tokenizer_hash = hash(tokenizer)
|
tokenizer: PreTrainedTokenizer,
|
||||||
|
/,
|
||||||
|
*,
|
||||||
|
tokenizer_hash: int,
|
||||||
|
vocab_size: int,
|
||||||
|
) -> TokenizerData:
|
||||||
|
|
||||||
if tokenizer_hash not in cls._cache:
|
if tokenizer_hash not in cls._cache:
|
||||||
# Vendored from xgrammar logic since we cannot pickle the tokenizer
|
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
||||||
# https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501
|
tokenizer,
|
||||||
|
# NOTE: We will need to use lm_head's vocab_size
|
||||||
|
# to determine correct special_token_ids for this tokenizer.
|
||||||
|
# See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92 # noqa: E501
|
||||||
|
vocab_size=vocab_size,
|
||||||
|
)
|
||||||
|
metadata = json.loads(tokenizer_info.dump_metadata())
|
||||||
|
|
||||||
|
# Vendored from xgrammar logic to get encoded_vocab
|
||||||
|
# https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501
|
||||||
try:
|
try:
|
||||||
encoded_vocab = [
|
vocab_dict = tokenizer.get_vocab()
|
||||||
token for token, _ in sorted(tokenizer.get_vocab().items(),
|
|
||||||
key=lambda x: x[1])
|
|
||||||
]
|
|
||||||
except AttributeError as e:
|
except AttributeError as e:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Cannot get the vocabulary of the tokenizer "
|
f"Cannot get the vocabulary of the tokenizer "
|
||||||
f"{type(tokenizer)}. The tokenizer should have a "
|
f"{type(tokenizer)}. The tokenizer should have a "
|
||||||
"get_vocab method.") from e
|
"get_vocab method.") from e
|
||||||
|
|
||||||
stop_token_ids = None
|
# maintain tokenizer's indexing
|
||||||
backend_str = ""
|
encoded_vocab = [""] * tokenizer_info.vocab_size
|
||||||
vocab_type = xgr.VocabType.RAW
|
for token, idx in vocab_dict.items():
|
||||||
|
if idx < tokenizer_info.vocab_size:
|
||||||
|
encoded_vocab[idx] = token
|
||||||
|
|
||||||
if stop_token_ids is None and hasattr(
|
if isinstance(tokenizer, MistralTokenizer):
|
||||||
tokenizer,
|
|
||||||
"eos_token_id") and tokenizer.eos_token_id is not None:
|
|
||||||
stop_token_ids = [tokenizer.eos_token_id]
|
|
||||||
|
|
||||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
|
||||||
backend_str = tokenizer.backend_tokenizer.to_str()
|
|
||||||
vocab_type = None
|
|
||||||
|
|
||||||
elif isinstance(tokenizer, MistralTokenizer):
|
|
||||||
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
|
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
|
||||||
vocab_type = xgr.VocabType.BYTE_FALLBACK
|
metadata.update({
|
||||||
|
"vocab_type": xgr.VocabType.BYTE_FALLBACK,
|
||||||
|
"add_prefix_space": True
|
||||||
|
})
|
||||||
|
|
||||||
cls._cache[tokenizer_hash] = TokenizerData(
|
cls._cache[tokenizer_hash] = TokenizerData(
|
||||||
encoded_vocab=encoded_vocab,
|
encoded_vocab=encoded_vocab,
|
||||||
stop_token_ids=stop_token_ids,
|
metadata=json.dumps(metadata),
|
||||||
backend_str=backend_str,
|
)
|
||||||
vocab_type=vocab_type)
|
|
||||||
|
|
||||||
return cls._cache[tokenizer_hash]
|
return cls._cache[tokenizer_hash]
|
||||||
|
|
||||||
@@ -129,30 +122,15 @@ class GrammarCompilerCache:
|
|||||||
cache_key = str(config.tokenizer_hash)
|
cache_key = str(config.tokenizer_hash)
|
||||||
|
|
||||||
if cache_key not in cls._cache:
|
if cache_key not in cls._cache:
|
||||||
assert config.tokenizer_data is not None
|
|
||||||
assert config.tokenizer_data.encoded_vocab is not None
|
|
||||||
|
|
||||||
config_data = config.tokenizer_data
|
config_data = config.tokenizer_data
|
||||||
|
|
||||||
# In TokenizerDataCache.get_tokenizer_data, a serializable
|
# In TokenizerDataCache.get_tokenizer_data, a serializable
|
||||||
# tokenizer_data is created and cached. This data is used to build
|
# tokenizer_data is created and cached. This data is used to build
|
||||||
# a tokenizer_info and create an xgrammar compiler.
|
# a tokenizer_info and create an xgrammar compiler.
|
||||||
# - If tokenizer_data has backend_str set, use
|
tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||||
# xgr_core.TokenizerInfo.from_huggingface (a C++ bind).
|
encoded_vocab=config_data.encoded_vocab,
|
||||||
# - Otherwise, use the default constructor with vocab_type.
|
metadata=config_data.metadata,
|
||||||
# - xgr_core.TokenizerInfo.from_huggingface !=
|
)
|
||||||
# xgr.TokenizerInfo.from_huggingface.
|
|
||||||
if config_data.backend_str:
|
|
||||||
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
|
|
||||||
xgr_core.TokenizerInfo.from_huggingface(
|
|
||||||
config_data.encoded_vocab, config_data.backend_str,
|
|
||||||
config.vocab_size, config_data.stop_token_ids))
|
|
||||||
else:
|
|
||||||
tokenizer_info = xgr.TokenizerInfo(
|
|
||||||
config_data.encoded_vocab,
|
|
||||||
config_data.vocab_type,
|
|
||||||
vocab_size=config.vocab_size,
|
|
||||||
stop_token_ids=config_data.stop_token_ids)
|
|
||||||
cls._cache[cache_key] = xgr.GrammarCompiler(
|
cls._cache[cache_key] = xgr.GrammarCompiler(
|
||||||
tokenizer_info, max_threads=config.max_threads)
|
tokenizer_info, max_threads=config.max_threads)
|
||||||
|
|
||||||
@@ -163,13 +141,12 @@ class GrammarCompilerCache:
|
|||||||
class GrammarConfig:
|
class GrammarConfig:
|
||||||
"""Serializable configuration for grammar compilation"""
|
"""Serializable configuration for grammar compilation"""
|
||||||
tokenizer_hash: int
|
tokenizer_hash: int
|
||||||
vocab_size: int
|
tokenizer_data: TokenizerData
|
||||||
json_str: str | None = None
|
json_str: str | None = None
|
||||||
grammar_str: str | None = None
|
grammar_str: str | None = None
|
||||||
json_object: bool | None = None
|
json_object: bool | None = None
|
||||||
any_whitespace: bool = True
|
any_whitespace: bool = True
|
||||||
max_threads: int = 8
|
max_threads: int = 8
|
||||||
tokenizer_data: TokenizerData | None = None
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_guided_params(cls,
|
def from_guided_params(cls,
|
||||||
@@ -179,7 +156,11 @@ class GrammarConfig:
|
|||||||
max_threads: int = 8) -> GrammarConfig:
|
max_threads: int = 8) -> GrammarConfig:
|
||||||
|
|
||||||
tokenizer_hash = hash(tokenizer)
|
tokenizer_hash = hash(tokenizer)
|
||||||
tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
|
tokenizer_data = TokenizerDataCache.get_tokenizer_data(
|
||||||
|
tokenizer,
|
||||||
|
tokenizer_hash=tokenizer_hash,
|
||||||
|
vocab_size=model_config.hf_text_config.vocab_size,
|
||||||
|
)
|
||||||
|
|
||||||
if guided_params.json:
|
if guided_params.json:
|
||||||
if not isinstance(guided_params.json, str):
|
if not isinstance(guided_params.json, str):
|
||||||
@@ -218,7 +199,6 @@ class GrammarConfig:
|
|||||||
raise ValueError(str(err)) from err
|
raise ValueError(str(err)) from err
|
||||||
|
|
||||||
return cls(json_str=json_str,
|
return cls(json_str=json_str,
|
||||||
vocab_size=model_config.hf_text_config.vocab_size,
|
|
||||||
tokenizer_hash=tokenizer_hash,
|
tokenizer_hash=tokenizer_hash,
|
||||||
max_threads=max_threads,
|
max_threads=max_threads,
|
||||||
tokenizer_data=tokenizer_data,
|
tokenizer_data=tokenizer_data,
|
||||||
@@ -246,14 +226,12 @@ class GrammarConfig:
|
|||||||
raise ValueError(str(err)) from err
|
raise ValueError(str(err)) from err
|
||||||
|
|
||||||
return cls(grammar_str=grammar_str,
|
return cls(grammar_str=grammar_str,
|
||||||
vocab_size=model_config.hf_text_config.vocab_size,
|
|
||||||
tokenizer_hash=tokenizer_hash,
|
tokenizer_hash=tokenizer_hash,
|
||||||
max_threads=max_threads,
|
max_threads=max_threads,
|
||||||
tokenizer_data=tokenizer_data)
|
tokenizer_data=tokenizer_data)
|
||||||
elif guided_params.json_object:
|
elif guided_params.json_object:
|
||||||
return cls(
|
return cls(
|
||||||
json_object=True,
|
json_object=True,
|
||||||
vocab_size=model_config.hf_text_config.vocab_size,
|
|
||||||
tokenizer_hash=tokenizer_hash,
|
tokenizer_hash=tokenizer_hash,
|
||||||
max_threads=max_threads,
|
max_threads=max_threads,
|
||||||
tokenizer_data=tokenizer_data,
|
tokenizer_data=tokenizer_data,
|
||||||
@@ -267,7 +245,6 @@ class GrammarConfig:
|
|||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
grammar_str=choice_str,
|
grammar_str=choice_str,
|
||||||
vocab_size=model_config.hf_text_config.vocab_size,
|
|
||||||
tokenizer_hash=tokenizer_hash,
|
tokenizer_hash=tokenizer_hash,
|
||||||
max_threads=max_threads,
|
max_threads=max_threads,
|
||||||
tokenizer_data=tokenizer_data,
|
tokenizer_data=tokenizer_data,
|
||||||
@@ -291,6 +268,13 @@ class GrammarConfig:
|
|||||||
grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
|
grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices))
|
||||||
return grammar
|
return grammar
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo:
|
||||||
|
return xgr.TokenizerInfo.from_vocab_and_metadata(
|
||||||
|
encoded_vocab=tokenizer_data.encoded_vocab,
|
||||||
|
metadata=tokenizer_data.metadata,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class XGrammarLogitsProcessor:
|
class XGrammarLogitsProcessor:
|
||||||
@@ -299,11 +283,16 @@ class XGrammarLogitsProcessor:
|
|||||||
reasoner: Reasoner | None = None
|
reasoner: Reasoner | None = None
|
||||||
|
|
||||||
ctx: xgr.CompiledGrammar | None = None
|
ctx: xgr.CompiledGrammar | None = None
|
||||||
|
tokenizer_info: xgr.TokenizerInfo = None # type: ignore[assignment]
|
||||||
token_bitmask: torch.Tensor = None # type: ignore[assignment]
|
token_bitmask: torch.Tensor = None # type: ignore[assignment]
|
||||||
matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
|
matchers: list[xgr.GrammarMatcher] = field(default_factory=list)
|
||||||
batch_size: int = field(default=1)
|
batch_size: int = field(default=1)
|
||||||
prefilled: bool = field(default=False)
|
prefilled: bool = field(default=False)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
self.tokenizer_info = self.config.tokenizer_info(
|
||||||
|
self.config.tokenizer_data)
|
||||||
|
|
||||||
def __getstate__(self) -> dict[str, Any]:
|
def __getstate__(self) -> dict[str, Any]:
|
||||||
return {'config': self.config, 'reasoner': self.reasoner}
|
return {'config': self.config, 'reasoner': self.reasoner}
|
||||||
|
|
||||||
@@ -311,6 +300,8 @@ class XGrammarLogitsProcessor:
|
|||||||
self.config = state['config']
|
self.config = state['config']
|
||||||
self.reasoner = state['reasoner']
|
self.reasoner = state['reasoner']
|
||||||
|
|
||||||
|
self.tokenizer_info = GrammarConfig.tokenizer_info(
|
||||||
|
self.config.tokenizer_data)
|
||||||
self.ctx = None
|
self.ctx = None
|
||||||
self.matchers = []
|
self.matchers = []
|
||||||
self.batch_size = 1
|
self.batch_size = 1
|
||||||
@@ -352,7 +343,7 @@ class XGrammarLogitsProcessor:
|
|||||||
xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
|
xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
|
||||||
]
|
]
|
||||||
self.token_bitmask = xgr.allocate_token_bitmask(
|
self.token_bitmask = xgr.allocate_token_bitmask(
|
||||||
self.batch_size, self.config.vocab_size)
|
self.batch_size, self.tokenizer_info.vocab_size)
|
||||||
|
|
||||||
if not self.prefilled:
|
if not self.prefilled:
|
||||||
# Have not sampled a token yet
|
# Have not sampled a token yet
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
|||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
SupportsMultiModal, SupportsPP)
|
SupportsMultiModal, SupportsPP, SupportsV0Only)
|
||||||
from .siglip import SiglipVisionModel
|
from .siglip import SiglipVisionModel
|
||||||
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
|
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
|
||||||
maybe_prefix, merge_multimodal_embeddings)
|
maybe_prefix, merge_multimodal_embeddings)
|
||||||
@@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
|
|||||||
info=Gemma3ProcessingInfo,
|
info=Gemma3ProcessingInfo,
|
||||||
dummy_inputs=Gemma3DummyInputsBuilder)
|
dummy_inputs=Gemma3DummyInputsBuilder)
|
||||||
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||||
SupportsLoRA):
|
SupportsLoRA, SupportsV0Only):
|
||||||
packed_modules_mapping = {
|
packed_modules_mapping = {
|
||||||
"qkv_proj": [
|
"qkv_proj": [
|
||||||
"q_proj",
|
"q_proj",
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ class MixtralAttention(nn.Module):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config: MixtralConfig,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
num_kv_heads: int,
|
num_kv_heads: int,
|
||||||
@@ -136,7 +137,9 @@ class MixtralAttention(nn.Module):
|
|||||||
# the KV heads across multiple tensor parallel GPUs.
|
# the KV heads across multiple tensor parallel GPUs.
|
||||||
assert tp_size % self.total_num_kv_heads == 0
|
assert tp_size % self.total_num_kv_heads == 0
|
||||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||||
self.head_dim = hidden_size // self.total_num_heads
|
# MixtralConfig has an optional head_dim argument
|
||||||
|
self.head_dim = getattr(config, "head_dim",
|
||||||
|
self.hidden_size // self.total_num_heads)
|
||||||
self.q_size = self.num_heads * self.head_dim
|
self.q_size = self.num_heads * self.head_dim
|
||||||
self.kv_size = self.num_kv_heads * self.head_dim
|
self.kv_size = self.num_kv_heads * self.head_dim
|
||||||
self.scaling = self.head_dim**-0.5
|
self.scaling = self.head_dim**-0.5
|
||||||
@@ -200,6 +203,7 @@ class MixtralDecoderLayer(nn.Module):
|
|||||||
# Requires transformers > 4.32.0
|
# Requires transformers > 4.32.0
|
||||||
rope_theta = getattr(config, "rope_theta", 10000)
|
rope_theta = getattr(config, "rope_theta", 10000)
|
||||||
self.self_attn = MixtralAttention(
|
self.self_attn = MixtralAttention(
|
||||||
|
config=config,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_heads=config.num_attention_heads,
|
num_heads=config.num_attention_heads,
|
||||||
max_position=config.max_position_embeddings,
|
max_position=config.max_position_embeddings,
|
||||||
|
|||||||
@@ -165,6 +165,7 @@ class MixtralAttention(nn.Module):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config: MixtralConfig,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
num_kv_heads: int,
|
num_kv_heads: int,
|
||||||
@@ -190,7 +191,9 @@ class MixtralAttention(nn.Module):
|
|||||||
# the KV heads across multiple tensor parallel GPUs.
|
# the KV heads across multiple tensor parallel GPUs.
|
||||||
assert tp_size % self.total_num_kv_heads == 0
|
assert tp_size % self.total_num_kv_heads == 0
|
||||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||||
self.head_dim = hidden_size // self.total_num_heads
|
# MixtralConfig has an optional head_dim argument
|
||||||
|
self.head_dim = getattr(config, "head_dim",
|
||||||
|
self.hidden_size // self.total_num_heads)
|
||||||
self.q_size = self.num_heads * self.head_dim
|
self.q_size = self.num_heads * self.head_dim
|
||||||
self.kv_size = self.num_kv_heads * self.head_dim
|
self.kv_size = self.num_kv_heads * self.head_dim
|
||||||
self.scaling = self.head_dim**-0.5
|
self.scaling = self.head_dim**-0.5
|
||||||
@@ -252,6 +255,7 @@ class MixtralDecoderLayer(nn.Module):
|
|||||||
# Requires transformers > 4.32.0
|
# Requires transformers > 4.32.0
|
||||||
rope_theta = getattr(config, "rope_theta", 10000)
|
rope_theta = getattr(config, "rope_theta", 10000)
|
||||||
self.self_attn = MixtralAttention(
|
self.self_attn = MixtralAttention(
|
||||||
|
config=config,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_heads=config.num_attention_heads,
|
num_heads=config.num_attention_heads,
|
||||||
max_position=config.max_position_embeddings,
|
max_position=config.max_position_embeddings,
|
||||||
|
|||||||
@@ -56,6 +56,8 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
USE_XFORMERS_OPS = False
|
USE_XFORMERS_OPS = False
|
||||||
|
|
||||||
|
PATCH_MERGE = "patch_merge"
|
||||||
|
|
||||||
|
|
||||||
class PixtralImagePixelInputs(TypedDict):
|
class PixtralImagePixelInputs(TypedDict):
|
||||||
type: Literal["pixel_values"]
|
type: Literal["pixel_values"]
|
||||||
@@ -155,7 +157,6 @@ class PixtralProcessorAdapter:
|
|||||||
|
|
||||||
for image in images:
|
for image in images:
|
||||||
image_inputs = self.image_processor(ImageChunk(image=image))
|
image_inputs = self.image_processor(ImageChunk(image=image))
|
||||||
|
|
||||||
image_processed = torch.tensor(image_inputs.image)
|
image_processed = torch.tensor(image_inputs.image)
|
||||||
image_tokens = torch.tensor(image_inputs.tokens)
|
image_tokens = torch.tensor(image_inputs.tokens)
|
||||||
|
|
||||||
@@ -353,6 +354,27 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.vision_encoder = VisionTransformer(self.vision_args)
|
self.vision_encoder = VisionTransformer(self.vision_args)
|
||||||
|
|
||||||
|
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||||
|
self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size,
|
||||||
|
eps=1e-5)
|
||||||
|
|
||||||
|
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||||
|
self.patch_merger = PatchMerger(
|
||||||
|
vision_encoder_dim=self.vision_args.hidden_size,
|
||||||
|
spatial_merge_size=self.vision_args.spatial_merge_size,
|
||||||
|
use_mlp_bias=False,
|
||||||
|
)
|
||||||
|
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||||
|
self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size,
|
||||||
|
eps=1e-5)
|
||||||
|
|
||||||
|
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||||
|
self.patch_merger = PatchMerger(
|
||||||
|
vision_encoder_dim=self.vision_args.hidden_size,
|
||||||
|
spatial_merge_size=self.vision_args.spatial_merge_size,
|
||||||
|
use_mlp_bias=False,
|
||||||
|
)
|
||||||
self.vision_language_adapter = VisionLanguageAdapter(
|
self.vision_language_adapter = VisionLanguageAdapter(
|
||||||
self.vision_args, dim=config.text_config.hidden_size)
|
self.vision_args, dim=config.text_config.hidden_size)
|
||||||
|
|
||||||
@@ -398,13 +420,25 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
image_input: PixtralImagePixelInputs,
|
image_input: PixtralImagePixelInputs,
|
||||||
) -> tuple[torch.Tensor, ...]:
|
) -> tuple[torch.Tensor, ...]:
|
||||||
images = image_input["images"]
|
images = image_input["images"]
|
||||||
|
|
||||||
image_features = self.vision_encoder(images)
|
image_features = self.vision_encoder(images)
|
||||||
feature_sizes = [
|
feature_sizes = [
|
||||||
image_feature.shape[0] for image_feature in image_features
|
image_feature.shape[0] for image_feature in image_features
|
||||||
]
|
]
|
||||||
|
image_features = torch.cat(image_features)
|
||||||
image_embeds = self.vision_language_adapter(torch.cat(image_features))
|
if self.vision_args.add_pre_mm_projector_layer_norm:
|
||||||
|
image_features = self.pre_mm_projector_norm(image_features)
|
||||||
|
if self.vision_args.mm_projector_id == PATCH_MERGE:
|
||||||
|
patch_size = self.vision_args.patch_size
|
||||||
|
spatial_merge_size_square = self.vision_args.spatial_merge_size**2
|
||||||
|
img_patch_dims = [(img.shape[1] // patch_size,
|
||||||
|
img.shape[2] // patch_size) for img in images]
|
||||||
|
feature_sizes = [
|
||||||
|
feature_size // spatial_merge_size_square
|
||||||
|
for feature_size in feature_sizes
|
||||||
|
]
|
||||||
|
image_features = self.patch_merger(image_features,
|
||||||
|
image_sizes=img_patch_dims)
|
||||||
|
image_embeds = self.vision_language_adapter(image_features)
|
||||||
image_embeds = torch.split(image_embeds, feature_sizes)
|
image_embeds = torch.split(image_embeds, feature_sizes)
|
||||||
return image_embeds
|
return image_embeds
|
||||||
|
|
||||||
@@ -524,8 +558,19 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]):
|
def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]):
|
||||||
return weight[0].startswith("vision_language_adapter")
|
return weight[0].startswith("vision_language_adapter")
|
||||||
|
|
||||||
|
def is_patch_merger(weight: Tuple[str, torch.Tensor]):
|
||||||
|
return weight[0].startswith("patch_merger")
|
||||||
|
|
||||||
|
def is_pre_mm_projector_norm(weight: Tuple[str, torch.Tensor]):
|
||||||
|
return weight[0].startswith("pre_mm_projector_norm")
|
||||||
|
|
||||||
# Get references to parameters for direct loading
|
# Get references to parameters for direct loading
|
||||||
vision_encoder_dict = dict(self.vision_encoder.named_parameters())
|
vision_encoder_dict = dict(self.vision_encoder.named_parameters())
|
||||||
|
patch_merger_dict = dict(self.patch_merger.named_parameters(
|
||||||
|
)) if self.vision_args.mm_projector_id == PATCH_MERGE else dict()
|
||||||
|
pre_mm_projector_norm_dict = dict(
|
||||||
|
self.pre_mm_projector_norm.named_parameters(
|
||||||
|
)) if self.vision_args.add_pre_mm_projector_layer_norm else dict()
|
||||||
vision_lang_adapter_dict = dict(
|
vision_lang_adapter_dict = dict(
|
||||||
self.vision_language_adapter.named_parameters())
|
self.vision_language_adapter.named_parameters())
|
||||||
|
|
||||||
@@ -538,6 +583,18 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
|
|||||||
param = vision_encoder_dict[trimmed_name]
|
param = vision_encoder_dict[trimmed_name]
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
default_weight_loader(param, w)
|
default_weight_loader(param, w)
|
||||||
|
elif is_patch_merger((name, w)):
|
||||||
|
# Load vision patch merger weights directly
|
||||||
|
trimmed_name = '.'.join(name.split(".")[1:])
|
||||||
|
param = patch_merger_dict[trimmed_name]
|
||||||
|
with torch.no_grad():
|
||||||
|
default_weight_loader(param, w)
|
||||||
|
elif is_pre_mm_projector_norm((name, w)):
|
||||||
|
# Load vision pre_mm_projector_norm weights directly
|
||||||
|
trimmed_name = '.'.join(name.split(".")[1:])
|
||||||
|
param = pre_mm_projector_norm_dict[trimmed_name]
|
||||||
|
with torch.no_grad():
|
||||||
|
default_weight_loader(param, w)
|
||||||
elif is_vision_lang_adapter_weights((name, w)):
|
elif is_vision_lang_adapter_weights((name, w)):
|
||||||
# Load vision-language adapter weights directly
|
# Load vision-language adapter weights directly
|
||||||
trimmed_name = '.'.join(name.split(".")[1:])
|
trimmed_name = '.'.join(name.split(".")[1:])
|
||||||
@@ -566,6 +623,9 @@ class VisionEncoderArgs:
|
|||||||
rope_theta: float # for rope-2D
|
rope_theta: float # for rope-2D
|
||||||
image_token_id: int
|
image_token_id: int
|
||||||
adapter_bias: bool = True
|
adapter_bias: bool = True
|
||||||
|
spatial_merge_size: int = 1
|
||||||
|
add_pre_mm_projector_layer_norm: bool = False
|
||||||
|
mm_projector_id: str = ""
|
||||||
|
|
||||||
|
|
||||||
def _reshape_for_broadcast(freqs_cis: torch.Tensor,
|
def _reshape_for_broadcast(freqs_cis: torch.Tensor,
|
||||||
@@ -843,6 +903,105 @@ class VisionLanguageAdapter(nn.Module):
|
|||||||
return self.w_out(self.gelu(self.w_in(x)))
|
return self.w_out(self.gelu(self.w_in(x)))
|
||||||
|
|
||||||
|
|
||||||
|
class PatchMerger(nn.Module):
    """
    Learned merging of spatial_merge_size ** 2 patches

    Patch tokens are first regrouped so that every
    (spatial_merge_size, spatial_merge_size) window of an image forms one
    row, then a single linear layer projects each row back down to
    vision_encoder_dim features.
    """

    def __init__(
        self,
        vision_encoder_dim: int,
        spatial_merge_size: int,
        use_mlp_bias: bool = False,
    ) -> None:
        """
        Args:
            vision_encoder_dim: feature size of each incoming patch token;
                also the feature size of each merged token
            spatial_merge_size: side length of the square window of patches
                that is merged into one token
            use_mlp_bias: whether the merging linear layer has a bias term
        """
        super().__init__()

        self.spatial_merge_size = spatial_merge_size
        # One merged row is the concatenation of the features of all
        # spatial_merge_size ** 2 patches inside a window.
        self.mlp_input_dim = vision_encoder_dim * (spatial_merge_size**2)

        self.merging_layer = nn.Linear(
            self.mlp_input_dim,
            vision_encoder_dim,
            bias=use_mlp_bias,
        )

    def forward(self, x: torch.Tensor,
                image_sizes: list[tuple[int, int]]) -> torch.Tensor:
        """
        Args:
            x: (N, vision_encoder_dim) patch tokens of all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image

        Returns:
            (N / spatial_merge_size ** 2, vision_encoder_dim) merged tokens
        """
        # image_sizes specified in tokens
        expected_tokens = sum(h * w for h, w in image_sizes)
        assert expected_tokens == len(x), (
            f"image_sizes describe {expected_tokens} tokens "
            f"but x holds {len(x)}")

        # x is (N, vision_encoder_dim)
        x = self.permute(x, image_sizes)

        # x is (N / spatial_merge_size ** 2,
        #       vision_encoder_dim * spatial_merge_size ** 2)
        x = self.merging_layer(x)

        # x is (N / spatial_merge_size ** 2, vision_encoder_dim)
        return x

    def permute(
        self,
        x: torch.Tensor,
        image_sizes: list[tuple[int, int]],
    ) -> torch.Tensor:
        """
        Args:
            x: (N, D) where N is flattened and concatenated patch tokens
                for all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image
        Returns:
            image_features: reorders patch tokens so each grid of
                (spatial_merge_size, spatial_merge_size) is contiguous.
                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
        """

        sub_grids = get_sub_grids(
            x=x,
            image_sizes=image_sizes,
            spatial_merge_size=self.spatial_merge_size
        )  # list of [d x sub_grid_size x sub_grid_size x n_patches]

        # Collapse the leading (d, sub_grid_size, sub_grid_size) axes and
        # transpose, so every window becomes one row:
        # n_patches x d * sub_grid_size * sub_grid_size
        permuted_tensor: list[torch.Tensor] = [
            grid.view(-1, grid.shape[-1]).t() for grid in sub_grids
        ]
        return torch.cat(
            permuted_tensor, dim=0
        )  # (N / spatial_merge_size ** 2, d * spatial_merge_size ** 2)
|
||||||
|
|
||||||
|
|
||||||
|
def get_sub_grids(
    x: torch.Tensor,
    image_sizes: list[tuple[int, int]],
    spatial_merge_size: int,
) -> list[torch.Tensor]:
    """
    Cut the patch tokens of every image into non-overlapping square windows.

    Args:
        x: (N, D) patch tokens of all images, flattened and concatenated
        image_sizes: list of tuple of (height, width) in tokens for
            each image
        spatial_merge_size: side length of each square window

    Returns:
        One tensor per image, each of shape
        (D, spatial_merge_size, spatial_merge_size, n_patches)
    """
    # image_sizes specified in tokens
    tokens_per_image = [h * w for h, w in image_sizes]
    d = x.shape[-1]
    all_img_sub_grids: list[torch.Tensor] = []

    # split() yields one chunk per entry of image_sizes, so the chunks and
    # their (h, w) can be walked in lockstep.
    for image_tokens, (h, w) in zip(x.split(tokens_per_image), image_sizes):
        # Reshape image_tokens into a 2D grid
        image_grid = image_tokens.view(h, w, d).permute(
            2, 0, 1)[None, :, :, :]  # 1 x d x h x w
        # NOTE(review): with stride == kernel_size, unfold only emits
        # windows that fit completely; this presumably relies on h and w
        # being multiples of spatial_merge_size -- confirm with callers.
        sub_grids = torch.nn.functional.unfold(image_grid,
                                               kernel_size=spatial_merge_size,
                                               stride=spatial_merge_size)
        sub_grids = sub_grids.view(
            1, d, spatial_merge_size, spatial_merge_size,
            -1)  # 1 x d x sub_grid_size x sub_grid_size x n_patches

        all_img_sub_grids.append(sub_grids[0])

    return all_img_sub_grids
|
||||||
|
|
||||||
|
|
||||||
#### HF Transformers version of Pixtral ####
|
#### HF Transformers version of Pixtral ####
|
||||||
# Based off https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py
|
# Based off https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py
|
||||||
# This model follows the Llava family, meaning image embeddings are placed
|
# This model follows the Llava family, meaning image embeddings are placed
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
import math
|
import math
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union
|
from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.checkpoint
|
import torch.utils.checkpoint
|
||||||
@@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors
|
|||||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
SupportsMultiModal, SupportsPP, SupportsV0Only)
|
SupportsMultiModal, SupportsPP)
|
||||||
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
|
||||||
init_vllm_registered_model, maybe_prefix,
|
init_vllm_registered_model, maybe_prefix,
|
||||||
merge_multimodal_embeddings,
|
merge_multimodal_embeddings,
|
||||||
@@ -50,14 +50,14 @@ _MAX_ENCODER_BATCH_SIZE = 16
|
|||||||
|
|
||||||
class UltravoxAudioFeatureInputs(TypedDict):
|
class UltravoxAudioFeatureInputs(TypedDict):
|
||||||
type: Literal["audio_features"]
|
type: Literal["audio_features"]
|
||||||
data: NestedTensors
|
data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
|
||||||
"""Shape: `(batch_size, num_chunks, 80, M)`"""
|
"""Shape: `(batch_size, num_chunks, 80, M)`"""
|
||||||
lens: NestedTensors
|
lens: Union[torch.Tensor, list[torch.Tensor]]
|
||||||
"""
|
"""
|
||||||
Length of the audio frames. Used for attention mask in WhisperEncoder.
|
Length of the audio frames. Used for attention mask in WhisperEncoder.
|
||||||
Shape: `(batch_size, num_chunks)`
|
Shape: `(batch_size, num_chunks)`
|
||||||
"""
|
"""
|
||||||
token_len: NestedTensors
|
token_len: Union[torch.Tensor, list[torch.Tensor]]
|
||||||
"""
|
"""
|
||||||
Length of the audio tokens. Used for flattening the audio features.
|
Length of the audio tokens. Used for flattening the audio features.
|
||||||
Shape: `(batch_size, num_chunks)`
|
Shape: `(batch_size, num_chunks)`
|
||||||
@@ -405,8 +405,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
|
|||||||
UltravoxMultiModalProcessor,
|
UltravoxMultiModalProcessor,
|
||||||
info=UltravoxProcessingInfo,
|
info=UltravoxProcessingInfo,
|
||||||
dummy_inputs=UltravoxDummyInputsBuilder)
|
dummy_inputs=UltravoxDummyInputsBuilder)
|
||||||
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
|
||||||
SupportsV0Only):
|
|
||||||
|
|
||||||
packed_modules_mapping = {
|
packed_modules_mapping = {
|
||||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||||
@@ -506,6 +505,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
|||||||
if not isinstance(audio_features, (torch.Tensor, list)):
|
if not isinstance(audio_features, (torch.Tensor, list)):
|
||||||
raise ValueError("Incorrect type of audio features. "
|
raise ValueError("Incorrect type of audio features. "
|
||||||
f"Got type: {type(audio_features)}")
|
f"Got type: {type(audio_features)}")
|
||||||
|
if not isinstance(audio_lens, (torch.Tensor, list)):
|
||||||
|
raise ValueError("Incorrect type of audio_lens. "
|
||||||
|
f"Got type: {type(audio_features)}")
|
||||||
|
if not isinstance(audio_token_len, (torch.Tensor, list)):
|
||||||
|
raise ValueError("Incorrect type of audio_token_len. "
|
||||||
|
f"Got type: {type(audio_features)}")
|
||||||
|
|
||||||
return UltravoxAudioFeatureInputs(type="audio_features",
|
return UltravoxAudioFeatureInputs(type="audio_features",
|
||||||
data=audio_features,
|
data=audio_features,
|
||||||
@@ -523,7 +528,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
|||||||
raise AssertionError("This line should be unreachable.")
|
raise AssertionError("This line should be unreachable.")
|
||||||
|
|
||||||
def _process_audio_input(
|
def _process_audio_input(
|
||||||
self, audio_input: UltravoxAudioInputs) -> NestedTensors:
|
self,
|
||||||
|
audio_input: UltravoxAudioInputs,
|
||||||
|
) -> Union[NestedTensors, tuple[torch.Tensor, ...]]:
|
||||||
if audio_input["type"] == "audio_embeds":
|
if audio_input["type"] == "audio_embeds":
|
||||||
return audio_input["data"]
|
return audio_input["data"]
|
||||||
|
|
||||||
@@ -531,13 +538,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
|||||||
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
|
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
|
||||||
audio_features = pad_and_concat_to_dim3(audio_input["data"])
|
audio_features = pad_and_concat_to_dim3(audio_input["data"])
|
||||||
|
|
||||||
if isinstance(audio_input['lens'], list):
|
# [B1, B2] -> [B1+B2]
|
||||||
# [B1, B2] -> [B1+B2]
|
audio_lens = flatten_bn(audio_input['lens'], concat=True)
|
||||||
audio_lens = torch.cat(audio_input['lens'])
|
audio_token_len = flatten_bn(audio_input['token_len'], concat=True)
|
||||||
audio_token_len = torch.cat(audio_input['token_len'])
|
|
||||||
else:
|
|
||||||
audio_lens = flatten_bn(audio_input['lens'])
|
|
||||||
audio_token_len = flatten_bn(audio_input['token_len'])
|
|
||||||
|
|
||||||
embeddings = self._audio_features_to_embeddings(
|
embeddings = self._audio_features_to_embeddings(
|
||||||
audio_features, audio_lens)
|
audio_features, audio_lens)
|
||||||
@@ -554,7 +557,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
|||||||
# Apply mask and flatten
|
# Apply mask and flatten
|
||||||
flattened_embeddings = embeddings[mask]
|
flattened_embeddings = embeddings[mask]
|
||||||
|
|
||||||
return flattened_embeddings
|
# Return one tensor per input audio
|
||||||
|
embed_lens = [
|
||||||
|
token_len_item.sum().item()
|
||||||
|
for token_len_item in audio_input['token_len']
|
||||||
|
]
|
||||||
|
return flattened_embeddings.split(embed_lens)
|
||||||
|
|
||||||
def get_multimodal_embeddings(
|
def get_multimodal_embeddings(
|
||||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||||
@@ -646,7 +654,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
|||||||
|
|
||||||
|
|
||||||
def pad_and_concat_to_dim3(
|
def pad_and_concat_to_dim3(
|
||||||
features: Union[torch.Tensor, List[torch.Tensor], List[List[torch.Tensor]]]
|
features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
Pad and concatenate a list of tensors.
|
Pad and concatenate a list of tensors.
|
||||||
|
|||||||
@@ -218,8 +218,10 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
|
|
||||||
# V0 does not support chunked prefill.
|
# V0 does not support chunked prefill.
|
||||||
if total_len > seq_len and not envs.VLLM_USE_V1:
|
if total_len > seq_len and not envs.VLLM_USE_V1:
|
||||||
|
# `max_num_batched_tokens` is defined by `SchedulerConfig`
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"The context length (%d) of the model is too short "
|
"The sequence length used for profiling ("
|
||||||
|
"max_num_batched_tokens / max_num_seqs = %d) is too short "
|
||||||
"to hold the multi-modal embeddings in the worst case "
|
"to hold the multi-modal embeddings in the worst case "
|
||||||
"(%d tokens in total, out of which %s are reserved for "
|
"(%d tokens in total, out of which %s are reserved for "
|
||||||
"multi-modal embeddings). This may cause certain "
|
"multi-modal embeddings). This may cause certain "
|
||||||
|
|||||||
@@ -37,10 +37,11 @@ class XPUPlatform(Platform):
|
|||||||
return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
|
return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_device_capability(device_id: int = 0) -> DeviceCapability:
|
def get_device_capability(
|
||||||
major, minor, *_ = torch.xpu.get_device_capability(
|
device_id: int = 0) -> Optional[DeviceCapability]:
|
||||||
device_id)['version'].split('.')
|
# capacity format differs from cuda's and will cause unexpected
|
||||||
return DeviceCapability(major=int(major), minor=int(minor))
|
# failure, so use None directly
|
||||||
|
return None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_device_name(device_id: int = 0) -> str:
|
def get_device_name(device_id: int = 0) -> str:
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class EngineCoreRequest(
|
|||||||
# Detokenizer, but set to None when it is added to EngineCoreClient.
|
# Detokenizer, but set to None when it is added to EngineCoreClient.
|
||||||
prompt: Optional[str]
|
prompt: Optional[str]
|
||||||
prompt_token_ids: list[int]
|
prompt_token_ids: list[int]
|
||||||
mm_inputs: Optional[list[Optional[MultiModalKwargs]]]
|
mm_inputs: Optional[list[MultiModalKwargs]]
|
||||||
mm_hashes: Optional[list[str]]
|
mm_hashes: Optional[list[str]]
|
||||||
mm_placeholders: Optional[list[PlaceholderRange]]
|
mm_placeholders: Optional[list[PlaceholderRange]]
|
||||||
sampling_params: SamplingParams
|
sampling_params: SamplingParams
|
||||||
|
|||||||
@@ -1,131 +1,30 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
|
||||||
from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
|
from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
|
||||||
from vllm.logger import init_logger
|
from vllm.multimodal import MultiModalKwargs
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
|
|
||||||
MultiModalKwargs, MultiModalRegistry)
|
|
||||||
from vllm.multimodal.processing import ProcessingCache
|
from vllm.multimodal.processing import ProcessingCache
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
# The idea of multimodal preprocessing caching is based on having a client and
|
# The idea of multimodal preprocessing caching is based on having a client and
|
||||||
# a server, where the client executes in the frontend process (=P0) and the
|
# a server, where the client executes in the frontend process (=P0) and the
|
||||||
# server in the core process (=P1).
|
# server in the core process (=P1).
|
||||||
#
|
#
|
||||||
# -- Client:
|
# -- Client:
|
||||||
# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs.
|
# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs
|
||||||
# - Perform caching of the generated MultiModalKwargs.
|
# with built-in caching functionality, with mm_hash as its identifier.
|
||||||
# - This client can be deprecated once all multimodal models migrate to use
|
|
||||||
# merged preprocessor with built-in caching functionality.
|
|
||||||
#
|
#
|
||||||
# -- Server:
|
# -- Server:
|
||||||
# - Perform caching of the received MultiModalKwargs.
|
# - MMInputCacheServer to perform caching of the received MultiModalKwargs.
|
||||||
#
|
#
|
||||||
# The caching for both client and server is mirrored/similar, and this allows us
|
# The caching for both client and server is mirrored, and this allows us
|
||||||
# to avoid the serialization of "mm_inputs" (like pixel values) between
|
# to avoid the serialization of "mm_inputs" (like pixel values) between
|
||||||
# client (=P0) and server (=P1) processes.
|
# client (=P0) and server (=P1) processes if the mm_hash is found in the client
|
||||||
|
# cache.
|
||||||
|
|
||||||
# Both Client and Server must use the same cache size
|
# Both Client and Server must use the same cache size
|
||||||
# (to perform mirrored caching). This cache size is set by the environment
|
# (to perform mirrored caching). This cache size is set by the environment
|
||||||
# variable VLLM_MM_INPUT_CACHE_GIB.
|
# variable VLLM_MM_INPUT_CACHE_GIB.
|
||||||
|
|
||||||
|
|
||||||
# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
|
|
||||||
# merged preprocessor with built-in caching functionality.
|
|
||||||
class MMInputCacheClient:
    """
    Client-side cache of per-item MultiModalKwargs, keyed by mm_hash.

    Items that are already cached are returned as None so that they do not
    have to be sent to the server again.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):
        self.model_config = model_config
        self.mm_registry = mm_registry
        self.multi_modal_input_mapper = mm_registry.create_input_mapper(
            model_config)
        self.mm_registry.init_mm_limits_per_prompt(model_config)

        # Init cache
        self.use_cache = not model_config.disable_mm_preprocessor_cache
        self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
                                                      MultiModalKwargs)

        # DEBUG: Set to None to disable
        self.mm_debug_cache_hit_ratio_steps = None
        self.mm_debug_cache_hits = 0
        self.mm_debug_cache_total = 0

    def cache_hit_ratio(self, steps):
        """Log the hit ratio each time the lookup count is a multiple of
        `steps`."""
        lookups = self.mm_debug_cache_total
        if lookups <= 0 or lookups % steps != 0:
            return

        logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
                     self.mm_debug_cache_hits / lookups)

    # NOTE: process_inputs only supports image inputs since all multimodal
    # models with other modalities have migrated to use merged preprocessor.
    def process_inputs(
        self,
        mm_data: MultiModalDataDict,
        mm_hashes: Optional[list[str]],
        mm_processor_kwargs: Optional[dict[str, Any]],
        precomputed_mm_inputs: Optional[list[MultiModalKwargs]],
    ) -> list[Optional[MultiModalKwargs]]:
        """
        Produce one entry per multimodal item.

        An entry is None when the item was found in the cache; otherwise it
        is the precomputed input (if provided) or the result of applying the
        legacy input mapper to that single image.
        """
        if precomputed_mm_inputs is not None:
            num_inputs = len(precomputed_mm_inputs)
        else:
            image_inputs = mm_data["image"]
            if not isinstance(image_inputs, list):
                image_inputs = [image_inputs]
            num_inputs = len(image_inputs)

        # Sanity
        if self.use_cache:
            assert mm_hashes is not None
            assert num_inputs == len(mm_hashes)

        # Each item is handled on its own so that it can later be scheduled
        # in a fine-grained manner.
        ret_inputs: list[Optional[MultiModalKwargs]] = []
        for input_id in range(num_inputs):
            if self.mm_debug_cache_hit_ratio_steps is not None:
                self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps)

            cached_input = None
            if self.use_cache:
                assert mm_hashes is not None
                mm_hash = mm_hashes[input_id]
                cached_input = self.mm_cache.get(mm_hash)

            self.mm_debug_cache_total += 1

            if cached_input is not None:
                # Cache hit: avoids sending mm_input to Server
                self.mm_debug_cache_hits += 1
                ret_inputs.append(None)
                continue

            if precomputed_mm_inputs is not None:
                # Reuse precomputed input (for merged preprocessor)
                mm_input = precomputed_mm_inputs[input_id]
            else:
                # Apply legacy input_mapper
                mm_input = self.multi_modal_input_mapper(
                    {"image": [image_inputs[input_id]]},
                    mm_processor_kwargs=mm_processor_kwargs,
                )

            if self.use_cache:
                # Add to cache
                assert mm_hash is not None
                self.mm_cache[mm_hash] = mm_input

            ret_inputs.append(mm_input)

        return ret_inputs
|
|
||||||
|
|
||||||
|
|
||||||
class MMInputCacheServer:
|
class MMInputCacheServer:
|
||||||
|
|
||||||
def __init__(self, model_config):
|
def __init__(self, model_config):
|
||||||
@@ -135,9 +34,9 @@ class MMInputCacheServer:
|
|||||||
|
|
||||||
def get_and_update(
|
def get_and_update(
|
||||||
self,
|
self,
|
||||||
mm_inputs: list[Optional[MultiModalKwargs]],
|
mm_inputs: list[MultiModalKwargs],
|
||||||
mm_hashes: list[str],
|
mm_hashes: list[str],
|
||||||
) -> list[Optional[MultiModalKwargs]]:
|
) -> list[MultiModalKwargs]:
|
||||||
assert len(mm_inputs) == len(mm_hashes)
|
assert len(mm_inputs) == len(mm_hashes)
|
||||||
|
|
||||||
if not self.use_cache:
|
if not self.use_cache:
|
||||||
@@ -147,8 +46,7 @@ class MMInputCacheServer:
|
|||||||
for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
|
for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
|
||||||
assert mm_hash is not None
|
assert mm_hash is not None
|
||||||
if mm_input is None:
|
if mm_input is None:
|
||||||
mm_input = self.mm_cache.get(mm_hash)
|
mm_input = self.mm_cache[mm_hash]
|
||||||
assert mm_input is not None
|
|
||||||
else:
|
else:
|
||||||
self.mm_cache[mm_hash] = mm_input
|
self.mm_cache[mm_hash] = mm_input
|
||||||
|
|
||||||
|
|||||||
@@ -11,15 +11,15 @@ from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
|
|||||||
from vllm.inputs.parse import is_encoder_decoder_inputs
|
from vllm.inputs.parse import is_encoder_decoder_inputs
|
||||||
from vllm.inputs.preprocess import InputPreprocessor
|
from vllm.inputs.preprocess import InputPreprocessor
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
|
||||||
MultiModalKwargs, MultiModalRegistry)
|
MultiModalRegistry)
|
||||||
|
from vllm.multimodal.inputs import PlaceholderRange
|
||||||
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
|
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.mm_input_cache import MMInputCacheClient
|
|
||||||
from vllm.v1.structured_output.utils import validate_structured_output_request
|
from vllm.v1.structured_output.utils import validate_structured_output_request
|
||||||
|
|
||||||
|
|
||||||
@@ -45,11 +45,6 @@ class Processor:
|
|||||||
self.input_preprocessor = InputPreprocessor(self.model_config,
|
self.input_preprocessor = InputPreprocessor(self.model_config,
|
||||||
self.tokenizer,
|
self.tokenizer,
|
||||||
mm_registry)
|
mm_registry)
|
||||||
self.input_processor = input_registry.create_input_processor(
|
|
||||||
self.model_config)
|
|
||||||
|
|
||||||
# Multi-modal (huggingface) input mapper
|
|
||||||
self.mm_input_cache_client = MMInputCacheClient(self.model_config)
|
|
||||||
|
|
||||||
# Multi-modal hasher (for images)
|
# Multi-modal hasher (for images)
|
||||||
self.use_hash = (
|
self.use_hash = (
|
||||||
@@ -171,7 +166,7 @@ class Processor:
|
|||||||
# 2. For multimodal models with a merged preprocessor, preprocess
|
# 2. For multimodal models with a merged preprocessor, preprocess
|
||||||
# multimodal data and expand prompt token ids accordingly.
|
# multimodal data and expand prompt token ids accordingly.
|
||||||
# 3. Apply prompt adapter to prompt token ids if one exists.
|
# 3. Apply prompt adapter to prompt token ids if one exists.
|
||||||
preprocessed_inputs = self.input_preprocessor.preprocess(
|
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||||
prompt,
|
prompt,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
@@ -180,10 +175,6 @@ class Processor:
|
|||||||
)
|
)
|
||||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
||||||
|
|
||||||
# Process prompt and prompt token ids.
|
|
||||||
# Only applicable to multimodal models with legacy input processor.
|
|
||||||
processed_inputs = self.input_processor(preprocessed_inputs)
|
|
||||||
|
|
||||||
self._validate_model_inputs(processed_inputs, lora_request)
|
self._validate_model_inputs(processed_inputs, lora_request)
|
||||||
|
|
||||||
if is_encoder_decoder_inputs(processed_inputs):
|
if is_encoder_decoder_inputs(processed_inputs):
|
||||||
@@ -212,36 +203,22 @@ class Processor:
|
|||||||
self.tokenizer.get_lora_tokenizer(lora_request))
|
self.tokenizer.get_lora_tokenizer(lora_request))
|
||||||
|
|
||||||
# Multimodal related.
|
# Multimodal related.
|
||||||
# Compute MM hashes (if enabled)
|
sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
||||||
mm_hashes = None
|
sorted_mm_positions: Optional[list[PlaceholderRange]] = None
|
||||||
if self.use_hash:
|
sorted_mm_hashes: Optional[list[str]] = None
|
||||||
# Use mm_hashes from processed inputs if the model has merged
|
if (decoder_mm_inputs := decoder_inputs.multi_modal_data):
|
||||||
# input processor.
|
assert isinstance(decoder_mm_inputs, MultiModalKwargs)
|
||||||
if decoder_inputs.multi_modal_hashes:
|
|
||||||
mm_hashes = decoder_inputs.multi_modal_hashes
|
|
||||||
# Fallback to using MultiModalHasher directly.
|
|
||||||
else:
|
|
||||||
mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)
|
|
||||||
|
|
||||||
# For merged preprocessor, mm_data is already mm_inputs
|
# The output of merged multi-modal processor (`decoder_mm_inputs`)
|
||||||
precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
|
||||||
decoder_mm_data = decoder_inputs.multi_modal_data
|
|
||||||
if isinstance(decoder_mm_data, MultiModalKwargs):
|
|
||||||
# The output of merged multi-modal processor (`decoder_mm_data`)
|
|
||||||
# contains the kwargs for all items from all modalities.
|
# contains the kwargs for all items from all modalities.
|
||||||
# This code separates them so that there is one set of kwargs
|
# This code separates them so that there is one set of kwargs
|
||||||
# per item per modality.
|
# per item per modality.
|
||||||
precomputed_mm_inputs = [
|
individual_mm_inputs = [
|
||||||
MultiModalKwargs.from_items([item])
|
MultiModalKwargs.from_items([item])
|
||||||
for modality in decoder_mm_data.modalities
|
for modality in decoder_mm_inputs.modalities
|
||||||
for item in decoder_mm_data.get_items(modality)
|
for item in decoder_mm_inputs.get_items(modality)
|
||||||
]
|
]
|
||||||
|
|
||||||
mm_positions = decoder_inputs.multi_modal_placeholders
|
|
||||||
|
|
||||||
# Last-mile processing of multimodal metadata and inputs.
|
|
||||||
if mm_positions:
|
|
||||||
|
|
||||||
# Merge and flatten multimodal placeholders, hashes and inputs
|
# Merge and flatten multimodal placeholders, hashes and inputs
|
||||||
# from dictionaries to lists, and sort them by each item's position
|
# from dictionaries to lists, and sort them by each item's position
|
||||||
# in the input sequence.
|
# in the input sequence.
|
||||||
@@ -251,14 +228,13 @@ class Processor:
|
|||||||
sorted_mm_positions,
|
sorted_mm_positions,
|
||||||
sorted_mm_hashes,
|
sorted_mm_hashes,
|
||||||
) = merge_and_sort_multimodal_metadata(
|
) = merge_and_sort_multimodal_metadata(
|
||||||
mm_positions,
|
decoder_inputs.multi_modal_placeholders,
|
||||||
mm_hashes,
|
decoder_inputs.multi_modal_hashes if self.use_hash else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
|
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
|
||||||
# modalities involved AND the model supports merged input processor.
|
# modalities involved.
|
||||||
if len(sorted_modalities) > 1 and precomputed_mm_inputs:
|
if len(sorted_modalities) > 1:
|
||||||
|
|
||||||
modality_order_dict = {
|
modality_order_dict = {
|
||||||
modality: order
|
modality: order
|
||||||
for order, modality in enumerate(sorted_modalities)
|
for order, modality in enumerate(sorted_modalities)
|
||||||
@@ -266,26 +242,16 @@ class Processor:
|
|||||||
|
|
||||||
# Sanity check to make sure each multimodal input has only one
|
# Sanity check to make sure each multimodal input has only one
|
||||||
# modality key.
|
# modality key.
|
||||||
for mm_input in precomputed_mm_inputs:
|
for mm_input in individual_mm_inputs:
|
||||||
assert len(mm_input.modalities) == 1
|
assert len(mm_input.modalities) == 1
|
||||||
|
|
||||||
# Sort MultiModalKwags to match sorted_mm_positions
|
# Sort MultiModalKwargs to match sorted_mm_positions
|
||||||
precomputed_mm_inputs = sorted(
|
sorted_mm_inputs = sorted(
|
||||||
precomputed_mm_inputs,
|
individual_mm_inputs,
|
||||||
key=lambda mm_input: modality_order_dict[list(
|
key=lambda mm_input: modality_order_dict[list(
|
||||||
mm_input.modalities)[0]])
|
mm_input.modalities)[0]])
|
||||||
|
else:
|
||||||
# Apply mm input cache update and legacy input mapper if one exists.
|
sorted_mm_inputs = individual_mm_inputs
|
||||||
sorted_mm_inputs = self.mm_input_cache_client.process_inputs(
|
|
||||||
mm_data=decoder_mm_data,
|
|
||||||
mm_hashes=sorted_mm_hashes,
|
|
||||||
mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
|
|
||||||
precomputed_mm_inputs=precomputed_mm_inputs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
sorted_mm_inputs = None
|
|
||||||
sorted_mm_hashes = None
|
|
||||||
sorted_mm_positions = None
|
|
||||||
|
|
||||||
return EngineCoreRequest(
|
return EngineCoreRequest(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class StructuredOutputManager:
|
|||||||
tokenizer_group.ping()
|
tokenizer_group.ping()
|
||||||
|
|
||||||
tokenizer = tokenizer_group.get_lora_tokenizer(None)
|
tokenizer = tokenizer_group.get_lora_tokenizer(None)
|
||||||
self.vocab_size = len(tokenizer.get_vocab())
|
self.vocab_size = self.vllm_config.model_config.get_vocab_size()
|
||||||
if isinstance(tokenizer, MistralTokenizer):
|
if isinstance(tokenizer, MistralTokenizer):
|
||||||
# NOTE: ideally, xgrammar should handle this accordingly.
|
# NOTE: ideally, xgrammar should handle this accordingly.
|
||||||
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
|
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
|
||||||
|
|||||||
@@ -29,7 +29,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
|||||||
is_pin_memory_available)
|
is_pin_memory_available)
|
||||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
|
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
|
||||||
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
||||||
from vllm.v1.engine.mm_input_cache import MMInputCacheClient
|
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheSpec)
|
KVCacheSpec)
|
||||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
|
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
|
||||||
@@ -133,14 +132,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
self.mm_registry = MULTIMODAL_REGISTRY
|
self.mm_registry = MULTIMODAL_REGISTRY
|
||||||
self.uses_mrope = model_config.uses_mrope
|
self.uses_mrope = model_config.uses_mrope
|
||||||
|
|
||||||
if self.is_multimodal_model:
|
|
||||||
# NOTE: Initialized client is only used for processing dummy
|
|
||||||
# multimodal data into multimodal kwargs for GPU memory profiling.
|
|
||||||
# Only applicable to multimodal models with legacy input mapper.
|
|
||||||
self.mm_input_mapper_profiling = MMInputCacheClient(
|
|
||||||
self.model_config)
|
|
||||||
self.mm_input_mapper_profiling.use_cache = False
|
|
||||||
|
|
||||||
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
@@ -1376,32 +1367,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
mm_registry=self.mm_registry,
|
mm_registry=self.mm_registry,
|
||||||
)
|
)
|
||||||
dummy_mm_data = dummy_request_data.multi_modal_data
|
dummy_mm_data = dummy_request_data.multi_modal_data
|
||||||
|
if not isinstance(dummy_mm_data, MultiModalKwargs):
|
||||||
|
# TODO: Delete this check once input mapper is fully removed.
|
||||||
|
raise RuntimeError(
|
||||||
|
"Legacy input mapper is not supported in V1")
|
||||||
|
|
||||||
# Dummy data definition in V0 may contain multiple multimodal items
|
# Dummy data definition may contain multiple multimodal items
|
||||||
# (e.g, multiple images) for a single request, therefore here we
|
# (e.g, multiple images) for a single request, therefore here we
|
||||||
# always replicate first item by max_num_mm_items times since in V1
|
# always replicate first item by max_num_mm_items times since in V1
|
||||||
# they are scheduled to be processed separately.
|
# they are scheduled to be processed separately.
|
||||||
|
dummy_mm_item = dummy_mm_data.get_item(
|
||||||
# Case when models have a merged processor, their dummy data is
|
modality=dummy_data_modality, item_index=0)
|
||||||
# already batched `MultiModalKwargs`, therefore we take the first
|
dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
|
||||||
# `MultiModalKwargsItem` from the desired modality to profile on.
|
|
||||||
if isinstance(dummy_mm_data, MultiModalKwargs):
|
|
||||||
dummy_mm_item = dummy_mm_data.get_item(
|
|
||||||
modality=dummy_data_modality, item_index=0)
|
|
||||||
dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
|
|
||||||
|
|
||||||
# Case when models have dummy data explicitly defined as
|
|
||||||
# `MultiModalDataDict`, so they need to be processed through input
|
|
||||||
# mapper.
|
|
||||||
# TODO (ywang96): deprecate this path once merged processor is
|
|
||||||
# supported on all models.
|
|
||||||
else:
|
|
||||||
mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
|
|
||||||
mm_data=dummy_mm_data,
|
|
||||||
mm_hashes=None,
|
|
||||||
mm_processor_kwargs=None,
|
|
||||||
precomputed_mm_inputs=None)
|
|
||||||
dummy_mm_kwargs = mm_kwargs_list[0]
|
|
||||||
|
|
||||||
batched_dummy_mm_inputs = MultiModalKwargs.batch(
|
batched_dummy_mm_inputs = MultiModalKwargs.batch(
|
||||||
[dummy_mm_kwargs] * max_num_mm_items)
|
[dummy_mm_kwargs] * max_num_mm_items)
|
||||||
|
|||||||
@@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
|
|||||||
from vllm.sampling_params import SamplingType
|
from vllm.sampling_params import SamplingType
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
|
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
|
||||||
from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
|
from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
|
||||||
PallasAttentionBackend,
|
|
||||||
PallasMetadata)
|
PallasMetadata)
|
||||||
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
@@ -139,10 +138,8 @@ class TPUModelRunner:
|
|||||||
device="cpu")
|
device="cpu")
|
||||||
self.slot_mapping_np = self.slot_mapping_cpu.numpy()
|
self.slot_mapping_np = self.slot_mapping_cpu.numpy()
|
||||||
|
|
||||||
padded_max_num_blocks_per_req = _get_padded_number(
|
|
||||||
self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
|
|
||||||
self.block_table_cpu = torch.zeros(
|
self.block_table_cpu = torch.zeros(
|
||||||
(self.max_num_tokens, padded_max_num_blocks_per_req),
|
(self.max_num_tokens, self.max_num_blocks_per_req),
|
||||||
dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
|
dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
|
||||||
device="cpu")
|
device="cpu")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user