EVS Support (Video tokens pruning) (#22980)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
committed by
GitHub
parent
983056e456
commit
392edee34a
132
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
132
tests/models/multimodal/generation/test_qwen2_5_vl.py
Normal file
@@ -0,0 +1,132 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
|
||||
from ....conftest import VIDEO_ASSETS
|
||||
|
||||
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
|
||||
target_dtype = "bfloat16"
|
||||
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
def qwen2_5_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
prompts = [VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
|
||||
# Basic validation that we got a response
|
||||
assert len(outputs) == 1
|
||||
output_ids, output_text = outputs[0]
|
||||
|
||||
# Ensure we got some output
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
This test validates that:
|
||||
1. The model handles batched video inputs correctly with EVS
|
||||
2. Both pruning configurations work with multiple videos
|
||||
3. The model doesn't crash when processing multiple videos simultaneously
|
||||
"""
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
# Test batched videos
|
||||
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0],
|
||||
sampled_vids[0]] # Use same video twice for testing
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
|
||||
# Basic validation that we got responses for both videos
|
||||
assert len(outputs) == 2
|
||||
|
||||
for output_ids, output_text in outputs:
|
||||
# Ensure we got some output for each video
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
||||
Reference in New Issue
Block a user