EVS Support (Video tokens pruning) (#22980)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com> Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-26 06:54:54 +03:00
parent 983056e456
commit 392edee34a
8 changed files with 783 additions and 39 deletions
--- a/tests/models/multimodal/generation/test_qwen2_5_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.multimodal.video import sample_frames_from_video
+
+from ....conftest import VIDEO_ASSETS
+
+models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+target_dtype = "bfloat16"
+
+VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
+
+
+def qwen2_5_vl_chat_template(*query):
+    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501
+
+
+VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
+    "baby_reading":
+    qwen2_5_vl_chat_template(
+        VIDEO_PLACEHOLDER,
+        "Describe this video with a short sentence ",
+        "(no more than 20 words)",
+    ),
+})
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
+@pytest.mark.parametrize("num_frames", [16])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
+                                      video_pruning_rate: float,
+                                      num_frames: int, dtype: str,
+                                      max_tokens: int) -> None:
+    """Test EVS (Efficient Video Sampling) functionality with different
+    pruning rates.
+    """
+
+    # Sample frames from video assets
+    sampled_vids = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+
+    prompts = [VIDEO_PROMPTS[0]]
+    videos = [sampled_vids[0]]
+
+    # Initialize model with EVS configuration
+    with vllm_runner(model,
+                     runner="generate",
+                     max_model_len=4000,
+                     max_num_seqs=1,
+                     dtype=dtype,
+                     limit_mm_per_prompt={"video": 1},
+                     tensor_parallel_size=1,
+                     video_pruning_rate=video_pruning_rate) as vllm_model:
+
+        # Generate output - this should not crash
+        outputs = vllm_model.generate_greedy(prompts,
+                                             max_tokens,
+                                             videos=videos)
+
+        # Basic validation that we got a response
+        assert len(outputs) == 1
+        output_ids, output_text = outputs[0]
+
+        # Ensure we got some output
+        assert len(output_ids) > 0
+        assert len(output_text) > 0
+
+        # Ensure the output is a string
+        assert isinstance(output_text, str)
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
+@pytest.mark.parametrize("num_frames", [16])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
+                                       video_pruning_rate: float,
+                                       num_frames: int, dtype: str,
+                                       max_tokens: int) -> None:
+    """Test EVS functionality with batched videos.
+
+    This test validates that:
+    1. The model handles batched video inputs correctly with EVS
+    2. Both pruning configurations work with multiple videos
+    3. The model doesn't crash when processing multiple videos simultaneously
+    """
+    # Sample frames from video assets
+    sampled_vids = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+
+    # Test batched videos
+    prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
+    videos = [sampled_vids[0],
+              sampled_vids[0]]  # Use same video twice for testing
+
+    # Initialize model with EVS configuration
+    with vllm_runner(model,
+                     runner="generate",
+                     max_model_len=4000,
+                     max_num_seqs=2,
+                     dtype=dtype,
+                     limit_mm_per_prompt={"video": 2},
+                     tensor_parallel_size=1,
+                     video_pruning_rate=video_pruning_rate) as vllm_model:
+
+        # Generate output - this should not crash
+        outputs = vllm_model.generate_greedy(prompts,
+                                             max_tokens,
+                                             videos=videos)
+
+        # Basic validation that we got responses for both videos
+        assert len(outputs) == 2
+
+        for output_ids, output_text in outputs:
+            # Ensure we got some output for each video
+            assert len(output_ids) > 0
+            assert len(output_text) > 0
+
+            # Ensure the output is a string
+            assert isinstance(output_text, str)