[VLM] Add Qwen3-VL generation test (#25185)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 20:19:37 +08:00
parent 3481e40743
commit ad3ec89532
7 changed files with 108 additions and 5 deletions
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -4,7 +4,9 @@

 from collections.abc import Callable, Iterable
 from pathlib import PosixPath
+from typing import Any

+import numpy.typing as npt
 import torch

 from vllm.multimodal.audio import AudioResampler
@@ -236,6 +238,7 @@ def build_video_inputs_from_test_info(
    video_assets: VideoTestAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
+    needs_video_metadata: bool,
 ) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")
@@ -248,7 +251,10 @@ def build_video_inputs_from_test_info(
    )

    sampled_vids = [
-        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        sample_frames_with_video_metadata(
+            (asset.np_ndarrays, asset.metadata),
+            num_frames,
+        )
        for asset in video_assets
    ]

@@ -259,12 +265,33 @@ def build_video_inputs_from_test_info(
    return [
        PromptWithMultiModalInput(
            prompts=[prompt for _ in size_wrapper.data],
-            video_data=[video_scaler(video, size) for size in size_wrapper.data],
+            video_data=[
+                (
+                    video_scaler(video, size)
+                    if not needs_video_metadata
+                    else (video_scaler(video, size), meta)
+                )
+                for size in size_wrapper.data
+            ],
        )
-        for video, prompt in zip(sampled_vids, model_prompts)
+        for (video, meta), prompt in zip(sampled_vids, model_prompts)
    ]


+def sample_frames_with_video_metadata(
+    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
+    num_frames: int,
+) -> tuple[npt.NDArray, dict[str, Any]]:
+    video, meta = video_with_meta
+    video = sample_frames_from_video(video, num_frames)
+
+    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
+    meta["total_num_frames"] = num_frames
+    meta["fps"] = meta["duration"] / num_frames
+    meta["frames_indices"] = list(range(num_frames))
+    return video, meta
+
+
 def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
    """Applies a size scaler to one image; this can be an image size factor,
    which scales the image while maintaining the aspect ratio"""