# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
"""Regression tests for Qwen3-VL processor.

Covers the fix for num_frames-based timestamp calculation
(issue vllm-project/vllm#35909).
"""

|
from typing import Any

import numpy as np
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context

|
# HF model ID exercised by the parametrized regression test below.
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"


def _build_video_mm_data(
|
||
|
|
num_frames: int,
|
||
|
|
width: int = 128,
|
||
|
|
height: int = 128,
|
||
|
|
original_fps: float = 30.0,
|
||
|
|
) -> dict[str, Any]:
|
||
|
|
"""Create synthetic video data with metadata indicating that
|
||
|
|
HF processor should re-sample frames (do_sample_frames=True).
|
||
|
|
|
||
|
|
``total_num_frames`` is set equal to the ndarray frame count so
|
||
|
|
that HF's ``sample_frames`` indices stay within bounds of the
|
||
|
|
actual tensor that is passed."""
|
||
|
|
video = np.zeros((num_frames, height, width, 3), dtype=np.uint8)
|
||
|
|
metadata = {
|
||
|
|
"fps": original_fps,
|
||
|
|
"duration": num_frames / original_fps,
|
||
|
|
"total_num_frames": num_frames,
|
||
|
|
"frames_indices": list(range(num_frames)),
|
||
|
|
"video_backend": "opencv",
|
||
|
|
"do_sample_frames": True,
|
||
|
|
}
|
||
|
|
return {"video": [(video, metadata)]}


@pytest.mark.parametrize("model_id", [MODEL_ID])
@pytest.mark.parametrize(
    "num_frames",
    [8, 16],
)
def test_processor_num_frames_timestamp(
    model_id: str,
    num_frames: int,
) -> None:
    """Regression test for vllm-project/vllm#35909.

    Passing an explicit ``num_frames`` (with no ``fps``) must not cause
    a timestamp / token-count mismatch. Before the fix,
    ``_get_video_second_idx`` ignored the explicit ``num_frames`` and
    fell back to an fps-based calculation, which produced a different
    number of timestamp entries and ultimately led to shape mismatches
    in downstream token construction.

    The values 8 and 16 are chosen deliberately: they differ from what
    the default fps-based path would compute (which clamps to
    ``min_frames=4`` for a short video at 30 fps), so this test would
    fail without the fix.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"image": 0, "video": 1},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    video_prompt = "<|vision_start|><|video_pad|><|vision_end|>"
    video_data = _build_video_mm_data(num_frames=num_frames)

    # Explicit num_frames without fps -- the code path that was broken
    # before the fix.
    sampling_kwargs: dict[str, Any] = {"num_frames": num_frames}
    processed = mm_processor(
        video_prompt,
        mm_items=mm_processor.info.parse_mm_data(video_data),
        hf_processor_mm_kwargs=sampling_kwargs,
    )

    # Basic sanity: the processor must produce video tokens.
    assert len(processed["prompt_token_ids"]) > 0, (
        "Processor produced empty token list"
    )

    # Verify that exactly one video placeholder was inserted.
    assert "mm_placeholders" in processed
    video_phs = processed["mm_placeholders"].get("video", [])
    assert len(video_phs) == 1, (
        f"Expected exactly 1 video placeholder, got {len(video_phs)}"
    )