[Bugfix] Fix Qwen3-VL timestamp mismatch when using num_frames without fps (#36136)
Signed-off-by: OiPunk <codingpunk@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
94
tests/models/multimodal/processing/test_qwen3_vl.py
Normal file
94
tests/models/multimodal/processing/test_qwen3_vl.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Regression tests for Qwen3-VL processor.
|
||||
|
||||
Covers the fix for num_frames-based timestamp calculation
|
||||
(issue vllm-project/vllm#35909).
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
|
||||
|
||||
|
||||
def _build_video_mm_data(
|
||||
num_frames: int,
|
||||
width: int = 128,
|
||||
height: int = 128,
|
||||
original_fps: float = 30.0,
|
||||
) -> dict[str, Any]:
|
||||
"""Create synthetic video data with metadata indicating that
|
||||
HF processor should re-sample frames (do_sample_frames=True).
|
||||
|
||||
``total_num_frames`` is set equal to the ndarray frame count so
|
||||
that HF's ``sample_frames`` indices stay within bounds of the
|
||||
actual tensor that is passed."""
|
||||
video = np.zeros((num_frames, height, width, 3), dtype=np.uint8)
|
||||
metadata = {
|
||||
"fps": original_fps,
|
||||
"duration": num_frames / original_fps,
|
||||
"total_num_frames": num_frames,
|
||||
"frames_indices": list(range(num_frames)),
|
||||
"video_backend": "opencv",
|
||||
"do_sample_frames": True,
|
||||
}
|
||||
return {"video": [(video, metadata)]}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [MODEL_ID])
|
||||
@pytest.mark.parametrize(
|
||||
"num_frames",
|
||||
[8, 16],
|
||||
)
|
||||
def test_processor_num_frames_timestamp(
|
||||
model_id: str,
|
||||
num_frames: int,
|
||||
) -> None:
|
||||
"""Regression test: using ``num_frames`` (without ``fps``) must not
|
||||
cause a timestamp / token-count mismatch.
|
||||
|
||||
Before the fix, ``_get_video_second_idx`` ignored the explicit
|
||||
``num_frames`` and fell back to an fps-based calculation, which
|
||||
produced a different number of timestamp entries and ultimately led
|
||||
to shape mismatches in downstream token construction.
|
||||
|
||||
We deliberately choose ``num_frames`` values (8, 16) that differ
|
||||
from what the default fps-based path would compute (which clamps
|
||||
to ``min_frames=4`` for a short video at 30 fps), so this test
|
||||
would fail without the fix.
|
||||
"""
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
limit_mm_per_prompt={"image": 0, "video": 1},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
prompt = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
mm_data = _build_video_mm_data(num_frames=num_frames)
|
||||
|
||||
# Process with explicit num_frames (no fps) -- this is the path
|
||||
# that was broken before the fix.
|
||||
hf_mm_kwargs: dict[str, Any] = {"num_frames": num_frames}
|
||||
processed = processor(
|
||||
prompt,
|
||||
mm_items=processor.info.parse_mm_data(mm_data),
|
||||
hf_processor_mm_kwargs=hf_mm_kwargs,
|
||||
)
|
||||
|
||||
# Basic sanity: the processor must produce video tokens.
|
||||
token_ids = processed["prompt_token_ids"]
|
||||
assert len(token_ids) > 0, "Processor produced empty token list"
|
||||
|
||||
# Verify that video placeholders were actually inserted.
|
||||
assert "mm_placeholders" in processed
|
||||
video_phs = processed["mm_placeholders"].get("video", [])
|
||||
assert len(video_phs) == 1, (
|
||||
f"Expected exactly 1 video placeholder, got {len(video_phs)}"
|
||||
)
|
||||
@@ -768,6 +768,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
metadata: dict[str, Any],
|
||||
do_sample_frames: bool | None = None,
|
||||
sampled_fps: float | None = None,
|
||||
sampled_num_frames: int | None = None,
|
||||
) -> list[int]:
|
||||
video_processor = self.get_video_processor()
|
||||
merge_size = video_processor.merge_size
|
||||
@@ -782,11 +783,20 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
# video loader), we need to re-calculate the indices from original
|
||||
# metadata.
|
||||
if do_sample_frames:
|
||||
# here video_fps is the fps of the sampled video, and
|
||||
# metadata["fps"] refers to the fps of the original video.
|
||||
sampled_fps = sampled_fps if sampled_fps else video_processor.fps
|
||||
total_num_frames = metadata["total_num_frames"]
|
||||
num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
|
||||
|
||||
# When num_frames is explicitly provided, use it directly
|
||||
# instead of computing from fps. This mirrors the behavior of
|
||||
# HF's Qwen3VLVideoProcessor.sample_frames where num_frames
|
||||
# and fps are mutually exclusive.
|
||||
if sampled_num_frames is not None:
|
||||
num_frames = sampled_num_frames
|
||||
else:
|
||||
# here video_fps is the fps of the sampled video, and
|
||||
# metadata["fps"] refers to the fps of the original video.
|
||||
sampled_fps = sampled_fps if sampled_fps else video_processor.fps
|
||||
num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
|
||||
|
||||
num_frames = min(
|
||||
min(
|
||||
max(num_frames, video_processor.min_frames),
|
||||
@@ -987,6 +997,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
|
||||
metadata=metadata,
|
||||
do_sample_frames=video_mm_kwargs["do_sample_frames"],
|
||||
sampled_fps=video_mm_kwargs.get("fps"),
|
||||
sampled_num_frames=video_mm_kwargs.get("num_frames"),
|
||||
)
|
||||
timestamps_per_video.append(timestamps)
|
||||
|
||||
@@ -994,6 +1005,13 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
|
||||
video_mm_data["videos"] = [[video_array]]
|
||||
video_mm_data["video_metadata"] = [[metadata]]
|
||||
|
||||
# When num_frames is specified, explicitly set fps=None
|
||||
# to prevent HF's BaseVideoProcessor.preprocess() from
|
||||
# filling in the class default (fps=2) via setdefault(),
|
||||
# which would conflict with num_frames (mutually exclusive).
|
||||
if "num_frames" in video_mm_kwargs and "fps" not in video_mm_kwargs:
|
||||
video_mm_kwargs["fps"] = None
|
||||
|
||||
video_outputs = super()._call_hf_processor(
|
||||
prompt="<|vision_start|><|video_pad|><|vision_end|>",
|
||||
mm_data=video_mm_data,
|
||||
|
||||
Reference in New Issue
Block a user