[Bugfix] Fix Qwen3-VL timestamp mismatch when using num_frames without fps (#36136)

Signed-off-by: OiPunk <codingpunk@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 18:13:06 +08:00
parent 9c34e9d24f
commit 724759684c
2 changed files with 116 additions and 4 deletions
--- a/tests/models/multimodal/processing/test_qwen3_vl.py
+++ b/tests/models/multimodal/processing/test_qwen3_vl.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Regression tests for Qwen3-VL processor.
+
+Covers the fix for num_frames-based timestamp calculation
+(issue vllm-project/vllm#35909).
+"""
+
+from typing import Any
+
+import numpy as np
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ...utils import build_model_context
+
+MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
+
+
+def _build_video_mm_data(
+    num_frames: int,
+    width: int = 128,
+    height: int = 128,
+    original_fps: float = 30.0,
+) -> dict[str, Any]:
+    """Create synthetic video data with metadata indicating that
+    HF processor should re-sample frames (do_sample_frames=True).
+
+    ``total_num_frames`` is set equal to the ndarray frame count so
+    that HF's ``sample_frames`` indices stay within bounds of the
+    actual tensor that is passed."""
+    video = np.zeros((num_frames, height, width, 3), dtype=np.uint8)
+    metadata = {
+        "fps": original_fps,
+        "duration": num_frames / original_fps,
+        "total_num_frames": num_frames,
+        "frames_indices": list(range(num_frames)),
+        "video_backend": "opencv",
+        "do_sample_frames": True,
+    }
+    return {"video": [(video, metadata)]}
+
+
+@pytest.mark.parametrize("model_id", [MODEL_ID])
+@pytest.mark.parametrize(
+    "num_frames",
+    [8, 16],
+)
+def test_processor_num_frames_timestamp(
+    model_id: str,
+    num_frames: int,
+) -> None:
+    """Regression test: using ``num_frames`` (without ``fps``) must not
+    cause a timestamp / token-count mismatch.
+
+    Before the fix, ``_get_video_second_idx`` ignored the explicit
+    ``num_frames`` and fell back to an fps-based calculation, which
+    produced a different number of timestamp entries and ultimately led
+    to shape mismatches in downstream token construction.
+
+    We deliberately choose ``num_frames`` values (8, 16) that differ
+    from what the default fps-based path would compute (which clamps
+    to ``min_frames=4`` for a short video at 30 fps), so this test
+    would fail without the fix.
+    """
+    ctx = build_model_context(
+        model_id,
+        limit_mm_per_prompt={"image": 0, "video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+
+    prompt = "<|vision_start|><|video_pad|><|vision_end|>"
+    mm_data = _build_video_mm_data(num_frames=num_frames)
+
+    # Process with explicit num_frames (no fps) -- this is the path
+    # that was broken before the fix.
+    hf_mm_kwargs: dict[str, Any] = {"num_frames": num_frames}
+    processed = processor(
+        prompt,
+        mm_items=processor.info.parse_mm_data(mm_data),
+        hf_processor_mm_kwargs=hf_mm_kwargs,
+    )
+
+    # Basic sanity: the processor must produce video tokens.
+    token_ids = processed["prompt_token_ids"]
+    assert len(token_ids) > 0, "Processor produced empty token list"
+
+    # Verify that video placeholders were actually inserted.
+    assert "mm_placeholders" in processed
+    video_phs = processed["mm_placeholders"].get("video", [])
+    assert len(video_phs) == 1, (
+        f"Expected exactly 1 video placeholder, got {len(video_phs)}"
+    )
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -768,6 +768,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
        metadata: dict[str, Any],
        do_sample_frames: bool | None = None,
        sampled_fps: float | None = None,
+        sampled_num_frames: int | None = None,
    ) -> list[int]:
        video_processor = self.get_video_processor()
        merge_size = video_processor.merge_size
@@ -782,11 +783,20 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
        # video loader), we need to re-calculate the indices from original
        # metadata.
        if do_sample_frames:
-            # here video_fps is the fps of the sampled video, and
-            # metadata["fps"] refers to the fps of the original video.
-            sampled_fps = sampled_fps if sampled_fps else video_processor.fps
            total_num_frames = metadata["total_num_frames"]
-            num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
+
+            # When num_frames is explicitly provided, use it directly
+            # instead of computing from fps. This mirrors the behavior of
+            # HF's Qwen3VLVideoProcessor.sample_frames where num_frames
+            # and fps are mutually exclusive.
+            if sampled_num_frames is not None:
+                num_frames = sampled_num_frames
+            else:
+                # here video_fps is the fps of the sampled video, and
+                # metadata["fps"] refers to the fps of the original video.
+                sampled_fps = sampled_fps if sampled_fps else video_processor.fps
+                num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
+
            num_frames = min(
                min(
                    max(num_frames, video_processor.min_frames),
@@ -987,6 +997,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
                    metadata=metadata,
                    do_sample_frames=video_mm_kwargs["do_sample_frames"],
                    sampled_fps=video_mm_kwargs.get("fps"),
+                    sampled_num_frames=video_mm_kwargs.get("num_frames"),
                )
                timestamps_per_video.append(timestamps)

@@ -994,6 +1005,13 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
                video_mm_data["videos"] = [[video_array]]
                video_mm_data["video_metadata"] = [[metadata]]

+                # When num_frames is specified, explicitly set fps=None
+                # to prevent HF's BaseVideoProcessor.preprocess() from
+                # filling in the class default (fps=2) via setdefault(),
+                # which would conflict with num_frames (mutually exclusive).
+                if "num_frames" in video_mm_kwargs and "fps" not in video_mm_kwargs:
+                    video_mm_kwargs["fps"] = None
+
                video_outputs = super()._call_hf_processor(
                    prompt="<|vision_start|><|video_pad|><|vision_end|>",
                    mm_data=video_mm_data,