# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Regression tests for the Qwen3-VL multimodal processor.

Exercises the fix for ``num_frames``-based timestamp calculation
(issue vllm-project/vllm#35909).
"""

from typing import Any

import numpy as np
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context

MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"


def _build_video_mm_data(
    num_frames: int,
    width: int = 128,
    height: int = 128,
    original_fps: float = 30.0,
) -> dict[str, Any]:
    """Return multimodal input holding one all-zero synthetic video.

    The clip is paired with metadata whose ``do_sample_frames`` flag is
    ``True``, i.e. the HF processor is asked to re-sample frames itself.
    ``total_num_frames`` is kept equal to the number of frames in the
    array so that HF's ``sample_frames`` indices stay within the bounds
    of the tensor that is actually passed.

    Args:
        num_frames: Number of frames in the generated clip.
        width: Frame width in pixels.
        height: Frame height in pixels.
        original_fps: Frame rate recorded in the metadata; the reported
            duration is derived from it.

    Returns:
        A dict of the form ``{"video": [(frames, metadata)]}``.
    """
    frame_shape = (num_frames, height, width, 3)
    frames = np.zeros(frame_shape, dtype=np.uint8)

    # Duration is derived from the frame count and the declared frame rate,
    # so the metadata is self-consistent.
    clip_seconds = num_frames / original_fps
    every_frame_index = list(range(num_frames))

    metadata: dict[str, Any] = {
        "fps": original_fps,
        "duration": clip_seconds,
        "total_num_frames": num_frames,
        "frames_indices": every_frame_index,
        "video_backend": "opencv",
        "do_sample_frames": True,
    }
    return {"video": [(frames, metadata)]}


@pytest.mark.parametrize("model_id", [MODEL_ID])
@pytest.mark.parametrize("num_frames", [8, 16])
def test_processor_num_frames_timestamp(
    model_id: str,
    num_frames: int,
) -> None:
    """Passing ``num_frames`` without ``fps`` must not break processing.

    Regression scenario: before the fix, ``_get_video_second_idx`` ignored
    an explicit ``num_frames`` and fell back to an fps-based calculation.
    That yielded a different number of timestamp entries and ultimately
    caused shape mismatches in downstream token construction.

    The parametrized ``num_frames`` values (8, 16) are chosen on purpose to
    differ from what the default fps-based path would compute (it clamps to
    ``min_frames=4`` for a short video at 30 fps), so the test fails
    without the fix.
    """
    # Only a single video (and no images) is allowed in the prompt.
    mm_limits = {"image": 0, "video": 1}
    ctx = build_model_context(model_id, limit_mm_per_prompt=mm_limits)
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    prompt = "<|vision_start|><|video_pad|><|vision_end|>"
    mm_data = _build_video_mm_data(num_frames=num_frames)
    mm_items = processor.info.parse_mm_data(mm_data)

    # Explicit ``num_frames`` and no ``fps``: this is precisely the code
    # path that used to be broken.
    processed = processor(
        prompt,
        mm_items=mm_items,
        hf_processor_mm_kwargs={"num_frames": num_frames},
    )

    # Sanity check: some tokens must come out of the processor.
    token_ids = processed["prompt_token_ids"]
    token_count = len(token_ids)
    assert token_count > 0, "Processor produced empty token list"

    # The video placeholder must really have been inserted, exactly once.
    assert "mm_placeholders" in processed
    video_phs = processed["mm_placeholders"].get("video", [])
    assert len(video_phs) == 1, (
        f"Expected exactly 1 video placeholder, got {len(video_phs)}"
    )