# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
"""Regression tests for Qwen3-VL processor.

Covers the fix for num_frames-based timestamp calculation
(issue vllm-project/vllm#35909).
"""

|
from typing import Any

import numpy as np
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context

|
# HF model ID exercised by the parametrized regression test below.
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"


def _build_video_mm_data(
|
||
|
|
num_frames: int,
|
||
|
|
width: int = 128,
|
||
|
|
height: int = 128,
|
||
|
|
original_fps: float = 30.0,
|
||
|
|
) -> dict[str, Any]:
|
||
|
|
"""Create synthetic video data with metadata indicating that
|
||
|
|
HF processor should re-sample frames (do_sample_frames=True).
|
||
|
|
|
||
|
|
``total_num_frames`` is set equal to the ndarray frame count so
|
||
|
|
that HF's ``sample_frames`` indices stay within bounds of the
|
||
|
|
actual tensor that is passed."""
|
||
|
|
video = np.zeros((num_frames, height, width, 3), dtype=np.uint8)
|
||
|
|
metadata = {
|
||
|
|
"fps": original_fps,
|
||
|
|
"duration": num_frames / original_fps,
|
||
|
|
"total_num_frames": num_frames,
|
||
|
|
"frames_indices": list(range(num_frames)),
|
||
|
|
"video_backend": "opencv",
|
||
|
|
"do_sample_frames": True,
|
||
|
|
}
|
||
|
|
return {"video": [(video, metadata)]}


@pytest.mark.parametrize("model_id", [MODEL_ID])
@pytest.mark.parametrize(
    "num_frames",
    [8, 16],
)
def test_processor_num_frames_timestamp(
    model_id: str,
    num_frames: int,
) -> None:
    """Regression test for vllm-project/vllm#35909.

    Passing an explicit ``num_frames`` (with no ``fps``) must not cause
    a timestamp / token-count mismatch. Before the fix,
    ``_get_video_second_idx`` ignored the explicit ``num_frames`` and
    fell back to an fps-based calculation, which produced a different
    number of timestamp entries and ultimately led to shape mismatches
    in downstream token construction.

    The values 8 and 16 are chosen deliberately: they differ from what
    the default fps-based path would compute (which clamps to
    ``min_frames=4`` for a short video at 30 fps), so this test would
    fail without the fix.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"image": 0, "video": 1},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)

    video_prompt = "<|vision_start|><|video_pad|><|vision_end|>"
    video_data = _build_video_mm_data(num_frames=num_frames)

    # Explicit num_frames without fps -- the code path that was broken
    # before the fix.
    sampling_kwargs: dict[str, Any] = {"num_frames": num_frames}
    processed = mm_processor(
        video_prompt,
        mm_items=mm_processor.info.parse_mm_data(video_data),
        hf_processor_mm_kwargs=sampling_kwargs,
    )

    # Basic sanity: the processor must produce video tokens.
    assert len(processed["prompt_token_ids"]) > 0, (
        "Processor produced empty token list"
    )

    # Verify that exactly one video placeholder was inserted.
    assert "mm_placeholders" in processed
    video_phs = processed["mm_placeholders"].get("video", [])
    assert len(video_phs) == 1, (
        f"Expected exactly 1 video placeholder, got {len(video_phs)}"
    )