[Bugfix] Fix Qwen3-VL timestamp mismatch when using num_frames without fps (#36136)

Signed-off-by: OiPunk <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Weiguang Li
2026-03-11 18:13:06 +08:00
committed by GitHub
parent 9c34e9d24f
commit 724759684c
2 changed files with 116 additions and 4 deletions

View File

@@ -768,6 +768,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
metadata: dict[str, Any],
do_sample_frames: bool | None = None,
sampled_fps: float | None = None,
sampled_num_frames: int | None = None,
) -> list[int]:
video_processor = self.get_video_processor()
merge_size = video_processor.merge_size
@@ -782,11 +783,20 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
# video loader), we need to re-calculate the indices from original
# metadata.
if do_sample_frames:
# here video_fps is the fps of the sampled video, and
# metadata["fps"] refers to the fps of the original video.
sampled_fps = sampled_fps if sampled_fps else video_processor.fps
total_num_frames = metadata["total_num_frames"]
num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
# When num_frames is explicitly provided, use it directly
# instead of computing from fps. This mirrors the behavior of
# HF's Qwen3VLVideoProcessor.sample_frames where num_frames
# and fps are mutually exclusive.
if sampled_num_frames is not None:
num_frames = sampled_num_frames
else:
# here video_fps is the fps of the sampled video, and
# metadata["fps"] refers to the fps of the original video.
sampled_fps = sampled_fps if sampled_fps else video_processor.fps
num_frames = int(total_num_frames / metadata["fps"] * sampled_fps)
num_frames = min(
min(
max(num_frames, video_processor.min_frames),
@@ -987,6 +997,7 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
metadata=metadata,
do_sample_frames=video_mm_kwargs["do_sample_frames"],
sampled_fps=video_mm_kwargs.get("fps"),
sampled_num_frames=video_mm_kwargs.get("num_frames"),
)
timestamps_per_video.append(timestamps)
@@ -994,6 +1005,13 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo])
video_mm_data["videos"] = [[video_array]]
video_mm_data["video_metadata"] = [[metadata]]
# When num_frames is specified, explicitly set fps=None
# to prevent HF's BaseVideoProcessor.preprocess() from
# filling in the class default (fps=2) via setdefault(),
# which would conflict with num_frames (mutually exclusive).
if "num_frames" in video_mm_kwargs and "fps" not in video_mm_kwargs:
video_mm_kwargs["fps"] = None
video_outputs = super()._call_hf_processor(
prompt="<|vision_start|><|video_pad|><|vision_end|>",
mm_data=video_mm_data,