[Bugfix] Fix GLM4.1V multimodal processor with compatability for Transformers v4.56 (#24822)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-15 20:45:06 +08:00
parent 72c99f2a75
commit 0e219cd50b
6 changed files with 118 additions and 70 deletions
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1716,6 +1716,13 @@ model_example_map = {
 }


+MODELS_NEED_VIDEO_METADATA = [
+    "glm4_1v",
+    "glm4_5v",
+    "glm4_5v_fp8",
+]
+
+
 def get_multi_modal_input(args):
    """
    return {
@@ -1740,12 +1747,13 @@ def get_multi_modal_input(args):

    if args.modality == "video":
        # Input video and question
+        needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
        video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
        metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
        vid_questions = ["Why is this video funny?"]

        return {
-            "data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
+            "data": ([(video, metadata)] if needs_metadata else video),
            "questions": vid_questions,
        }

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -32,11 +32,14 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    # Ensure video metadata is included
    if "video" in mm_data:
        video = mm_data["video"]
+        num_frames = len(video)
        mm_data["video"] = (video, {
-            "total_num_frames": len(video),
-            "fps": len(video),
+            "total_num_frames": num_frames,
+            "fps": num_frames,
            "duration": 1,
-            "video_backend": "opencv"
+            "frames_indices": [i for i in range(num_frames)],
+            "video_backend": "opencv",
+            "do_sample_frames": True,
        })
    return mm_data

--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -12,8 +12,19 @@ from ...utils import build_model_context

@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
-@pytest.mark.parametrize("num_frames", [32, 128])
-@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
+@pytest.mark.parametrize(
+    "num_frames, fps, expected_grid_t",
+    [
+        # pre-sampled fixed frames (unexpected behavior,
+        # but we still expect it to work without errors)
+        (32, 1, 16),
+        (32, 2, 16),
+        (128, 1, 64),
+        (128, 2, 64),
+        # post-sampled frames (expected behavior)
+        (-1, 1, 5),
+        (-1, 2, 10),
+    ])
 def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
@@ -80,7 +91,7 @@ def test_video_loader_consistency(

    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, requested_fps=fps)
+        video_bytes, fps=fps)

    # pre-sampled loader shouldn't read all frames
    assert len(dynamic_video) < len(static_video)
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -76,7 +76,7 @@ def video_to_pil_images_list(path: str,
    return [Image.fromarray(frame) for frame in frames]


-def video_get_metadata(path: str) -> dict[str, Any]:
+def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")
@@ -85,11 +85,18 @@ def video_get_metadata(path: str) -> dict[str, Any]:
    fps = cap.get(cv2.CAP_PROP_FPS)
    duration = total_frames / fps if fps > 0 else 0

+    if num_frames == -1 or num_frames > total_frames:
+        num_frames = total_frames
+
    metadata = {
-        "total_num_frames": total_frames,
+        "total_num_frames": num_frames,
        "fps": fps,
        "duration": duration,
-        "video_backend": "opencv"
+        "video_backend": "opencv",
+        "frames_indices": list(range(num_frames)),
+        # extra field used to control hf processor's video
+        # sampling behavior
+        "do_sample_frames": num_frames == total_frames,
    }
    return metadata

@@ -126,7 +133,7 @@ class VideoAsset:

    @property
    def metadata(self) -> dict[str, Any]:
-        ret = video_get_metadata(self.video_path)
+        ret = video_get_metadata(self.video_path, self.num_frames)
        return ret

    def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -36,7 +36,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
+from packaging.version import Version
 from transformers import BatchFeature
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
 from transformers.models.glm4v.image_processing_glm4v import (
    Glm4vImageProcessor, smart_resize)
@@ -1001,28 +1003,32 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
        max_frame_idx = meta_frames - 1
        duration = metadata.get("duration",
                                round(max_frame_idx / video_fps) + 1)
-        if duration <= video_processor.max_duration:
-            n = int(math.floor(duration * video_processor.fps))
-            frame_indices = [
-                min(
-                    max_frame_idx,
-                    int(math.ceil(i * video_fps / video_processor.fps)),
-                ) for i in range(n)
-            ]
+        do_sample_frames = metadata["do_sample_frames"]
+        if not do_sample_frames:
+            frame_indices = metadata["frames_indices"]
        else:
-            num_samples = int(video_processor.max_duration *
-                              video_processor.fps)
-            if num_samples >= meta_frames:
-                frame_indices = list(range(meta_frames))
-            else:
-                target_seconds = np.linspace(0,
-                                             duration,
-                                             num_samples,
-                                             endpoint=True)
+            if duration <= video_processor.max_duration:
+                n = int(math.floor(duration * video_processor.fps))
                frame_indices = [
-                    min(max_frame_idx, int(math.ceil(t * video_fps)))
-                    for t in target_seconds
+                    min(
+                        max_frame_idx,
+                        int(math.ceil(i * video_fps / video_processor.fps)),
+                    ) for i in range(n)
                ]
+            else:
+                num_samples = int(video_processor.max_duration *
+                                  video_processor.fps)
+                if num_samples >= meta_frames:
+                    frame_indices = list(range(meta_frames))
+                else:
+                    target_seconds = np.linspace(0,
+                                                 duration,
+                                                 num_samples,
+                                                 endpoint=True)
+                    frame_indices = [
+                        min(max_frame_idx, int(math.ceil(t * video_fps)))
+                        for t in target_seconds
+                    ]

        seen, uniq = set(), []
        for idx in frame_indices:
@@ -1139,7 +1145,9 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
                "fps": 2.0,
                "duration": num_frames / 2.0,
                "total_num_frames": num_frames,
+                "frames_indices": [i for i in range(num_frames)],
                "video_backend": "opencv",
+                "do_sample_frames": False,
            }
            video_item = (video.copy(), video_metadata)
            video_items.append(video_item)
@@ -1172,34 +1180,37 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
            for item in mm_data.pop("videos", []):
                video_array, metadata = item

-                if metadata["video_backend"] == "opencv_dynamic":
-                    mm_kwargs["do_sample_frames"] = False
-
-                elif metadata["total_num_frames"] != len(video_array):
-                    logger.warning(
-                        "Total frames in metadata "
-                        "(%s) does not match the length of "
-                        "video array %s. This can "
-                        "be because the video is resampled "
-                        "in advance. This may cause "
-                        "a divergence with HF implementation.",
-                        metadata["total_num_frames"],
-                        len(video_array),
-                    )
-                    metadata["total_num_frames"] = len(video_array)
+                # don't update mm_kwargs inplace
+                video_mm_kwargs = dict(**mm_kwargs)
+                video_mm_kwargs["do_sample_frames"] = metadata.get(
+                    "do_sample_frames", True)

                video_mm_data = dict()
                video_mm_data["videos"] = [[video_array]]
-                video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
+
+                # backward compatibility for Transformers 4.55
+                unuse_metadata = ["do_sample_frames"]
+                if not hasattr(
+                        VideoMetadata,
+                        "frames_indices") and "frames_indices" in metadata:
+                    unuse_metadata.append("frames_indices")
+
+                video_mm_data["video_metadata"] = [[
+                    VideoMetadata(
+                        **{
+                            k: metadata[k]
+                            for k in metadata if k not in unuse_metadata
+                        })
+                ]]

                video_outputs = super()._call_hf_processor(
                    prompt="<|begin_of_video|><|video|><|end_of_video|>",
                    mm_data=video_mm_data,
-                    mm_kwargs=mm_kwargs,
+                    mm_kwargs=video_mm_kwargs,
                    tok_kwargs=tok_kwargs,
                )
-                if "do_sample_frames" in mm_kwargs and not mm_kwargs[
-                        "do_sample_frames"]:
+                if not video_mm_kwargs["do_sample_frames"] and Version(
+                        TRANSFORMERS_VERSION) < Version("4.56.0"):
                    # Transformers v4.55 has incorrect timestamps issue for
                    # skip sampling. We construct the placeholder manually to
                    # get placeholders with correct timestamps.
@@ -1218,6 +1229,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                prompt = prompt.replace(
                    "<|begin_of_video|><|video|><|end_of_video|>",
                    video_placeholder,
+                    1,
                )

                video_grid_thw_lst.append(video_outputs["video_grid_thw"])
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -121,14 +121,6 @@ class OpenCVVideoBackend(VideoLoader):
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames_num / original_fps if original_fps > 0 else 0

-        # Use transformers transformers.video_utils.VideoMetadata format
-        metadata = {
-            "total_num_frames": total_frames_num,
-            "fps": original_fps,
-            "duration": duration,
-            "video_backend": "opencv"
-        }
-
        # resample video to target num_frames
        full_read = num_frames == -1 or total_frames_num < num_frames
        if full_read:
@@ -159,6 +151,20 @@ class OpenCVVideoBackend(VideoLoader):
        assert i == num_frames, (f"Expected reading {num_frames} frames, "
                                 f"but only loaded {i} frames from video.")

+        # Use transformers transformers.video_utils.VideoMetadata format
+        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
+        # can cause incorrect timestamp calculation without num_frames=-1.
+        metadata = {
+            "total_num_frames": num_frames,
+            "fps": original_fps,
+            "duration": duration,
+            "video_backend": "opencv",
+            "frames_indices": list(range(num_frames)),
+            # extra field used to control hf processor's video
+            # sampling behavior
+            "do_sample_frames": num_frames == total_frames_num,
+        }
+
        return frames, metadata


@@ -170,7 +176,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
        cls,
        data: bytes,
        num_frames: int = -1,
-        requested_fps: int = 2,
+        fps: int = 2,
        max_duration: int = 300,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
@@ -185,14 +191,6 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames_num / original_fps if original_fps > 0 else 0

-        # Use transformers transformers.video_utils.VideoMetadata format
-        metadata = {
-            "total_num_frames": total_frames_num,
-            "fps": original_fps,
-            "duration": duration,
-            "video_backend": "opencv_dynamic"
-        }
-
        # resample video to target num_frames
        max_frame_idx = total_frames_num - 1
        duration = duration or round(max_frame_idx / original_fps) + 1
@@ -201,14 +199,13 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
        frame_indices: Union[range, list[int]]
        if duration <= max_duration:
-            n = int(math.floor(duration * requested_fps))
+            n = int(math.floor(duration * fps))
            frame_indices = sorted({
-                min(max_frame_idx,
-                    int(math.ceil(i * original_fps / requested_fps)))
+                min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                for i in range(n)
            })
        else:
-            num_samples = int(max_duration * requested_fps)
+            num_samples = int(max_duration * fps)
            if num_samples >= total_frames_num:
                frame_indices = range(total_frames_num)
            else:
@@ -241,6 +238,16 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
            f"Expected reading {len(frame_indices)} frames, "
            f"but only loaded {i} frames from video.")

+        # Use transformers transformers.video_utils.VideoMetadata format
+        metadata = {
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
+            "duration": duration,
+            "video_backend": "opencv_dynamic",
+            "frames_indices": list(frame_indices),
+            "do_sample_frames": False,
+        }
+
        return frames, metadata