[Bugfix] Fix glm4.1v video inference issue (#22067)

Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-02 00:33:30 +08:00
parent 326a1b001d
commit 3f8e952179
2 changed files with 53 additions and 6 deletions
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
                              total_frames: int) -> list[int]:
        video_processor = self.get_video_processor()

-        video_fps = metadata.get("fps", 2.0)
+        video_fps = metadata.get("fps", video_processor.fps)
        meta_frames = metadata.get("total_num_frames", total_frames)
        max_frame_idx = meta_frames - 1
        duration = metadata.get("duration",
@@ -1120,11 +1120,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                    video_placeholder,
                )

-                grid_t = len(video_outputs["video_grid_thw"])
-                _, grid_h, grid_w = video_outputs["video_grid_thw"][0]
-                grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
-
-                video_grid_thw_lst.append(grid_thw)
+                video_grid_thw_lst.append(video_outputs["video_grid_thw"])
                pixel_values_videos_lst.append(
                    video_outputs["pixel_values_videos"])
            video_outputs = dict(