[VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2025-09-12 00:44:34 +08:00
committed by GitHub
parent 51d41265ad
commit bcbe2a4d9e
5 changed files with 233 additions and 55 deletions

View File

@@ -5,6 +5,7 @@ import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
@@ -50,3 +51,49 @@ def test_processor_override(
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Check that a video pre-sampled by the dynamic loader and a video
    loaded in full (then sampled by the processor) give identical
    processing outputs.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    mm_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config)
    # The same kwargs dict is handed to both processor calls below.
    fps_kwargs = {"fps": fps}

    # Prompt containing a single video placeholder.
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    asset_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(asset_path, "rb") as video_file:
        raw_bytes = video_file.read()

    # Decode the same bytes through both backends.
    full_frames, full_meta = OpenCVVideoBackend.load_bytes(raw_bytes)
    sampled_frames, sampled_meta = OpenCVDynamicVideoBackend.load_bytes(
        raw_bytes, requested_fps=fps)

    # pre-sampled loader shouldn't read all frames
    assert len(sampled_frames) < len(full_frames)

    full_mm_data = {"video": [(full_frames, full_meta)]}
    sampled_mm_data = {"video": [(sampled_frames, sampled_meta)]}

    full_result = mm_processor.apply(video_prompt, full_mm_data, fps_kwargs)
    sampled_result = mm_processor.apply(video_prompt, sampled_mm_data,
                                        fps_kwargs)

    # Token ids must match between the two loading paths.
    full_token_ids = full_result["prompt_token_ids"]
    sampled_token_ids = sampled_result["prompt_token_ids"]
    assert full_token_ids == sampled_token_ids

    # The multimodal kwargs data must match as well.
    full_mm_kwargs = full_result["mm_kwargs"].get_data()
    sampled_mm_kwargs = sampled_result["mm_kwargs"].get_data()
    assert full_mm_kwargs == sampled_mm_kwargs