[VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
@@ -50,3 +51,49 @@ def test_processor_override(
|
||||
|
||||
assert grid_t == expected_grid_t
|
||||
assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Verify that the two OpenCV video loaders are interchangeable.

    The dynamic loader samples frames itself (it is given ``requested_fps``),
    while the normal loader returns the video for the processor to sample
    afterwards. Both paths must yield the same prompt token ids and the same
    multimodal kwargs data.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {"fps": fps}

    # Prompt containing a single video placeholder.
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    def run_processor(frames, metadata):
        # Feed one (frames, metadata) video item through the processor.
        mm_data = {"video": [(frames, metadata)]}
        return processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

    # Read the raw bytes once so both backends decode the same input.
    asset = VideoAsset(name="baby_reading", num_frames=-1)
    with open(asset.video_path, "rb") as video_file:
        raw_video = video_file.read()

    full_frames, full_meta = OpenCVVideoBackend.load_bytes(raw_video)
    sampled_frames, sampled_meta = OpenCVDynamicVideoBackend.load_bytes(
        raw_video, requested_fps=fps)

    # The pre-sampling loader is expected to return fewer frames than the
    # loader that is called without a sampling rate.
    assert len(sampled_frames) < len(full_frames)

    full_outputs = run_processor(full_frames, full_meta)
    sampled_outputs = run_processor(sampled_frames, sampled_meta)

    full_token_ids = full_outputs["prompt_token_ids"]
    sampled_token_ids = sampled_outputs["prompt_token_ids"]
    assert full_token_ids == sampled_token_ids

    full_mm_payload = full_outputs["mm_kwargs"].get_data()
    sampled_mm_payload = sampled_outputs["mm_kwargs"].get_data()
    assert full_mm_payload == sampled_mm_payload
|
||||
|
||||
Reference in New Issue
Block a user