[VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2025-09-12 00:44:34 +08:00
committed by GitHub
parent 51d41265ad
commit bcbe2a4d9e
5 changed files with 233 additions and 55 deletions

View File

@@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
assert metadata_sync == metadata_async
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
video_url: str, max_duration: int, requested_fps: int,
monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
connector = MediaConnector(
media_io_kwargs={
"video": {
"max_duration": max_duration,
"requested_fps": requested_fps,
}
})
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(
video_url)
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async
assert metadata_sync["video_backend"] == "opencv_dynamic"
# Used for `test_argsort_mm_positions`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"