[CI/Build] Automatically patch video metadata for multimodal processor test (#35822)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-03-03 12:27:45 +08:00
committed by GitHub
parent 25e02647c2
commit 7d8bbe6f42

View File

@@ -33,32 +33,9 @@ from ...registry import (
)
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
def add_video_metadata(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for GLM4.1V model.
"""
# Ensure video metadata is included
if "video" in mm_data:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for Qwen3-VL model.
Add metadata to video mm_data
"""
def create_metadata(frames: np.ndarray):
@@ -119,18 +96,7 @@ _IGNORE_MM_KEYS = {
}
MM_DATA_PATCHES = {
# Ernie4.5-VL, GLM4.1V and Qwen3-VL require video metadata
"ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"glm_ocr": glm4_1v_patch_mm_data,
"glmasr": glmasr_patch_mm_data,
"interns1_pro": qwen3_vl_patch_mm_data,
"molmo2": qwen3_vl_patch_mm_data,
"qwen3_5": qwen3_vl_patch_mm_data,
"qwen3_5_moe": qwen3_vl_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
}
@@ -176,6 +142,9 @@ def get_text_token_prompts(
tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config
if processor.info.data_parser.video_needs_metadata:
mm_data = add_video_metadata(mm_data)
model_type = model_config.hf_config.model_type
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)