[Model] Support Qwen3-VL Model Series (#24727)
Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -31,6 +31,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
"""
|
||||
# Ensure video metadata is included
|
||||
if "video" in mm_data:
|
||||
# GLM4.1V doesn't support multiple videos
|
||||
video = mm_data["video"]
|
||||
num_frames = len(video)
|
||||
mm_data["video"] = (video, {
|
||||
@@ -44,6 +45,34 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
return mm_data
|
||||
|
||||
|
||||
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    """
    Attach video metadata to the multimodal data for Qwen3-VL models.

    When ``mm_data`` has a ``"video"`` entry, each video is replaced by a
    ``(video, metadata)`` tuple. A list is treated as multiple videos and
    patched element-wise; any other value is treated as a single video.
    ``mm_data`` is modified in place and also returned. Inputs without a
    ``"video"`` entry are returned unchanged.
    """

    def _with_metadata(clip: np.ndarray):
        # The metadata is derived purely from the number of frames in the
        # clip; the frame rate is fixed at 2.0.
        frame_count = len(clip)
        metadata = {
            "total_num_frames": frame_count,
            "fps": 2.0,
            "duration": frame_count / 2.0,
            "video_backend": "opencv",
            "frames_indices": list(range(frame_count)),
            "do_sample_frames": True,
        }
        return clip, metadata

    # Nothing to patch when no video is present.
    if "video" not in mm_data:
        return mm_data

    payload = mm_data["video"]
    if isinstance(payload, list):
        # Multiple videos: pair every clip with its own metadata.
        mm_data["video"] = [_with_metadata(clip) for clip in payload]
    else:
        # Single video.
        mm_data["video"] = _with_metadata(payload)
    return mm_data
|
||||
|
||||
|
||||
def _test_processing_correctness(
|
||||
model_id_or_arch: str,
|
||||
hit_rate: float,
|
||||
@@ -182,8 +211,10 @@ _IGNORE_MM_KEYS = {
|
||||
}
|
||||
|
||||
MM_DATA_PATCHES = {
|
||||
# GLM4.1V requires video metadata to be included in the input
|
||||
# GLM4.1V and Qwen3-VL require video metadata to be included in the input
|
||||
"glm4v": glm4_1v_patch_mm_data,
|
||||
"qwen3_vl": qwen3_vl_patch_mm_data,
|
||||
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
|
||||
}
|
||||
|
||||
|
||||
@@ -326,6 +357,8 @@ def _test_processing_correctness_one(
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"Qwen/Qwen2-Audio-7B-Instruct",
|
||||
"Qwen/Qwen2.5-Omni-3B",
|
||||
"Qwen/Qwen3-VL-4B-Instruct",
|
||||
"Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
"YannQi/R-4B",
|
||||
"Skywork/Skywork-R1V-38B",
|
||||
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
||||
|
||||
Reference in New Issue
Block a user