Add GLM4.1V model (Draft) (#19331)

Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Yuxuan Zhang
Date: 2025-07-01 20:48:26 +08:00
Committed by: GitHub
Parent: 650d5dbd04
Commit: ed70f3c64f
17 changed files with 1946 additions and 16 deletions

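For context, a minimal sketch of how the newly added model could be exercised offline, assuming vLLM's standard LLM entry point and that the THUDM/GLM-4.1V-9B-Thinking checkpoint is reachable; the prompt tags mirror the test settings below and the sampling values are illustrative:

from vllm import LLM, SamplingParams

# Illustrative only: load the GLM-4.1V checkpoint registered by this PR.
llm = LLM(model="THUDM/GLM-4.1V-9B-Thinking", max_model_len=4096)
params = SamplingParams(temperature=0.0, max_tokens=128)

prompt = "<|user|>\nDescribe vLLM in one sentence.<|assistant|>"
outputs = llm.generate([prompt], params)
print(outputs[0].outputs[0].text)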

@@ -309,6 +309,34 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",  # noqa: E501
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
    ),
    "glm4_1v-video": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
        # GLM4.1V requires video metadata to be included in the input
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.video_with_metadata_glm4_1v(),
            limit_mm_per_prompt={"video": 1},
        )],
        # This is needed to run on machines with 24GB of VRAM
        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
    ),
    "h2ovl": VLMTestInfo(
        models=[
            "h2oai/h2ovl-mississippi-800m",

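To make the prompt plumbing above concrete, here is a rough sketch of how a formatter and per-image placeholder like those in the "glm4_1v" entry combine into a final prompt; the helper name is hypothetical:

def build_glm4_1v_prompt(num_images: int, question: str) -> str:
    # One <|begin_of_image|><|image|><|end_of_image|> triple per image,
    # mirroring img_idx_to_prompt above.
    img_tokens = "<|begin_of_image|><|image|><|end_of_image|>" * num_images
    # Wrap in the chat tags that prompt_formatter applies.
    return f"<|user|>\n{img_tokens}{question}<|assistant|>"

print(build_glm4_1v_prompt(2, "What differs between these images?"))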

@@ -129,3 +129,23 @@ def windows_attention_image_qwen2_5_vl():
    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)


def video_with_metadata_glm4_1v():
    video_array = VIDEO_ASSETS[0].np_ndarrays
    metadata = VIDEO_ASSETS[0].metadata

    question = "Describe the video."
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
    formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"

    scales = [0.1, 0.2, 0.25]
    video_input = [[(rescale_video_size(video_array, scale), metadata)]
                   for scale in scales]
    prompts = [formatted_prompt] * len(video_input)

    return [
        PromptWithMultiModalInput(
            prompts=prompts,
            video_data=video_input,
        )
    ]

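As a reference for the shape of data this helper produces, a minimal sketch of one (video_array, metadata) pair; the metadata keys are assumptions about what transformers.video_utils.VideoMetadata accepts, not taken from this diff:

import numpy as np

def make_dummy_video_with_metadata(num_frames: int = 16, fps: float = 2.0):
    # Dummy RGB clip shaped (frames, height, width, channels).
    video_array = np.zeros((num_frames, 224, 224, 3), dtype=np.uint8)
    # Assumed VideoMetadata fields; adjust to the installed transformers version.
    metadata = {
        "total_num_frames": num_frames,
        "fps": fps,
        "duration": num_frames / fps,
        "video_backend": "opencv",
    }
    return video_array, metadata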

@@ -16,9 +16,11 @@ import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
                          GenerationConfig, GenerationMixin)
from transformers.video_utils import VideoMetadata

from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import is_list_of

from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput
@@ -373,6 +375,28 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
    hf_processor = hf_model.processor

    def processor(*args, videos=None, **kwargs):
        if videos is not None and is_list_of(videos, tuple):
            # If videos is a list of tuples, we assume each tuple contains
            # (video_array, metadata), as in the case of GLM4.1V.
            video_metadata = [[VideoMetadata(**video[1])] for video in videos]
            videos = [[video[0]] for video in videos]
        else:
            video_metadata = None
        return hf_processor(*args,
                            videos=videos,
                            video_metadata=video_metadata,
                            **kwargs)

    hf_model.processor = processor

    return hf_model


def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for H2OVL."""