Add GLM4.1V model (Draft) (#19331)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -309,6 +309,34 @@ VLM_TEST_SETTINGS = {
|
||||
num_logprobs=10,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"glm4_1v": VLMTestInfo(
|
||||
models=["THUDM/GLM-4.1V-9B-Thinking"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
),
|
||||
"glm4_1v-video": VLMTestInfo(
|
||||
models=["THUDM/GLM-4.1V-9B-Thinking"],
|
||||
# GLM4.1V require include video metadata for input
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.video_with_metadata_glm4_1v(),
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)],
|
||||
# This is needed to run on machine with 24GB VRAM
|
||||
vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
models = [
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
|
||||
@@ -129,3 +129,23 @@ def windows_attention_image_qwen2_5_vl():
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
|
||||
|
||||
def video_with_metadata_glm4_1v():
|
||||
video_array = VIDEO_ASSETS[0].np_ndarrays
|
||||
metadata = VIDEO_ASSETS[0].metadata
|
||||
question = "Describe the video."
|
||||
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [[(rescale_video_size(video_array, scale), metadata)]
|
||||
for scale in scales]
|
||||
prompts = [formatted_prompt] * len(video_input)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=prompts,
|
||||
video_data=video_input,
|
||||
)
|
||||
]
|
||||
|
||||
@@ -16,9 +16,11 @@ import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||
from .types import RunnerOutput
|
||||
@@ -373,6 +375,28 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return hf_model
|
||||
|
||||
|
||||
def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4.1V."""
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def processor(*args, videos=None, **kwargs):
|
||||
if videos is not None and is_list_of(videos, tuple):
|
||||
# If videos is a list of tuples, we assume each tuple contains
|
||||
# (video_array, metadata) as in the case of GLM4.1V.
|
||||
video_metadata = [[VideoMetadata(**video[1])] for video in videos]
|
||||
videos = [[video[0]] for video in videos]
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
return hf_processor(*args,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
**kwargs)
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
|
||||
|
||||
def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for H2OVL."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user