[Model][VLM] Add LLaVA-Onevision model support (#8486)

Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
litianjian
2024-09-23 01:51:44 +08:00
committed by GitHub
parent ca2b628b3c
commit 5b59532760
10 changed files with 1330 additions and 21 deletions

View File

@@ -4,6 +4,7 @@ within a vision language model."""
import math
from typing import Iterable, List, Optional, Tuple, Union
import numpy as np
import torch
from PIL import Image
from torch import nn
@@ -89,6 +90,24 @@ def dummy_image_for_siglip(
return {"image": image if num_images == 1 else [image] * num_images}
def dummy_video_for_siglip(
    hf_config: SiglipVisionConfig,
    num_frames: int,
    *,
    image_width_override: Optional[int] = None,
    image_height_override: Optional[int] = None,
):
    """Build dummy multimodal video data for profiling/warmup.

    Generates one dummy image via ``dummy_image_for_siglip`` and tiles it
    ``num_frames`` times along a new leading (frame) axis, returning it
    under the ``"video"`` key.
    """
    # Reuse the single-image dummy generator so the frame matches the
    # vision tower's expected resolution (with optional overrides).
    image_data = dummy_image_for_siglip(
        hf_config,
        num_images=1,
        image_width_override=image_width_override,
        image_height_override=image_height_override)
    frame = np.array(image_data["image"])
    # Replicate the same frame num_frames times -> shape (num_frames, H, W, C).
    video = np.repeat([frame], num_frames, axis=0)
    return {"video": video}
def input_processor_for_siglip(
model_config: ModelConfig,
hf_config: SiglipVisionConfig,