[Model][VLM] Add LLaVA-Onevision model support (#8486)
Co-authored-by: litianjian <litianjian@bytedance.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -4,6 +4,7 @@ within a vision language model."""
|
||||
import math
|
||||
from typing import Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torch import nn
|
||||
@@ -89,6 +90,24 @@ def dummy_image_for_siglip(
|
||||
return {"image": image if num_images == 1 else [image] * num_images}
|
||||
|
||||
|
||||
def dummy_video_for_siglip(
    hf_config: SiglipVisionConfig,
    num_frames: int,
    *,
    image_width_override: Optional[int] = None,
    image_height_override: Optional[int] = None,
):
    """Build placeholder video data by repeating one dummy SigLIP image.

    A single dummy image is obtained from ``dummy_image_for_siglip`` (with
    the same size overrides passed through), converted to a NumPy array and
    repeated ``num_frames`` times along a new leading axis.

    Args:
        hf_config: SigLIP vision config forwarded to
            ``dummy_image_for_siglip``.
        num_frames: Number of times the frame is repeated along axis 0.
        image_width_override: Optional width forwarded to
            ``dummy_image_for_siglip``.
        image_height_override: Optional height forwarded to
            ``dummy_image_for_siglip``.

    Returns:
        A dict with a single ``"video"`` key holding the stacked frame array,
        whose first dimension has length ``num_frames``.
    """
    # Ask for exactly one image so the "image" entry is a single image
    # rather than a list of images.
    single_image = dummy_image_for_siglip(
        hf_config,
        num_images=1,
        image_width_override=image_width_override,
        image_height_override=image_height_override,
    )["image"]

    # presumably a PIL image; np.array turns it into a pixel array
    # (TODO confirm exact layout against dummy_image_for_siglip).
    frame = np.array(single_image)

    # Add a length-1 leading axis, then repeat along it so every frame of
    # the dummy video is the same image.
    stacked_frames = np.repeat(np.expand_dims(frame, axis=0),
                               num_frames,
                               axis=0)
    return {"video": stacked_frames}
|
||||
|
||||
|
||||
def input_processor_for_siglip(
|
||||
model_config: ModelConfig,
|
||||
hf_config: SiglipVisionConfig,
|
||||
|
||||
Reference in New Issue
Block a user