[Model] Add multi-image input support for LLaVA-Next offline inference (#7230)

This commit is contained in:
zifeitong
2024-08-27 16:09:02 -07:00
committed by GitHub
parent 345be0e244
commit 5340a2dccf
7 changed files with 173 additions and 51 deletions

View File

@@ -3,7 +3,7 @@ within a vision language model."""
import math
from array import array
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union
import torch
from PIL import Image
@@ -93,7 +93,7 @@ def input_processor_for_siglip(
llm_inputs: LLMInputs,
*,
image_token_id: int,
-    image_feature_size_override: Optional[int] = None,
+    image_feature_size_override: Optional[Union[int, List[int]]] = None,
):
multi_modal_data = llm_inputs.get("multi_modal_data")
if multi_modal_data is None or "image" not in multi_modal_data: