[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-01-04 19:40:53 +08:00
committed by GitHub
parent 300acb8347
commit eed11ebee9
31 changed files with 1104 additions and 973 deletions

View File

@@ -96,7 +96,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor):
nrows = math.ceil(image_height / 30)
return ncols, nrows
def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
target_width, target_height = self._get_image_target_size()
max_ncols, max_nrows = self._get_image_feature_grid_size(
@@ -208,8 +208,9 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor):
return result
def _get_dummy_mm_inputs(
def _get_dummy_processor_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
target_width, target_height = self._get_image_target_size()