[VLM] Merged multi-modal processor for InternVL-based models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Cyrus Leung
2025-02-04 16:44:52 +08:00
committed by GitHub
parent 96b23621c1
commit d1ca7df84d
34 changed files with 1469 additions and 1021 deletions

View File

@@ -23,7 +23,6 @@
# limitations under the License.
"""Inference-only MiniCPM-O model compatible with HuggingFace weights."""
from functools import partial
from itertools import accumulate
from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
Tuple, TypedDict, Union)
@@ -138,11 +137,15 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    """Declare the modalities this processor accepts.

    Returns a mapping from modality name to the maximum number of items
    allowed per prompt; ``None`` means no explicit limit is imposed.
    """
    supported = ("image", "video", "audio")
    return {modality: None for modality in supported}
def get_mm_max_tokens_per_item(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
    """Return the maximum token count a single item of each modality may occupy.

    Args:
        seq_len: Target sequence length; bounds the per-video token budget.
        mm_counts: Number of items per modality in the prompt (accepted for
            interface compatibility; not consulted here).

    Returns:
        Mapping from modality name to its per-item maximum token count.
    """
    # NOTE(review): the diff rendering interleaved the removed one-line
    # signature and the removed un-comma'd "video" entry with the added
    # lines, leaving duplicated keys and a syntax error; this is the
    # reconstructed post-commit method.
    return {
        "image": self.get_max_image_tokens(),
        "audio": self.get_max_audio_tokens(),
        "video": self.get_max_video_tokens(seq_len),
    }
def get_default_audio_pool_step(self) -> int:
@@ -369,23 +372,18 @@ class MiniCPMOMultiModalProcessor(
hf_inputs,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0))
def get_slices(num_slices: List[int]) -> List[int]:
slice_indices = [0] + list(accumulate(num_slices))
slices = [(slice_indices[i], slice_indices[i + 1])
for i in range(len(num_slices))]
return [slice(*slice_item) for slice_item in slices]
audio_slices = get_slices(
hf_inputs.get("audio_num_slices", torch.empty(0)))
return dict(
**super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
audio_features=MultiModalFieldConfig.flat("audio", audio_slices),
audio_feature_lens=MultiModalFieldConfig.flat(
"audio", audio_slices),
audio_features=MultiModalFieldConfig.flat_from_sizes(
"audio", audio_num_slices),
audio_feature_lens=MultiModalFieldConfig.flat_from_sizes(
"audio", audio_num_slices),
audio_num_slices=MultiModalFieldConfig.batched("audio"),
audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"),
audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices))
audio_embeds=MultiModalFieldConfig.flat_from_sizes(
"audio", audio_num_slices))
class MultiModalProjector(nn.Module):