[VLM] Various cleanup and fixes (#14806)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-03-14 20:58:19 +08:00
committed by GitHub
parent 40253bab44
commit ab93f1360f
14 changed files with 283 additions and 273 deletions

View File

@@ -23,7 +23,6 @@
# limitations under the License.
"""Inference-only MiniCPM-O model compatible with HuggingFace weights."""
from collections.abc import Iterable, Mapping, Sequence
from functools import partial
from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
TypedDict, Union)
@@ -36,11 +35,12 @@ from transformers.models.whisper.modeling_whisper import (
from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import MultiModalFieldConfig
from vllm.multimodal.parse import (AudioItem, DictEmbeddingItems, ModalityData,
from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
DictEmbeddingItems, ModalityData,
ModalityDataItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import PromptReplacement
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors
@@ -272,8 +272,13 @@ class MiniCPMOMultiModalProcessor(
tokenizer.audio_end_id)
return special_tokens
def process_audios(self, mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object]) -> Dict[str, object]:
def process_audios(
self,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> Mapping[str, NestedTensors]:
mm_data = dict(mm_data)
audios = mm_data.pop("audios", [])
audio_embeds = mm_data.pop("audio_embeds", [])
if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0:
@@ -332,11 +337,15 @@ class MiniCPMOMultiModalProcessor(
def get_placeholder_split_pattern(self) -> str:
    """Return the regex used to split a prompt on modality placeholders.

    The pattern matches the literal MiniCPM-O placeholder markers
    ``(<image>./</image>)``, ``(<video>./</video>)`` and
    ``(<audio>./</audio>)``.
    """
    pattern = r"\(<(?:image|video|audio)>./</(?:image|video|audio)>\)"
    return pattern
def process_mm_inputs(self, mm_data, mm_kwargs) -> object:
def process_mm_inputs(
self,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> Mapping[str, Mapping[str, NestedTensors]]:
return {
"image": self.process_images(mm_data, mm_kwargs),
"video": self.process_videos(mm_data, mm_kwargs),
"audio": self.process_audios(mm_data, mm_kwargs)
"audio": self.process_audios(mm_data, mm_kwargs),
}
def get_modality_num_counter(self, modality: str) -> str:
@@ -358,39 +367,38 @@ class MiniCPMOMultiModalProcessor(
return super().get_prompt_texts_by_modality(inputs, modality, index)
def _get_prompt_updates(
self, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs) -> Sequence[PromptReplacement]:
placeholder = {
"image": self.info.image_pattern,
"video": self.info.video_pattern,
"audio": self.info.audio_pattern
}
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
base_updates = super()._get_prompt_updates(
mm_items=mm_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs,
)
def get_replacement_minicpmv(item_idx: int, modality: str):
if modality == "image":
return self.get_image_prompt_texts(
mm_items["image"].get_image_size(item_idx), item_idx)
elif modality == "video":
return self.get_video_prompt_texts(
mm_items["video"].get_frame_size(item_idx),
mm_items["video"].get_num_frames(item_idx))
else: # audio
if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems):
single_audio_embeds = mm_items["audio"].get(item_idx)
audio_len = self.info.get_audio_len_by_num_chunks(
sum(chunk_embeds.shape[0]
for chunk_embeds in single_audio_embeds))
return self.get_audio_prompt_texts(audio_len)
return self.get_audio_prompt_texts(
len(mm_items["audio"].get(item_idx)))
audio_placeholder = self.info.audio_pattern
def get_audio_replacement(item_idx: int):
audios = mm_items.get_items(
"audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems))
if isinstance(audios, MiniCPMOAudioEmbeddingItems):
single_audio_embeds = audios.get(item_idx)["audio_embeds"]
audio_len = self.info.get_audio_len_by_num_chunks(
sum(chunk_embeds.shape[0]
for chunk_embeds in single_audio_embeds))
else:
audio_len = audios.get_audio_length(item_idx)
return self.get_audio_prompt_texts(audio_len)
return [
PromptReplacement(modality=modality,
target=placeholder[modality],
replacement=partial(get_replacement_minicpmv,
modality=modality))
for modality in ("image", "video", "audio")
*base_updates,
PromptReplacement(modality="audio",
target=audio_placeholder,
replacement=get_audio_replacement),
]
def _get_mm_fields_config(