[VLM] Abstract out multi-modal data parsing in merged processor (#11620)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2024-12-30 23:01:35 +08:00
committed by GitHub
parent b12e87f942
commit 8d9b6721e7
15 changed files with 559 additions and 311 deletions

View File

@@ -15,7 +15,7 @@ from vllm.transformers_utils.processor import get_video_processor
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import PlaceholderModule, is_list_of
-from .base import MediaIO, MultiModalData
+from .base import MediaIO, ModalityData
from .image import ImageMediaIO, ImagePlugin
from .inputs import MultiModalKwargs, VideoItem
@@ -54,7 +54,7 @@ class VideoPlugin(ImagePlugin):
def _default_input_mapper(
self,
ctx: InputContext,
-data: MultiModalData[VideoItem],
+data: ModalityData[VideoItem],
**mm_processor_kwargs,
) -> MultiModalKwargs:
model_config = ctx.model_config