[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)

Cyrus Leung
2024-08-15 01:55:42 +08:00
committed by GitHub
parent 70b746efcf
commit 3f674a49b5
38 changed files with 572 additions and 216 deletions


@@ -1,7 +1,8 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 from transformers import PretrainedConfig
@@ -1429,10 +1430,15 @@ class PromptAdapterConfig:
 
 @dataclass
 class MultiModalConfig:
-    """Configs the input data format and how models should run for
-    multimodal models."""
+    """Controls the behavior of multimodal models."""
+
+    limit_per_prompt: Mapping[str, int]
+    """
+    The maximum number of multi-modal input instances allowed per prompt
+    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    """
+
     # TODO: Add configs to init vision tower or not.
-    pass
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
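
For readers without the rest of the diff in view: a minimal sketch of what the new field enables. The MultiModalConfig mirror below follows the hunk above; the default_factory default, the get_limit_per_prompt helper, and the "image"/"audio" plugin keys are illustrative assumptions for this sketch, not code taken from the commit.

    from dataclasses import dataclass, field
    from typing import Mapping

    @dataclass
    class MultiModalConfig:
        """Mirror of the class added in this diff (default value assumed)."""
        limit_per_prompt: Mapping[str, int] = field(default_factory=dict)

    def get_limit_per_prompt(config: MultiModalConfig, data_type_key: str) -> int:
        # Plugins absent from the mapping fall back to 1 input per prompt,
        # matching the single-input behavior prior to this change (assumed).
        return config.limit_per_prompt.get(data_type_key, 1)

    # Example: allow up to 2 images per prompt; audio keeps the default of 1.
    cfg = MultiModalConfig(limit_per_prompt={"image": 2})
    assert get_limit_per_prompt(cfg, "image") == 2
    assert get_limit_per_prompt(cfg, "audio") == 1

Using a mapping keyed by plugin data type, rather than a single global cap, lets each modality carry its own per-prompt limit, which the profiling run described in the commit title needs in order to size dummy multi-modal data for the worst-case prompt.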