[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)
@@ -1,7 +1,8 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 from transformers import PretrainedConfig
@@ -1429,10 +1430,15 @@ class PromptAdapterConfig:
 
 @dataclass
 class MultiModalConfig:
-    """Configs the input data format and how models should run for
-    multimodal models."""
+    """Controls the behavior of multimodal models."""
+
+    limit_per_prompt: Mapping[str, int]
+    """
+    The maximum number of multi-modal input instances allowed per prompt
+    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    """
     # TODO: Add configs to init vision tower or not.
     pass
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
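
For context, here is a minimal sketch of how a per-modality limit mapping like this can be consumed. The `MultiModalConfig` class and its docstrings are taken from the diff above; the `field(default_factory=dict)` default, the `get_limit` helper, and the fallback of one input per modality are illustrative assumptions, not part of this commit.

from dataclasses import dataclass, field
from typing import Mapping


@dataclass
class MultiModalConfig:
    """Controls the behavior of multimodal models."""

    # The default is an assumption for this sketch; the diff above
    # declares the field without one.
    limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
    """
    The maximum number of multi-modal input instances allowed per prompt
    for each modality (e.g. "image").
    """


def get_limit(config: MultiModalConfig, modality: str) -> int:
    # Hypothetical helper: fall back to a single input per modality
    # when no explicit limit is configured for it.
    return config.limit_per_prompt.get(modality, 1)


# Allow up to two images per prompt; other modalities keep the fallback.
config = MultiModalConfig(limit_per_prompt={"image": 2})
assert get_limit(config, "image") == 2
assert get_limit(config, "video") == 1

In released vLLM versions this knob is surfaced to users as the `limit_mm_per_prompt` engine argument (for example, `LLM(model=..., limit_mm_per_prompt={"image": 2})`), which populates `MultiModalConfig.limit_per_prompt`; treat the exact argument name as best-effort recall rather than something stated in this diff.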