[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)

Cyrus Leung
2024-08-15 01:55:42 +08:00
committed by GitHub
parent 70b746efcf
commit 3f674a49b5
38 changed files with 572 additions and 216 deletions


@@ -1,7 +1,8 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 from transformers import PretrainedConfig
@@ -1429,10 +1430,15 @@ class PromptAdapterConfig:
 
 @dataclass
 class MultiModalConfig:
-    """Configs the input data format and how models should run for
-    multimodal models."""
+    """Controls the behavior of multimodal models."""
+
+    limit_per_prompt: Mapping[str, int]
+    """
+    The maximum number of multi-modal input instances allowed per prompt
+    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    """
+
     # TODO: Add configs to init vision tower or not.
-    pass
 
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
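
For readers without the rest of the diff in view: a minimal sketch of what the new field enables. The MultiModalConfig mirror below follows the hunk above; the default_factory default, the get_limit_per_prompt helper, and the "image"/"audio" plugin keys are illustrative assumptions for this sketch, not code taken from the commit.

    from dataclasses import dataclass, field
    from typing import Mapping

    @dataclass
    class MultiModalConfig:
        """Mirror of the class added in this diff (default value assumed)."""
        limit_per_prompt: Mapping[str, int] = field(default_factory=dict)

    def get_limit_per_prompt(config: MultiModalConfig, data_type_key: str) -> int:
        # Plugins absent from the mapping fall back to 1 input per prompt,
        # matching the single-input behavior prior to this change (assumed).
        return config.limit_per_prompt.get(data_type_key, 1)

    # Example: allow up to 2 images per prompt; audio keeps the default of 1.
    cfg = MultiModalConfig(limit_per_prompt={"image": 2})
    assert get_limit_per_prompt(cfg, "image") == 2
    assert get_limit_per_prompt(cfg, "audio") == 1

Using a mapping keyed by plugin data type, rather than a single global cap, lets each modality carry its own per-prompt limit, which the profiling run described in the commit title needs in order to size dummy multi-modal data for the worst-case prompt.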