[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)

2024-08-15 01:55:42 +08:00
parent 70b746efcf
commit 3f674a49b5
38 changed files with 572 additions and 216 deletions
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -11,14 +11,11 @@ logger = init_logger(__name__)

@runtime_checkable
 class SupportsMultiModal(Protocol):
-    """
-    The interface required for all multimodal (vision or audio) language
-    models.
-    """
+    """The interface required for all multi-modal models."""

    supports_multimodal: ClassVar[Literal[True]] = True
    """
-    A flag that indicates this model supports multimodal inputs.
+    A flag that indicates this model supports multi-modal inputs.

    Note:
        There is no need to redefine this flag if this class is in the