[VLM] Remove image_input_type from VLM config (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-02 00:57:09 -07:00
parent 2c37540aa6
commit 98d6682cd1
35 changed files with 329 additions and 751 deletions
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
 from typing_extensions import NotRequired

 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import MultiModalDataDict


 class ParsedText(TypedDict):
@@ -72,7 +72,7 @@ class TextPrompt(TypedDict):
    prompt: str
    """The input text to be tokenized before passing to the model."""

-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict):
    prompt_token_ids: List[int]
    """A list of token IDs to pass to the model."""

-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
@@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict):
    prompt_token_ids: List[int]
    """The token IDs of the prompt."""

-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
@@ -128,7 +128,6 @@ class LLMInputs(TypedDict):
    The inputs in :class:`~vllm.LLMEngine` before they are
    passed to the model executor.
    """
-
    prompt_token_ids: List[int]
    """The token IDs of the prompt."""

@@ -137,7 +136,7 @@ class LLMInputs(TypedDict):
    The original prompt text corresponding to the token IDs, if available.
    """

-    multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@ from .data import LLMInputs

 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VisionLanguageConfig
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import MultiModalDataDict
    from vllm.sequence import SequenceData

 logger = init_logger(__name__)
@@ -66,7 +66,8 @@ class InputContext:
 N = TypeVar("N", bound=Type[nn.Module])

 DummyDataFactory = Callable[[InputContext, int],
-                            Tuple["SequenceData", Optional["MultiModalData"]]]
+                            Tuple["SequenceData",
+                                  Optional["MultiModalDataDict"]]]
 """
 Create dummy data to be inputted into the model.

@@ -94,7 +95,7 @@ class InputRegistry:
        self,
        ctx: InputContext,
        seq_len: int,
-    ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
        """
        The default dummy data factory represents the longest possible text
        that can be inputted to the model.