[VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
|
||||
from typing_extensions import NotRequired
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.multimodal import MultiModalData
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
|
||||
|
||||
class ParsedText(TypedDict):
|
||||
@@ -72,7 +72,7 @@ class TextPrompt(TypedDict):
|
||||
prompt: str
|
||||
"""The input text to be tokenized before passing to the model."""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalData"]
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict):
|
||||
prompt_token_ids: List[int]
|
||||
"""A list of token IDs to pass to the model."""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalData"]
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
@@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict):
|
||||
prompt_token_ids: List[int]
|
||||
"""The token IDs of the prompt."""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalData"]
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
@@ -128,7 +128,6 @@ class LLMInputs(TypedDict):
|
||||
The inputs in :class:`~vllm.LLMEngine` before they are
|
||||
passed to the model executor.
|
||||
"""
|
||||
|
||||
prompt_token_ids: List[int]
|
||||
"""The token IDs of the prompt."""
|
||||
|
||||
@@ -137,7 +136,7 @@ class LLMInputs(TypedDict):
|
||||
The original prompt text corresponding to the token IDs, if available.
|
||||
"""
|
||||
|
||||
multi_modal_data: NotRequired[Optional["MultiModalData"]]
|
||||
multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
|
||||
@@ -12,7 +12,7 @@ from .data import LLMInputs
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VisionLanguageConfig
|
||||
from vllm.multimodal import MultiModalData
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@@ -66,7 +66,8 @@ class InputContext:
|
||||
N = TypeVar("N", bound=Type[nn.Module])
|
||||
|
||||
DummyDataFactory = Callable[[InputContext, int],
|
||||
Tuple["SequenceData", Optional["MultiModalData"]]]
|
||||
Tuple["SequenceData",
|
||||
Optional["MultiModalDataDict"]]]
|
||||
"""
|
||||
Create dummy data to be inputted into the model.
|
||||
|
||||
@@ -94,7 +95,7 @@ class InputRegistry:
|
||||
self,
|
||||
ctx: InputContext,
|
||||
seq_len: int,
|
||||
) -> Tuple["SequenceData", Optional["MultiModalData"]]:
|
||||
) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
|
||||
"""
|
||||
The default dummy data factory represents the longest possible text
|
||||
that can be inputted to the model.
|
||||
|
||||
Reference in New Issue
Block a user