Online video support for VLMs (#10020)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: litianjian <litianjian@bytedance.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -30,7 +30,9 @@ from vllm.logger import init_logger
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.utils import (async_get_and_parse_audio,
|
||||
async_get_and_parse_image,
|
||||
get_and_parse_audio, get_and_parse_image)
|
||||
async_get_and_parse_video,
|
||||
get_and_parse_audio, get_and_parse_image,
|
||||
get_and_parse_video)
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
@@ -51,6 +53,20 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
|
||||
"""The type of the content part."""
|
||||
|
||||
|
||||
class VideoURL(TypedDict, total=False):
|
||||
url: Required[str]
|
||||
"""
|
||||
Either a URL of the video or a data URL with base64 encoded video data.
|
||||
"""
|
||||
|
||||
|
||||
class ChatCompletionContentPartVideoParam(TypedDict, total=False):
|
||||
video_url: Required[VideoURL]
|
||||
|
||||
type: Required[Literal["video_url"]]
|
||||
"""The type of the content part."""
|
||||
|
||||
|
||||
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
|
||||
"""A simpler version of the param that only accepts a plain image_url.
|
||||
This is supported by OpenAI API, although it is not documented.
|
||||
@@ -74,11 +90,23 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
|
||||
audio_url: Required[str]
|
||||
|
||||
|
||||
class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
|
||||
"""A simpler version of the param that only accepts a plain audio_url.
|
||||
|
||||
Example:
|
||||
{
|
||||
"video_url": "https://example.com/video.mp4"
|
||||
}
|
||||
"""
|
||||
video_url: Required[str]
|
||||
|
||||
|
||||
ChatCompletionContentPartParam: TypeAlias = Union[
|
||||
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
|
||||
ChatCompletionContentPartRefusalParam,
|
||||
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
|
||||
CustomChatCompletionContentSimpleImageParam,
|
||||
CustomChatCompletionContentSimpleAudioParam, str]
|
||||
CustomChatCompletionContentSimpleAudioParam,
|
||||
CustomChatCompletionContentSimpleVideoParam, str]
|
||||
|
||||
|
||||
class CustomChatCompletionMessageParam(TypedDict, total=False):
|
||||
@@ -201,6 +229,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
elif modality == "video":
|
||||
if model_type == "qwen2_vl":
|
||||
return "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
if model_type.startswith("llava"):
|
||||
return self._cached_token_str(self._tokenizer,
|
||||
hf_config.video_token_index)
|
||||
raise TypeError(f"Unknown {modality} model type: {model_type}")
|
||||
else:
|
||||
raise TypeError(f"Unknown modality: {modality}")
|
||||
@@ -291,6 +322,10 @@ class BaseMultiModalContentParser(ABC):
|
||||
def parse_audio(self, audio_url: str) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def parse_video(self, video_url: str) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class MultiModalContentParser(BaseMultiModalContentParser):
|
||||
|
||||
@@ -313,6 +348,12 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
||||
placeholder = self._tracker.add("audio", audio)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_video(self, video_url: str) -> None:
|
||||
video = get_and_parse_video(video_url)
|
||||
|
||||
placeholder = self._tracker.add("video", video)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
|
||||
class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
||||
|
||||
@@ -336,6 +377,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
||||
placeholder = self._tracker.add("audio", audio_coro)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_video(self, video_url: str) -> None:
|
||||
video = async_get_and_parse_video(video_url)
|
||||
|
||||
placeholder = self._tracker.add("video", video)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
|
||||
def validate_chat_template(chat_template: Optional[Union[Path, str]]):
|
||||
"""Raises if the provided chat template appears invalid."""
|
||||
@@ -416,6 +463,7 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
|
||||
_ImageParser = partial(cast, ChatCompletionContentPartImageParam)
|
||||
_AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
|
||||
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
|
||||
_VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
|
||||
MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'}
|
||||
|
||||
# Define a mapping from part types to their corresponding parsing functions.
|
||||
@@ -428,6 +476,8 @@ MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
|
||||
lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
|
||||
"refusal":
|
||||
lambda part: _RefusalParser(part).get("refusal", ""),
|
||||
"video_url":
|
||||
lambda part: _VideoParser(part).get("video_url", {}).get("url", ""),
|
||||
}
|
||||
|
||||
|
||||
@@ -472,7 +522,10 @@ def _parse_chat_message_content_mm_part(
|
||||
audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
|
||||
part)
|
||||
return "audio_url", audio_params.get("audio_url", "")
|
||||
|
||||
if part.get("video_url") is not None:
|
||||
video_params = cast(CustomChatCompletionContentSimpleVideoParam,
|
||||
part)
|
||||
return "video_url", video_params.get("video_url", "")
|
||||
# Raise an error if no 'type' or direct URL is found.
|
||||
raise ValueError("Missing 'type' field in multimodal part.")
|
||||
|
||||
@@ -482,7 +535,7 @@ def _parse_chat_message_content_mm_part(
|
||||
|
||||
|
||||
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
|
||||
"audio_url")
|
||||
"audio_url", "video_url")
|
||||
|
||||
|
||||
def _parse_chat_message_content_parts(
|
||||
@@ -542,7 +595,7 @@ def _parse_chat_message_content_part(
|
||||
# Handle structured dictionary parts
|
||||
part_type, content = _parse_chat_message_content_mm_part(part)
|
||||
|
||||
# if part_type is text/refusal/image_url/audio_url but
|
||||
# if part_type is text/refusal/image_url/audio_url/video_url but
|
||||
# content is empty, log a warning and skip
|
||||
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
|
||||
logger.warning(
|
||||
@@ -561,6 +614,10 @@ def _parse_chat_message_content_part(
|
||||
mm_parser.parse_audio(content)
|
||||
return {'type': 'audio'} if wrap_dicts else None
|
||||
|
||||
if part_type == "video_url":
|
||||
mm_parser.parse_video(content)
|
||||
return {'type': 'video'} if wrap_dicts else None
|
||||
|
||||
raise NotImplementedError(f"Unknown part type: {part_type}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user