[Frontend] User-provided uuids for medias in chat. (RFC #22044) (#23449)

Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
Chenheli Hua
2025-09-08 06:42:20 -07:00
committed by GitHub
parent 03dd652c16
commit 01dfb5e982
8 changed files with 1079 additions and 79 deletions

View File

@@ -41,7 +41,8 @@ from typing_extensions import Required, TypeAlias, TypedDict
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
MultiModalUUIDDict)
from vllm.multimodal.utils import MediaConnector
# yapf: disable
from vllm.transformers_utils.chat_templates import (
@@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
type: Required[Literal["audio_url"]]
"""The type of the content part."""
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
@@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
"""
type: Required[Literal["image_embeds"]]
"""The type of the content part."""
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class VideoURL(TypedDict, total=False):
@@ -97,6 +108,11 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
type: Required[Literal["video_url"]]
"""The type of the content part."""
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class PILImage(BaseModel):
@@ -118,6 +134,11 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
"""
image_pil: Required[PILImage]
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
@@ -131,6 +152,11 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
"""
image_url: Required[str]
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
@@ -155,6 +181,11 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
"""
video_url: Required[str]
uuid: Optional[str]
"""
User-provided UUID of a media. User must guarantee that it is properly
generated and unique for different medias.
"""
class CustomThinkCompletionContentParam(TypedDict, total=False):
@@ -567,6 +598,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self._tokenizer = tokenizer
self._items_by_modality = defaultdict[str, list[_T]](list)
self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
@property
def model_config(self) -> ModelConfig:
@@ -591,10 +623,15 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def mm_processor(self):
return self.mm_registry.create_processor(self.model_config)
def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
def add(
self, modality: ModalityStr, item: _T, uuid: Optional[str] = None
) -> Optional[str]:
"""
Add a multi-modal item to the current prompt and returns the
placeholder string to use, if any.
An optional uuid can be added which serves as a unique identifier of the
media.
"""
input_modality = modality.replace("_embeds", "")
num_items = len(self._items_by_modality[modality]) + 1
@@ -602,9 +639,35 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self.mm_processor.validate_num_items(input_modality, num_items)
self._items_by_modality[modality].append(item)
self._uuids_by_modality[modality].append(uuid)
return self.model_cls.get_placeholder_str(modality, num_items)
def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]:
    """Collect the user-provided UUIDs of all tracked media, keyed by modality.

    Returns ``None`` when no multi-modal items were added at all. Raw
    images and image embeddings are mutually exclusive; whichever is
    present is reported under the ``"image"`` key.

    Raises:
        ValueError: if both raw images and image embeddings were added,
            or if more than one ``image_embeds`` part was provided.
    """
    if not self._items_by_modality:
        return None

    uuids = dict(self._uuids_by_modality)
    has_image = "image" in uuids
    has_embeds = "image_embeds" in uuids

    # Raw images and embedding inputs cannot be combined in one prompt.
    if has_image and has_embeds:
        raise ValueError(
            "Mixing raw image and embedding inputs is not allowed"
        )

    mm_uuids = {}
    if has_embeds:
        embeds_uuids = uuids["image_embeds"]
        # At most a single image_embeds part is supported per request.
        if len(embeds_uuids) > 1:
            raise ValueError(
                "Only one message can have {'type': 'image_embeds'}"
            )
        mm_uuids["image"] = embeds_uuids
    if has_image:
        mm_uuids["image"] = uuids["image"]  # UUIDs of images
    for modality in ("audio", "video"):
        if modality in uuids:
            mm_uuids[modality] = uuids[modality]
    return mm_uuids
@abstractmethod
def create_parser(self) -> "BaseMultiModalContentParser":
raise NotImplementedError
@@ -697,29 +760,35 @@ class BaseMultiModalContentParser(ABC):
return dict(self._placeholder_storage)
@abstractmethod
def parse_image(self, image_url: str) -> None:
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
raise NotImplementedError
@abstractmethod
def parse_image_embeds(
self, image_embeds: Union[str, dict[str, str]]
self,
image_embeds: Union[str, dict[str, str]],
uuid: Optional[str] = None,
) -> None:
raise NotImplementedError
@abstractmethod
def parse_image_pil(self, image_pil: Image.Image) -> None:
def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None
) -> None:
raise NotImplementedError
@abstractmethod
def parse_audio(self, audio_url: str) -> None:
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
raise NotImplementedError
@abstractmethod
def parse_input_audio(self, input_audio: InputAudio) -> None:
def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None
) -> None:
raise NotImplementedError
@abstractmethod
def parse_video(self, video_url: str) -> None:
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
raise NotImplementedError
@@ -734,49 +803,55 @@ class MultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path=tracker.allowed_local_media_path,
)
def parse_image(self, image_url: str) -> None:
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
image = self._connector.fetch_image(image_url)
placeholder = self._tracker.add("image", image)
placeholder = self._tracker.add("image", image, uuid)
self._add_placeholder("image", placeholder)
def parse_image_embeds(
self, image_embeds: Union[str, dict[str, str]]
self,
image_embeds: Union[str, dict[str, str]],
uuid: Optional[str] = None,
) -> None:
if isinstance(image_embeds, dict):
embeds = {
k: self._connector.fetch_image_embedding(v)
for k, v in image_embeds.items()
}
placeholder = self._tracker.add("image_embeds", embeds)
placeholder = self._tracker.add("image_embeds", embeds, uuid)
if isinstance(image_embeds, str):
embedding = self._connector.fetch_image_embedding(image_embeds)
placeholder = self._tracker.add("image_embeds", embedding)
placeholder = self._tracker.add("image_embeds", embedding, uuid)
self._add_placeholder("image", placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
placeholder = self._tracker.add("image", image_pil)
def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None
) -> None:
placeholder = self._tracker.add("image", image_pil, uuid)
self._add_placeholder("image", placeholder)
def parse_audio(self, audio_url: str) -> None:
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
audio = self._connector.fetch_audio(audio_url)
placeholder = self._tracker.add("audio", audio)
placeholder = self._tracker.add("audio", audio, uuid)
self._add_placeholder("audio", placeholder)
def parse_input_audio(self, input_audio: InputAudio) -> None:
def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None
) -> None:
audio_data = input_audio.get("data", "")
audio_format = input_audio.get("format", "")
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
return self.parse_audio(audio_url)
return self.parse_audio(audio_url, uuid)
def parse_video(self, video_url: str) -> None:
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
video = self._connector.fetch_video(video_url=video_url)
placeholder = self._tracker.add("video", video)
placeholder = self._tracker.add("video", video, uuid)
self._add_placeholder("video", placeholder)
@@ -790,14 +865,16 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path=tracker.allowed_local_media_path,
)
def parse_image(self, image_url: str) -> None:
def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
image_coro = self._connector.fetch_image_async(image_url)
placeholder = self._tracker.add("image", image_coro)
placeholder = self._tracker.add("image", image_coro, uuid)
self._add_placeholder("image", placeholder)
def parse_image_embeds(
self, image_embeds: Union[str, dict[str, str]]
self,
image_embeds: Union[str, dict[str, str]],
uuid: Optional[str] = None,
) -> None:
future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
@@ -812,33 +889,37 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
embedding = self._connector.fetch_image_embedding(image_embeds)
future.set_result(embedding)
placeholder = self._tracker.add("image_embeds", future)
placeholder = self._tracker.add("image_embeds", future, uuid)
self._add_placeholder("image", placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
def parse_image_pil(
self, image_pil: Image.Image, uuid: Optional[str] = None
) -> None:
future: asyncio.Future[Image.Image] = asyncio.Future()
future.set_result(image_pil)
placeholder = self._tracker.add("image", future)
placeholder = self._tracker.add("image", future, uuid)
self._add_placeholder("image", placeholder)
def parse_audio(self, audio_url: str) -> None:
def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
audio_coro = self._connector.fetch_audio_async(audio_url)
placeholder = self._tracker.add("audio", audio_coro)
placeholder = self._tracker.add("audio", audio_coro, uuid)
self._add_placeholder("audio", placeholder)
def parse_input_audio(self, input_audio: InputAudio) -> None:
def parse_input_audio(
self, input_audio: InputAudio, uuid: Optional[str] = None
) -> None:
audio_data = input_audio.get("data", "")
audio_format = input_audio.get("format", "")
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
return self.parse_audio(audio_url)
return self.parse_audio(audio_url, uuid)
def parse_video(self, video_url: str) -> None:
def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
video = self._connector.fetch_video_async(video_url=video_url)
placeholder = self._tracker.add("video", video)
placeholder = self._tracker.add("video", video, uuid)
self._add_placeholder("video", placeholder)
@@ -1177,30 +1258,36 @@ def _parse_chat_message_content_part(
else:
return str_content
# For media items, if a user has provided one, use it. Otherwise, insert
# a placeholder empty uuid.
uuid = part.get("uuid", None)
if uuid is not None:
uuid = str(uuid)
modality = None
if part_type == "image_pil":
image_content = cast(Image.Image, content)
mm_parser.parse_image_pil(image_content)
mm_parser.parse_image_pil(image_content, uuid)
modality = "image"
elif part_type in ("image_url", "input_image"):
str_content = cast(str, content)
mm_parser.parse_image(str_content)
mm_parser.parse_image(str_content, uuid)
modality = "image"
elif part_type == "image_embeds":
content = cast(Union[str, dict[str, str]], content)
mm_parser.parse_image_embeds(content)
mm_parser.parse_image_embeds(content, uuid)
modality = "image"
elif part_type == "audio_url":
str_content = cast(str, content)
mm_parser.parse_audio(str_content)
mm_parser.parse_audio(str_content, uuid)
modality = "audio"
elif part_type == "input_audio":
dict_content = cast(InputAudio, content)
mm_parser.parse_input_audio(dict_content)
mm_parser.parse_input_audio(dict_content, uuid)
modality = "audio"
elif part_type == "video_url":
str_content = cast(str, content)
mm_parser.parse_video(str_content)
mm_parser.parse_video(str_content, uuid)
modality = "video"
else:
raise NotImplementedError(f"Unknown part type: {part_type}")
@@ -1288,7 +1375,11 @@ def parse_chat_messages(
model_config: ModelConfig,
tokenizer: AnyTokenizer,
content_format: _ChatTemplateContentFormat,
) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]:
) -> tuple[
list[ConversationMessage],
Optional[MultiModalDataDict],
Optional[MultiModalUUIDDict],
]:
conversation: list[ConversationMessage] = []
mm_tracker = MultiModalItemTracker(model_config, tokenizer)
@@ -1308,7 +1399,7 @@ def parse_chat_messages(
_postprocess_messages(conversation)
return conversation, mm_tracker.all_mm_data()
return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()
def parse_chat_messages_futures(
@@ -1316,7 +1407,11 @@ def parse_chat_messages_futures(
model_config: ModelConfig,
tokenizer: AnyTokenizer,
content_format: _ChatTemplateContentFormat,
) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]:
) -> tuple[
list[ConversationMessage],
Awaitable[Optional[MultiModalDataDict]],
Optional[MultiModalUUIDDict],
]:
conversation: list[ConversationMessage] = []
mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
@@ -1336,7 +1431,7 @@ def parse_chat_messages_futures(
_postprocess_messages(conversation)
return conversation, mm_tracker.all_mm_data()
return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()
def apply_hf_chat_template(