[Core][Frontend] Add Support for Inference Time mm_processor_kwargs (#9131)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
Alex Brooks
2024-10-08 08:12:56 -06:00
committed by GitHub
parent 8c746226c9
commit a3691b6b5e
21 changed files with 440 additions and 118 deletions

View File

@@ -472,6 +472,7 @@ class LLM:
add_generation_prompt: bool = True,
continue_final_message: bool = False,
tools: Optional[List[Dict[str, Any]]] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
) -> List[RequestOutput]:
"""
Generate responses for a chat conversation.
@@ -501,6 +502,8 @@ class LLM:
continue_final_message: If True, continues the final message in
the conversation instead of starting a new one. Cannot be `True`
if `add_generation_prompt` is also `True`.
mm_processor_kwargs: Multimodal processor kwarg overrides for this
chat request. Only used for offline requests.

Returns:
A list of ``RequestOutput`` objects containing the generated
@@ -522,6 +525,9 @@ class LLM:
tokenizer = self.get_tokenizer()
model_config = self.llm_engine.get_model_config()
# NOTE: _parse_chat_message_content_parts() does not currently
# handle mm_processor_kwargs; the chat message parsing has no
# support for per-message processor overrides yet.
conversation, mm_data = parse_chat_messages(
msgs, model_config, tokenizer)
@@ -554,6 +560,9 @@ class LLM:
if mm_data is not None:
prompt["multi_modal_data"] = mm_data
if mm_processor_kwargs is not None:
prompt["mm_processor_kwargs"] = mm_processor_kwargs
prompts.append(prompt)
return self.generate(