[Model][VLM] Add Qwen2-VL model support (#7905)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-12 00:31:19 +08:00
parent cea95dfb94
commit 3b7fea770f
14 changed files with 1531 additions and 31 deletions
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -108,7 +108,7 @@ class ConversationMessage(TypedDict, total=False):
    """The tool calls generated by the model, such as function calls."""


-ModalityStr = Literal["image", "audio"]
+ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")


@@ -158,12 +158,18 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                                              hf_config.image_token_index)
            if model_type in ("chameleon", "internvl_chat"):
                return "<image>"
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|image_pad|><|vision_end|>"

            raise TypeError(f"Unknown model type: {model_type}")
        elif modality == "audio":
            if model_type == "ultravox":
                return "<|reserved_special_token_0|>"
            raise TypeError(f"Unknown model type: {model_type}")
+        elif modality == "video":
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|video_pad|><|vision_end|>"
+            raise TypeError(f"Unknown model type: {model_type}")
        else:
            raise TypeError(f"Unknown modality: {modality}")