[VLM] Initialize video input support for InternVL models (#18499)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-25 12:51:25 +08:00
parent 6ab681bcbe
commit 75f81750f3
10 changed files with 596 additions and 62 deletions
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -556,6 +556,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                return "(<audio>./</audio>)"
            raise TypeError(f"Unknown model type: {model_type}")
        elif modality == "video":
+            if model_type == "internvl_chat":
+                return "<video>"
            if model_type in ("qwen2_vl", "qwen2_5_vl"):
                return "<|vision_start|><|video_pad|><|vision_end|>"
            if model_type == "qwen2_5_omni":