[Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)

Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-19 17:26:11 +08:00
parent d9737ca1c6
commit 83f3c3bd91
15 changed files with 818 additions and 1246 deletions
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -482,11 +482,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        if modality in ("image", "image_embeds"):
            if model_type == "chatglm":
                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
-            if model_type == "phi3_v":
-                # Workaround since this token is not defined in the tokenizer
+            if model_type in ("phi3_v", "phi4mm"):
                return f"<|image_{current_count}|>"
-            if model_type == "phi4mm":
-                return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
            if model_type in ("minicpmo", "minicpmv"):
                return "(<image>./</image>)"
            if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
@@ -522,7 +519,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
            if model_type == "ultravox":
                return "<|audio|>"
            if model_type == "phi4mm":
-                return "<|endoftext11|>"  # 200011 (see vocab.json in hf model)
+                return f"<|audio_{current_count}|>"
            if model_type in ("qwen2_audio", "qwen2_5_omni"):
                return (f"Audio {current_count}: "
                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")