[Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -482,11 +482,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
if modality in ("image", "image_embeds"):
|
||||
if model_type == "chatglm":
|
||||
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
|
||||
if model_type == "phi3_v":
|
||||
# Workaround since this token is not defined in the tokenizer
|
||||
if model_type in ("phi3_v", "phi4mm"):
|
||||
return f"<|image_{current_count}|>"
|
||||
if model_type == "phi4mm":
|
||||
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
|
||||
if model_type in ("minicpmo", "minicpmv"):
|
||||
return "(<image>./</image>)"
|
||||
if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
|
||||
@@ -522,7 +519,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
if model_type == "ultravox":
|
||||
return "<|audio|>"
|
||||
if model_type == "phi4mm":
|
||||
return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
|
||||
return f"<|audio_{current_count}|>"
|
||||
if model_type in ("qwen2_audio", "qwen2_5_omni"):
|
||||
return (f"Audio {current_count}: "
|
||||
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
|
||||
|
||||
Reference in New Issue
Block a user