[Bugfix] Fix prompt format of GLM4V (#14539)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-03-13 19:37:17 +08:00
Committed by: GitHub
Parent: b1cc4dfef5
Commit: f53a0586b9
7 changed files with 43 additions and 19 deletions

vllm/entrypoints/chat_utils.py

@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         hf_config = self._model_config.hf_config
         model_type = hf_config.model_type
 
-        if modality in ["image", "image_embeds"]:
+        if modality in ("image", "image_embeds"):
+            if model_type == "chatglm":
+                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
-                              "pixtral"):
+            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
                 # These models do not use image tokens in the prompt
                 return None
             if model_type == "qwen":
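
For context, a minimal self-contained sketch of the placeholder selection after this patch (image_placeholder is a hypothetical standalone helper, not vLLM's actual API): the fix gives "chatglm" its own branch, so GLM4V prompts now receive explicit image tokens instead of falling through to the branch that returns no placeholder at all.

    from typing import Optional

    def image_placeholder(model_type: str, current_count: int) -> Optional[str]:
        # Hypothetical mirror of BaseMultiModalItemTracker's placeholder logic.
        if model_type == "chatglm":
            # New in this commit: GLM4V delimits the image span with these tokens.
            return "<|begin_of_image|><|endoftext|><|end_of_image|>"
        if model_type == "phi3_v":
            # Workaround since this token is not defined in the tokenizer.
            return f"<|image_{current_count}|>"
        if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
            # These models take no image tokens in the prompt; before this
            # commit, "chatglm" wrongly fell into this branch.
            return None
        raise NotImplementedError(f"unhandled model type: {model_type}")

    # Before the fix, GLM4V prompts contained no image placeholder at all;
    # after it, the special-token span is inserted into the chat text:
    assert image_placeholder("chatglm", 1) == (
        "<|begin_of_image|><|endoftext|><|end_of_image|>")
    assert image_placeholder("blip-2", 1) is None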