[Bugfix] Fix prompt format of GLM4V (#14539)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-03-13 19:37:17 +08:00
Committed by: GitHub
Parent: b1cc4dfef5
Commit: f53a0586b9
7 changed files with 43 additions and 19 deletions

vllm/entrypoints/chat_utils.py

@@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         hf_config = self._model_config.hf_config
         model_type = hf_config.model_type
 
-        if modality in ["image", "image_embeds"]:
+        if modality in ("image", "image_embeds"):
+            if model_type == "chatglm":
+                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
@@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
-                              "pixtral"):
+            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
                 # These models do not use image tokens in the prompt
                 return None
             if model_type == "qwen":
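
For context, a minimal self-contained sketch of the placeholder selection after this patch (image_placeholder is a hypothetical standalone helper, not vLLM's actual API): the fix gives "chatglm" its own branch, so GLM4V prompts now receive explicit image tokens instead of falling through to the branch that returns no placeholder at all.

    from typing import Optional

    def image_placeholder(model_type: str, current_count: int) -> Optional[str]:
        # Hypothetical mirror of BaseMultiModalItemTracker's placeholder logic.
        if model_type == "chatglm":
            # New in this commit: GLM4V delimits the image span with these tokens.
            return "<|begin_of_image|><|endoftext|><|end_of_image|>"
        if model_type == "phi3_v":
            # Workaround since this token is not defined in the tokenizer.
            return f"<|image_{current_count}|>"
        if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
            # These models take no image tokens in the prompt; before this
            # commit, "chatglm" wrongly fell into this branch.
            return None
        raise NotImplementedError(f"unhandled model type: {model_type}")

    # Before the fix, GLM4V prompts contained no image placeholder at all;
    # after it, the special-token span is inserted into the chat text:
    assert image_placeholder("chatglm", 1) == (
        "<|begin_of_image|><|endoftext|><|end_of_image|>")
    assert image_placeholder("blip-2", 1) is None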