[VLM] Merged multi-modal processor for GLM4V (#12449)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-02-09 04:32:16 +08:00
parent fe743b798d
commit 86222a3dab
4 changed files with 237 additions and 182 deletions
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -106,7 +106,9 @@ def run_glm4v(question: str, modality: str):
              trust_remote_code=True,
              enforce_eager=True,
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
-    prompt = question
+    prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
+        {question}<|assistant|>"
+
    stop_token_ids = [151329, 151336, 151338]
    return llm, prompt, stop_token_ids