[Model] Broadcast Ovis2 implementation to fit Ovis1.6 (#17861)

Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-12 08:56:30 +08:00
parent 7de18d541b
commit 021c16c7ca
16 changed files with 330 additions and 212 deletions
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -678,12 +678,8 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


-def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.visual_tokenizer.to(hf_model.dtype)
-    hf_model.model.vte.to(hf_model.dtype)
-    hf_model.model.llm.to(hf_model.dtype)
-
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.llm.get_output_embeddings()

@@ -691,7 +687,16 @@ def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        text_tokenizer = hf_model.model.get_text_tokenizer()
        images = [images] if isinstance(images, Image) else images

-        text = text.split("<|im_start|>user\n")[1].split("<|im_end|>\n")[0]
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                text = text.split(start)[1].split(end)[0]
+                break

        prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
            text_or_conversations=text, images=images)