[VLM] Implement merged multimodal processor for Mllama (#11427)

This commit is contained in:
Isotr0py
2025-02-13 12:26:21 +08:00
committed by GitHub
parent d88c8666a1
commit bc55d13070
8 changed files with 444 additions and 221 deletions

View File

@@ -85,6 +85,14 @@ def _test_processing_correctness(
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
tokenizer_encode_kwargs = {}
if model_config.hf_config.model_type == "mllama":
# For Mllama, the tokenizer will always add bos_token at the beginning of
# the prompt by default, causing hf_processor to output incorrect token ids.
# So we need to use `add_special_tokens=False` here to leave the bos_token
# to be added by the processor.
tokenizer_encode_kwargs = {"add_special_tokens": False}
for batch_idx in range(num_batches):
mm_data = {
k:
@@ -122,7 +130,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
baseline_tokenized_result = baseline_processor.apply(
tokenizer.encode(prompt),
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
@@ -131,7 +139,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
cached_tokenized_result = cached_processor.apply(
tokenizer.encode(prompt),
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
@@ -155,6 +163,7 @@ def _test_processing_correctness(
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-o-2_6",