[VLM] Merged multi-modal processor for Molmo (#12966)

2025-02-13 20:34:00 +08:00
parent fdcf64d3c6
commit c9d3ecf016
9 changed files with 745 additions and 493 deletions
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -404,11 +404,10 @@ VLM_TEST_SETTINGS = {
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
        test_type=(VLMTestType.IMAGE),
-        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
+        prompt_formatter=identity,
        max_model_len=4096,
        max_num_seqs=2,
-        image_size_factors=[(),(1.0, 1.0, 1.0)],
-        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
+        patch_hf_runner=model_utils.molmo_patch_hf_runner,
        postprocess_inputs=model_utils.molmo_post_processor,
    ),
    # Tests for phi3v currently live in another file because of a bug in