[V1] Fix multimodal profiling for Molmo (#11325)

Signed-off-by: ywang96 <ywang@example.com>
Co-authored-by: ywang96 <ywang@example.com>
2024-12-19 08:27:22 -08:00
parent 6c7f881541
commit 7379b3d4b2
4 changed files with 24 additions and 4 deletions
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -928,7 +928,11 @@ def image_input_mapper_for_molmo(
    data: object,
 ):
    if isinstance(data, list):
+        assert len(data) == 1, "Molmo supports only one image per prompt."
        data = data[0]
+
+    # Remove unused dummy PIL image
+    data.pop('raw_mm_data', None)
    return MultiModalKwargs(data)


@@ -974,6 +978,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
    dummy_imgdata = {
        "images": out["images"],
        "image_input_idx": out["image_input_idx"],
+        "raw_mm_data": dummy_image,
    }
    if "image_masks" in out:
        dummy_imgdata["image_masks"] = out["image_masks"]