Enable conversion of multimodal models to pooling tasks (#24451)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-12 00:30:41 -03:00
parent 6a50eaa0d3
commit e090b7b45b
5 changed files with 282 additions and 75 deletions
--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.platforms import current_platform
+
+
+def test_idefics_multimodal(
+    vllm_runner,
+    monkeypatch,
+) -> None:  # Idefics3 with load_format="dummy", run as a pooling classifier
+    if current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention,
+        # so switch to the ROCm CK FA backend instead.
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
+                     runner="pooling",
+                     task="classify",
+                     convert="classify",
+                     load_format="dummy",
+                     max_model_len=512,
+                     enforce_eager=True,
+                     tensor_parallel_size=1,
+                     disable_log_stats=True,
+                     dtype="bfloat16") as vllm_model:
+        llm = vllm_model.get_llm()
+        outputs = llm.classify(prompts)
+        for output in outputs:  # only the output size is checked, not values
+            assert len(output.outputs.probs) == 2
+
+
+def update_config(config):  # hf_overrides hook; mutates text_config in place
+    config.text_config.update({
+        "architectures": ["Gemma3ForSequenceClassification"],
+        "classifier_from_token": ["A", "B", "C", "D", "E"],
+        "method":
+        "no_post_processing",
+        "id2label": {
+            "A": "Chair",
+            "B": "Couch",
+            "C": "Table",
+            "D": "Bed",
+            "E": "Cupboard"
+        },
+    })
+    return config
+
+
+def test_gemma_multimodal(
+    vllm_runner,
+    monkeypatch,
+) -> None:  # Gemma 3 classifies an image + text chat prompt into A-E
+    if current_platform.is_rocm():
+        # ROCm Triton FA does not currently support sliding window attention,
+        # so switch to the ROCm CK FA backend instead.
+        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    messages = [{
+        "role":
+        "system",
+        "content":
+        """
+    You are a helpful assistant. You will be given a product description
+    which may also include an image. Classify the following product into
+    one of the categories:
+
+    A = chair
+    B = couch
+    C = table
+    D = bed
+    E = cupboard
+
+    You'll answer with exactly one letter (A, B, C, D, or E)."""
+    }, {
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url":
+                "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
+            }
+        }, {
+            "type": "text",
+            "text": "A fine 19th century piece of furniture."
+        }]
+    }]
+
+    with vllm_runner(model_name="google/gemma-3-4b-it",
+                     runner="pooling",
+                     task="classify",
+                     convert="classify",
+                     load_format="auto",
+                     hf_overrides=update_config,
+                     override_pooler_config={"pooling_type": "LAST"},
+                     max_model_len=512,
+                     enforce_eager=True,
+                     tensor_parallel_size=1,
+                     disable_log_stats=True,
+                     dtype="bfloat16") as vllm_model:
+
+        llm = vllm_model.get_llm()
+        prompts = llm.preprocess_chat(messages)
+
+        result = llm.classify(prompts)
+        assert result[0].outputs.probs[0] > 0.95  # presumably "A" (chair)
+        assert all(c < 0.05 for c in result[0].outputs.probs[1:])