Enable conversion of multimodal models to pooling tasks (#24451)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-12 00:30:41 -03:00
parent 6a50eaa0d3
commit e090b7b45b
5 changed files with 282 additions and 75 deletions
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -19,10 +19,11 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.models.adapters import (as_embedding_model,
-                                                 as_reward_model,
-                                                 as_seq_cls_model)
-from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.model_executor.models.adapters import (
+    as_embedding_model, as_reward_model, as_seq_cls_model,
+    try_create_mm_pooling_model_cls)
+from vllm.model_executor.models.interfaces import (SupportsQuant,
+                                                   supports_multimodal)
 from vllm.utils import is_pin_memory_available

 logger = init_logger(__name__)
@@ -183,6 +184,15 @@ def get_model_architecture(
                "performance may not be optimal.", arch)

    convert_type = model_config.convert_type
+    if convert_type != "none" and supports_multimodal(model_cls):
+        logger.debug_once("Detected conversion of Multi Modal model.")
+        converted = try_create_mm_pooling_model_cls(model_cls)
+        if converted is not None:
+            logger.debug_once("Creating wrapper class to forward pooler.")
+            return converted, arch
+        else:
+            logger.debug_once("Attempting direct conversion.")
+
    if convert_type == "none":
        pass
    elif convert_type == "embed":