Enable conversion of multimodal models to pooling tasks (#24451)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
This commit is contained in:
Maximilien de Bayser
2025-09-12 00:30:41 -03:00
committed by GitHub
parent 6a50eaa0d3
commit e090b7b45b
5 changed files with 282 additions and 75 deletions

View File

@@ -19,10 +19,11 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.adapters import (
as_embedding_model, as_reward_model, as_seq_cls_model,
try_create_mm_pooling_model_cls)
from vllm.model_executor.models.interfaces import (SupportsQuant,
supports_multimodal)
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
@@ -183,6 +184,15 @@ def get_model_architecture(
"performance may not be optimal.", arch)
convert_type = model_config.convert_type
if convert_type != "none" and supports_multimodal(model_cls):
logger.debug_once("Detected conversion of Multi Modal model.")
converted = try_create_mm_pooling_model_cls(model_cls)
if converted is not None:
logger.debug_once("Creating wrapper class to forward pooler.")
return converted, arch
else:
logger.debug_once("Attempting direct conversion.")
if convert_type == "none":
pass
elif convert_type == "embed":