[Model] Support VLMs with transformers backend (#20543)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 9499e26e2a (parent 51ba839555), committed via GitHub
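The hunks below generalize the Transformers fallback path so that multimodal architectures can use it as well: the resolved architecture is no longer the hard-coded "TransformersForCausalLM" string but whatever backend class ModelConfig._get_transformers_backend_cls() returns. As a minimal usage sketch (not taken from this commit), assuming model_impl is forwarded to the engine arguments and using an illustrative HF vision-language checkpoint:

# Minimal sketch, not from this commit: force the Transformers backend for a VLM.
# The checkpoint name is illustrative; model_impl is assumed to be accepted as an
# engine argument ("auto" would only fall back when no native impl exists).
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # any HF VLM with a compatible implementation
    model_impl="transformers",
)

outputs = llm.generate(["Describe what a vision-language model does."],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)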
@@ -25,6 +25,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
                                                   as_reward_model,
                                                   as_seq_cls_model)
 from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.model_executor.models.registry import _TRANSFORMERS_MODELS
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -169,9 +170,22 @@ def device_loading_context(module: torch.nn.Module,
 
 def resolve_transformers_arch(model_config: ModelConfig,
                               architectures: list[str]):
+    if model_config.model_impl == ModelImpl.VLLM:
+        raise ValueError(
+            "Attempting to resolve architecture from the Transformers library "
+            "but the model implementation is set to vLLM. This should never "
+            "happen.")
+
     for i, arch in enumerate(architectures):
-        if arch == "TransformersForCausalLM":
+        if arch in _TRANSFORMERS_MODELS:
             continue
+
+        if model_config.model_impl == ModelImpl.AUTO:
+            logger.warning(
+                "%s has no vLLM implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.", arch)
+
         auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
                                            None) or dict()
         # Make sure that config class is always initialized before model class,
@@ -199,25 +213,13 @@ def resolve_transformers_arch(model_config: ModelConfig,
                     "not present in the model config's 'auto_map' (relevant "
                     "if the model is custom).")
             model_module = auto_modules["AutoModel"]
-        # TODO(Isotr0py): Further clean up these raises.
-        # perhaps handled them in _ModelRegistry._raise_for_unsupported?
-        if model_config.model_impl == ModelImpl.TRANSFORMERS:
-            if not model_module.is_backend_compatible():
-                raise ValueError(
-                    f"The Transformers implementation of {arch} is not "
-                    "compatible with vLLM.")
-            architectures[i] = "TransformersForCausalLM"
-        if model_config.model_impl == ModelImpl.AUTO:
-            if not model_module.is_backend_compatible():
-                raise ValueError(
-                    f"{arch} has no vLLM implementation and the Transformers "
-                    "implementation is not compatible with vLLM. Try setting "
-                    "VLLM_USE_V1=0.")
-            logger.warning(
-                "%s has no vLLM implementation, falling back to Transformers "
-                "implementation. Some features may not be supported and "
-                "performance may not be optimal.", arch)
-            architectures[i] = "TransformersForCausalLM"
+
+        if not model_module.is_backend_compatible():
+            raise ValueError(
+                f"The Transformers implementation of '{arch}' is not "
+                "compatible with vLLM.")
+
+        architectures[i] = model_config._get_transformers_backend_cls()
     return architectures
 
 
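The resolved architecture now comes from model_config._get_transformers_backend_cls() rather than the fixed "TransformersForCausalLM" string. A hypothetical sketch of how such a selector might distinguish text-only from multimodal configs; the vision_config heuristic and the TransformersForMultimodalLM name are assumptions for illustration, not the actual vLLM implementation:

# Hypothetical sketch, not the vLLM source: pick the Transformers wrapper class
# from the HF config shape.  VLM configs typically nest a vision tower config,
# so its presence is used here as a stand-in signal for "multimodal".
def _get_transformers_backend_cls(hf_config) -> str:
    if getattr(hf_config, "vision_config", None) is not None:
        return "TransformersForMultimodalLM"  # assumed name, for illustration only
    return "TransformersForCausalLM"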
@@ -237,8 +239,9 @@ def get_model_architecture(
     ]
 
     vllm_supported_archs = ModelRegistry.get_supported_archs()
-    vllm_not_supported = not any(arch in vllm_supported_archs
-                                 for arch in architectures)
+    is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
+                                 _TRANSFORMERS_MODELS)
+    vllm_not_supported = not any(is_supported(arch) for arch in architectures)
 
     if vllm_not_supported:
         # try automatic conversion in adapters.py
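Because the Transformers wrapper architectures are themselves registered, the new is_supported check excludes them, so a config that already names a wrapper still gets routed through resolve_transformers_arch. A toy illustration of that behavior, with made-up architecture sets:

# Toy illustration, not the vLLM source: fallback wrapper architectures are
# excluded from "natively supported".
_TRANSFORMERS_MODELS = {"TransformersForCausalLM"}          # assumed subset
vllm_supported_archs = {"LlamaForCausalLM", "TransformersForCausalLM"}

is_supported = lambda arch: (arch in vllm_supported_archs
                             and arch not in _TRANSFORMERS_MODELS)

assert is_supported("LlamaForCausalLM")
assert not is_supported("TransformersForCausalLM")
# A model listing only the wrapper therefore still counts as "not supported" here:
assert not any(is_supported(a) for a in ["TransformersForCausalLM"])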
@@ -259,7 +262,7 @@ def get_model_architecture(
             break
 
     if (model_config.model_impl == ModelImpl.TRANSFORMERS or
-            model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
+            model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
         architectures = resolve_transformers_arch(model_config, architectures)
         logger.debug_once("Resolve transformers arch %s", str(architectures))
     elif (model_config.quantization is not None
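The dispatch condition is also tightened from `!= ModelImpl.VLLM` to `== ModelImpl.AUTO`. A small sketch of the resulting rule, using plain strings in place of the ModelImpl enum (not the vLLM source):

# Sketch of the tightened dispatch above: an explicit request always resolves
# the Transformers backend; "auto" falls back only when no architecture has a
# native vLLM implementation.
def should_resolve_transformers(model_impl: str, vllm_not_supported: bool) -> bool:
    if model_impl == "transformers":
        return True
    return model_impl == "auto" and vllm_not_supported

assert should_resolve_transformers("transformers", False)
assert should_resolve_transformers("auto", True)
assert not should_resolve_transformers("auto", False)
assert not should_resolve_transformers("vllm", True)

Assuming ModelImpl only has the auto/vllm/transformers members, this matches the old `model_impl != ModelImpl.VLLM and vllm_not_supported` behavior while stating the auto-fallback intent explicitly.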