[Model] Extend collect_children and no_init_weights contexts (#32757)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-22 16:20:27 +08:00
parent 1bf1a34b19
commit 2b8a38b6d6
20 changed files with 444 additions and 257 deletions
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
 )
-from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
+from vllm.model_executor.models.interfaces import SupportsQuant
 from vllm.utils.platform_utils import is_pin_memory_available

 logger = init_logger(__name__)
@@ -165,11 +165,7 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]()


 def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
-    from vllm.model_executor.models.adapters import (
-        as_embedding_model,
-        as_seq_cls_model,
-        try_create_mm_pooling_model_cls,
-    )
+    from vllm.model_executor.models.adapters import as_embedding_model, as_seq_cls_model

    architectures = getattr(model_config.hf_config, "architectures", [])

@@ -189,15 +185,6 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
            )

    convert_type = model_config.convert_type
-    if convert_type != "none" and supports_multimodal(model_cls):
-        logger.debug_once("Detected conversion of Multi Modal model.")
-        converted = try_create_mm_pooling_model_cls(model_cls)
-        if converted is not None:
-            logger.debug_once("Creating wrapper class to forward pooler.")
-            return converted, arch
-        else:
-            logger.debug_once("Attempting direct conversion.")
-
    if convert_type == "none":
        pass
    elif convert_type == "embed":