[Model] Support VLMs with transformers backend (#20543)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 9499e26e2a (parent 51ba839555), committed via GitHub
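The hunks below generalize the Transformers fallback path so that multimodal architectures can use it as well: the resolved architecture is no longer the hard-coded "TransformersForCausalLM" string but whatever backend class ModelConfig._get_transformers_backend_cls() returns. As a minimal usage sketch (not taken from this commit), assuming model_impl is forwarded to the engine arguments and using an illustrative HF vision-language checkpoint:

# Minimal sketch, not from this commit: force the Transformers backend for a VLM.
# The checkpoint name is illustrative; model_impl is assumed to be accepted as an
# engine argument ("auto" would only fall back when no native impl exists).
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # any HF VLM with a compatible implementation
    model_impl="transformers",
)

outputs = llm.generate(["Describe what a vision-language model does."],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)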
@@ -25,6 +25,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
                                                   as_reward_model,
                                                   as_seq_cls_model)
 from vllm.model_executor.models.interfaces import SupportsQuant
+from vllm.model_executor.models.registry import _TRANSFORMERS_MODELS
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -169,9 +170,22 @@ def device_loading_context(module: torch.nn.Module,
 
 def resolve_transformers_arch(model_config: ModelConfig,
                               architectures: list[str]):
+    if model_config.model_impl == ModelImpl.VLLM:
+        raise ValueError(
+            "Attempting to resolve architecture from the Transformers library "
+            "but the model implementation is set to vLLM. This should never "
+            "happen.")
+
     for i, arch in enumerate(architectures):
-        if arch == "TransformersForCausalLM":
+        if arch in _TRANSFORMERS_MODELS:
             continue
+
+        if model_config.model_impl == ModelImpl.AUTO:
+            logger.warning(
+                "%s has no vLLM implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.", arch)
+
         auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
                                            None) or dict()
         # Make sure that config class is always initialized before model class,
@@ -199,25 +213,13 @@ def resolve_transformers_arch(model_config: ModelConfig,
                     "not present in the model config's 'auto_map' (relevant "
                     "if the model is custom).")
             model_module = auto_modules["AutoModel"]
-        # TODO(Isotr0py): Further clean up these raises.
-        # perhaps handled them in _ModelRegistry._raise_for_unsupported?
-        if model_config.model_impl == ModelImpl.TRANSFORMERS:
-            if not model_module.is_backend_compatible():
-                raise ValueError(
-                    f"The Transformers implementation of {arch} is not "
-                    "compatible with vLLM.")
-            architectures[i] = "TransformersForCausalLM"
-        if model_config.model_impl == ModelImpl.AUTO:
-            if not model_module.is_backend_compatible():
-                raise ValueError(
-                    f"{arch} has no vLLM implementation and the Transformers "
-                    "implementation is not compatible with vLLM. Try setting "
-                    "VLLM_USE_V1=0.")
-            logger.warning(
-                "%s has no vLLM implementation, falling back to Transformers "
-                "implementation. Some features may not be supported and "
-                "performance may not be optimal.", arch)
-            architectures[i] = "TransformersForCausalLM"
+
+        if not model_module.is_backend_compatible():
+            raise ValueError(
+                f"The Transformers implementation of '{arch}' is not "
+                "compatible with vLLM.")
+
+        architectures[i] = model_config._get_transformers_backend_cls()
     return architectures
 
 
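The resolved architecture now comes from model_config._get_transformers_backend_cls() rather than the fixed "TransformersForCausalLM" string. A hypothetical sketch of how such a selector might distinguish text-only from multimodal configs; the vision_config heuristic and the TransformersForMultimodalLM name are assumptions for illustration, not the actual vLLM implementation:

# Hypothetical sketch, not the vLLM source: pick the Transformers wrapper class
# from the HF config shape.  VLM configs typically nest a vision tower config,
# so its presence is used here as a stand-in signal for "multimodal".
def _get_transformers_backend_cls(hf_config) -> str:
    if getattr(hf_config, "vision_config", None) is not None:
        return "TransformersForMultimodalLM"  # assumed name, for illustration only
    return "TransformersForCausalLM"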
@@ -237,8 +239,9 @@ def get_model_architecture(
     ]
 
     vllm_supported_archs = ModelRegistry.get_supported_archs()
-    vllm_not_supported = not any(arch in vllm_supported_archs
-                                 for arch in architectures)
+    is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
+                                 _TRANSFORMERS_MODELS)
+    vllm_not_supported = not any(is_supported(arch) for arch in architectures)
 
     if vllm_not_supported:
         # try automatic conversion in adapters.py
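Because the Transformers wrapper architectures are themselves registered, the new is_supported check excludes them, so a config that already names a wrapper still gets routed through resolve_transformers_arch. A toy illustration of that behavior, with made-up architecture sets:

# Toy illustration, not the vLLM source: fallback wrapper architectures are
# excluded from "natively supported".
_TRANSFORMERS_MODELS = {"TransformersForCausalLM"}          # assumed subset
vllm_supported_archs = {"LlamaForCausalLM", "TransformersForCausalLM"}

is_supported = lambda arch: (arch in vllm_supported_archs
                             and arch not in _TRANSFORMERS_MODELS)

assert is_supported("LlamaForCausalLM")
assert not is_supported("TransformersForCausalLM")
# A model listing only the wrapper therefore still counts as "not supported" here:
assert not any(is_supported(a) for a in ["TransformersForCausalLM"])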
@@ -259,7 +262,7 @@ def get_model_architecture(
             break
 
     if (model_config.model_impl == ModelImpl.TRANSFORMERS or
-            model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
+            model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
         architectures = resolve_transformers_arch(model_config, architectures)
         logger.debug_once("Resolve transformers arch %s", str(architectures))
     elif (model_config.quantization is not None
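The dispatch condition is also tightened from `!= ModelImpl.VLLM` to `== ModelImpl.AUTO`. A small sketch of the resulting rule, using plain strings in place of the ModelImpl enum (not the vLLM source):

# Sketch of the tightened dispatch above: an explicit request always resolves
# the Transformers backend; "auto" falls back only when no architecture has a
# native vLLM implementation.
def should_resolve_transformers(model_impl: str, vllm_not_supported: bool) -> bool:
    if model_impl == "transformers":
        return True
    return model_impl == "auto" and vllm_not_supported

assert should_resolve_transformers("transformers", False)
assert should_resolve_transformers("auto", True)
assert not should_resolve_transformers("auto", False)
assert not should_resolve_transformers("vllm", True)

Assuming ModelImpl only has the auto/vllm/transformers members, this matches the old `model_impl != ModelImpl.VLLM and vllm_not_supported` behavior while stating the auto-fallback intent explicitly.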