[Model] Composite weight loading for multimodal Qwen2 (#10944)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Committed by: GitHub
Date: 2024-12-07 22:22:52 +08:00
Parent: b26b4cd03c
Commit: bf0e382e16
7 changed files with 148 additions and 206 deletions


@@ -17,7 +17,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
 from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available
+from vllm.utils import is_pin_memory_available, print_warning_once

 logger = init_logger(__name__)
@@ -251,12 +251,15 @@ def init_vllm_registered_model(
     """
     from vllm.model_executor.model_loader.loader import _initialize_model

-    if hf_config is not None:
-        vllm_config = vllm_config.with_hf_config(hf_config)
+    if hf_config is None and architectures is not None:
+        # So that the architectures field is overridden
+        hf_config = vllm_config.model_config.hf_config

-    return _initialize_model(vllm_config=vllm_config,
-                             prefix=prefix,
-                             architectures=architectures)
+    if hf_config is not None:
+        vllm_config = vllm_config.with_hf_config(hf_config,
+                                                 architectures=architectures)
+
+    return _initialize_model(vllm_config=vllm_config, prefix=prefix)


 @overload
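The hunk above changes how the inner (registered) model is configured: instead of threading `architectures` through to `_initialize_model`, the override is now folded into the HF config via `with_hf_config`. A minimal, self-contained sketch of that control flow, using stand-in classes rather than vLLM's real `VllmConfig`/`ModelConfig` objects:

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class HFConfigStub:
    # Stand-in for a transformers PretrainedConfig; only the field that
    # matters here is modeled.
    architectures: tuple = ("SomeOriginalArch",)

@dataclass
class ModelConfigStub:
    hf_config: HFConfigStub

@dataclass
class VllmConfigStub:
    model_config: ModelConfigStub

    def with_hf_config(self, hf_config, architectures=None):
        # Mimics the behaviour the hunk relies on: optionally override the
        # architectures field, then attach the HF config to a new config.
        if architectures is not None:
            hf_config = replace(hf_config, architectures=tuple(architectures))
        return VllmConfigStub(ModelConfigStub(hf_config))

def resolve_config(vllm_config, hf_config=None, architectures=None):
    # New flow from the hunk: when only `architectures` is given, reuse the
    # existing HF config so its architectures field gets overridden, instead
    # of forwarding `architectures` down to _initialize_model.
    if hf_config is None and architectures is not None:
        hf_config = vllm_config.model_config.hf_config
    if hf_config is not None:
        vllm_config = vllm_config.with_hf_config(
            hf_config, architectures=architectures)
    return vllm_config

cfg = VllmConfigStub(ModelConfigStub(HFConfigStub()))
new_cfg = resolve_config(cfg, architectures=["Qwen2ForCausalLM"])
print(new_cfg.model_config.hf_config.architectures)  # ('Qwen2ForCausalLM',)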
@@ -592,7 +595,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend:
             if is_flash_attn_2_available():
                 selected_backend = _Backend.FLASH_ATTN
             else:
-                logger.warning(
+                print_warning_once(
                     "Current `vllm-flash-attn` has a bug inside vision module, "
                     "so we use xformers backend instead. You can run "
                     "`pip install flash-attn` to use flash-attention backend.")
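This hunk swaps `logger.warning` for `print_warning_once`, so the vision-backend fallback message is emitted once instead of on every call. For context, a rough sketch of what a warn-once helper of this kind can look like (the body below is an assumption for illustration, not vLLM's actual `print_warning_once` source):

import logging
from functools import lru_cache

logger = logging.getLogger("vllm")

@lru_cache(maxsize=None)
def print_warning_once(msg: str) -> None:
    # Caching on the message means each distinct warning is logged at most
    # once per process, even if the call site is hit repeatedly.
    logger.warning(msg)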