[Bugfix] Fix Qwen2-VL LoRA weight loading (#11430)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -28,7 +28,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor,
|
||||
parse_fine_tuned_lora_name, replace_submodule)
|
||||
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.utils import PPMissingLayer
|
||||
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@@ -113,13 +113,14 @@ class LoRAModel(AdapterModel):
|
||||
target_embedding_padding: Optional[int] = None,
|
||||
embedding_modules: Optional[Dict[str, str]] = None,
|
||||
embedding_padding_modules: Optional[List[str]] = None,
|
||||
weights_mapper: Optional[WeightsMapper] = None,
|
||||
) -> "LoRAModel":
|
||||
"""Create a LoRAModel from a dictionary of tensors."""
|
||||
pin_memory = str(device) == "cpu" and is_pin_memory_available()
|
||||
loras: Dict[str, LoRALayerWeights] = {}
|
||||
for tensor_name, tensor in tensors.items():
|
||||
module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name(
|
||||
tensor_name)
|
||||
tensor_name, weights_mapper)
|
||||
if module_name not in loras:
|
||||
lora_embeddings_tensor = None
|
||||
if embeddings:
|
||||
@@ -187,6 +188,7 @@ class LoRAModel(AdapterModel):
|
||||
target_embedding_padding: Optional[int] = None,
|
||||
embedding_modules: Optional[Dict[str, str]] = None,
|
||||
embedding_padding_modules: Optional[List[str]] = None,
|
||||
weights_mapper: Optional[WeightsMapper] = None,
|
||||
) -> "LoRAModel":
|
||||
"""Create a LoRAModel from a local checkpoint.
|
||||
|
||||
@@ -289,7 +291,8 @@ class LoRAModel(AdapterModel):
|
||||
embeddings=embeddings,
|
||||
target_embedding_padding=target_embedding_padding,
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embedding_padding_modules)
|
||||
embedding_padding_modules=embedding_padding_modules,
|
||||
weights_mapper=weights_mapper)
|
||||
|
||||
|
||||
class LoRAModelManager(AdapterModelManager):
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
from typing import List, Optional, Set, Tuple, Type, Union
|
||||
@@ -30,6 +31,8 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
# yapf: enable
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -91,28 +94,54 @@ def replace_submodule(model: nn.Module, module_name: str,
|
||||
return new_module
|
||||
|
||||
|
||||
def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]:
|
||||
def parse_fine_tuned_lora_name(
|
||||
name: str,
|
||||
weights_mapper: Optional[WeightsMapper] = None
|
||||
) -> Tuple[str, bool, bool]:
|
||||
"""Parse the name of lora weights.
|
||||
|
||||
args:
|
||||
name: the name of the fine-tuned LoRA, e.g.
|
||||
base_model.model.dense1.weight
|
||||
weights_mapper: maps the name of weight, e.g.
|
||||
`model.` -> `language_model.model.`,
|
||||
return:
|
||||
Tuple(module_name, is_lora_a, is_bias):
|
||||
module_name: the name of the module, e.g. model.dense1,
|
||||
is_lora_a: whether the tensor is lora_a (True) or lora_b (False).
|
||||
is_bias: whether the tensor is a LoRA bias.
|
||||
"""
|
||||
|
||||
w_mapper = None
|
||||
if weights_mapper:
|
||||
w_mapper = copy.deepcopy(weights_mapper)
|
||||
# TODO: Currently only supports mapping for prefix, mapping for
|
||||
# substr and suffix will be supported in the future.
|
||||
for attr, mapping in [
|
||||
("orig_to_new_substr", w_mapper.orig_to_new_substr),
|
||||
("orig_to_new_suffix", w_mapper.orig_to_new_suffix),
|
||||
]:
|
||||
if mapping:
|
||||
print_warning_once(
|
||||
f"vLLM currently does not support mapping of LoRA weights "
|
||||
f"for {mapping}.")
|
||||
setattr(w_mapper, attr, {})
|
||||
|
||||
mapper = (lambda name: w_mapper._map_name(name)
|
||||
if w_mapper is not None else name)
|
||||
parts = name.split(".")
|
||||
if parts[-1] == "weight" and (parts[-2] == "lora_A"
|
||||
or parts[-2] == "lora_B"):
|
||||
return ".".join(parts[2:-2]), parts[-2] == "lora_A", False
|
||||
new_name = ".".join(parts[2:-2])
|
||||
return mapper(new_name), parts[-2] == "lora_A", False
|
||||
|
||||
if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
|
||||
return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False
|
||||
new_name = ".".join(parts[2:-1])
|
||||
return mapper(new_name), parts[-1] == "lora_embedding_A", False
|
||||
|
||||
if parts[-1] == "bias":
|
||||
return ".".join(parts[2:-2]), False, True
|
||||
new_name = ".".join(parts[2:-2])
|
||||
return mapper(new_name), False, True
|
||||
|
||||
raise ValueError(f"{name} is unsupported LoRA weight")
|
||||
|
||||
|
||||
@@ -92,6 +92,14 @@ class WorkerLoRAManager(AbstractWorkerManager):
|
||||
else:
|
||||
expected_lora_modules.append(module)
|
||||
lora_path = get_adapter_absolute_path(lora_request.lora_path)
|
||||
|
||||
# For some models like Qwen2VL, we need to use hf_to_vllm_mapper
|
||||
# to ensure correct loading of lora weights.
|
||||
hf_to_vllm_mapper = None
|
||||
if (hasattr(model, "hf_to_vllm_mapper")
|
||||
and model.hf_to_vllm_mapper is not None):
|
||||
hf_to_vllm_mapper = model.hf_to_vllm_mapper
|
||||
|
||||
lora = self._lora_model_cls.from_local_checkpoint(
|
||||
lora_path,
|
||||
expected_lora_modules,
|
||||
@@ -103,7 +111,8 @@ class WorkerLoRAManager(AbstractWorkerManager):
|
||||
self.lora_config.lora_extra_vocab_size,
|
||||
embedding_modules=self.embedding_modules,
|
||||
embedding_padding_modules=self.embedding_padding_modules,
|
||||
)
|
||||
weights_mapper=hf_to_vllm_mapper)
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Loading lora {lora_path} failed") from e
|
||||
if lora.rank > self.lora_config.max_lora_rank:
|
||||
|
||||
@@ -901,6 +901,11 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
]
|
||||
embedding_modules = {}
|
||||
embedding_padding_modules = []
|
||||
# To ensure correct weight loading and mapping.
|
||||
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
|
||||
"lm_head.": "language_model.lm_head.",
|
||||
"model.": "language_model.model.",
|
||||
})
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
@@ -1190,11 +1195,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
"lm_head.": "language_model.lm_head.",
|
||||
"model.": "language_model.model.",
|
||||
})
|
||||
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
|
||||
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
|
||||
|
||||
Reference in New Issue
Block a user