[LoRA][2/2]Remove LoRA extra vocab (#28545)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, Field, model_validator
|
||||
@@ -11,7 +11,6 @@ from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
@@ -46,19 +45,6 @@ class LoRAConfig:
|
||||
`max_loras`."""
|
||||
lora_dtype: torch.dtype | LoRADType = "auto"
|
||||
"""Data type for LoRA. If auto, will default to base model dtype."""
|
||||
lora_extra_vocab_size: LoRAExtraVocabSize = Field(
|
||||
default=256,
|
||||
deprecated=(
|
||||
"`lora_extra_vocab_size` is deprecated and will be removed "
|
||||
"in v0.12.0. Additional vocabulary support for "
|
||||
"LoRA adapters is being phased out."
|
||||
),
|
||||
)
|
||||
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
|
||||
LoRA adapter. Will be removed in v0.12.0."""
|
||||
lora_vocab_padding_size: ClassVar[int] = (
|
||||
current_platform.get_lora_vocab_padding_size()
|
||||
)
|
||||
default_mm_loras: dict[str, str] | None = None
|
||||
"""Dictionary mapping specific modalities to LoRA model paths; this field
|
||||
is only applicable to multimodal models and should be leveraged when a
|
||||
@@ -87,8 +73,6 @@ class LoRAConfig:
|
||||
factors.append(self.max_loras)
|
||||
factors.append(self.fully_sharded_loras)
|
||||
factors.append(self.lora_dtype)
|
||||
factors.append(self.lora_extra_vocab_size)
|
||||
factors.append(self.lora_vocab_padding_size)
|
||||
|
||||
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
@@ -484,7 +484,6 @@ class EngineArgs:
|
||||
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
|
||||
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
|
||||
lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
|
||||
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
|
||||
|
||||
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
||||
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
|
||||
@@ -1011,9 +1010,6 @@ class EngineArgs:
|
||||
)
|
||||
lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
|
||||
lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"])
|
||||
lora_group.add_argument(
|
||||
"--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"]
|
||||
)
|
||||
lora_group.add_argument(
|
||||
"--lora-dtype",
|
||||
**lora_kwargs["lora_dtype"],
|
||||
@@ -1680,7 +1676,6 @@ class EngineArgs:
|
||||
max_loras=self.max_loras,
|
||||
default_mm_loras=self.default_mm_loras,
|
||||
fully_sharded_loras=self.fully_sharded_loras,
|
||||
lora_extra_vocab_size=self.lora_extra_vocab_size,
|
||||
lora_dtype=self.lora_dtype,
|
||||
max_cpu_loras=self.max_cpu_loras
|
||||
if self.max_cpu_loras and self.max_cpu_loras > 0
|
||||
|
||||
@@ -44,7 +44,6 @@ class BaseLayerWithLoRA(nn.Module):
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
):
|
||||
"""Overwrites lora tensors at index."""
|
||||
...
|
||||
|
||||
@@ -96,7 +96,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
):
|
||||
# Except for QKVParallelLinearWithLoRA and
|
||||
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
|
||||
|
||||
@@ -248,7 +248,6 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
):
|
||||
self.reset_lora(index)
|
||||
|
||||
|
||||
@@ -406,8 +406,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
bias: torch.Tensor | None = None,
|
||||
):
|
||||
"""Overwrites lora tensors at index."""
|
||||
self.reset_lora(index)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -108,22 +107,13 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
|
||||
(
|
||||
max_loras,
|
||||
1,
|
||||
# Pad for kernel compatibility
|
||||
math.ceil(
|
||||
self.base_layer.vocab_size / lora_config.lora_vocab_padding_size
|
||||
)
|
||||
* lora_config.lora_vocab_padding_size,
|
||||
self.base_layer.vocab_size,
|
||||
lora_config.max_lora_rank,
|
||||
),
|
||||
dtype=lora_config.lora_dtype,
|
||||
device=self.device,
|
||||
)
|
||||
self.embeddings_tensors = torch.full(
|
||||
(max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
|
||||
fill_value=float("-inf"),
|
||||
dtype=self.dtype,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
if self.sharded_to_full_mapping is not None:
|
||||
self.sharded_to_full_mapping_gpu = torch.tensor(
|
||||
self.sharded_to_full_mapping, device=self.device, dtype=torch.long
|
||||
@@ -134,14 +124,12 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
|
||||
def reset_lora(self, index: int):
|
||||
self.lora_a_stacked[index] = 0
|
||||
self.lora_b_stacked[index] = 0
|
||||
self.embeddings_tensors[index] = float("-inf")
|
||||
|
||||
def set_lora(
|
||||
self,
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
):
|
||||
self.reset_lora(index)
|
||||
self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
|
||||
@@ -150,12 +138,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
|
||||
self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
|
||||
lora_b, non_blocking=True
|
||||
)
|
||||
if embeddings_tensor is not None:
|
||||
self.embeddings_tensors[
|
||||
index,
|
||||
: embeddings_tensor.shape[0],
|
||||
: embeddings_tensor.shape[1],
|
||||
] = embeddings_tensor
|
||||
|
||||
def _get_logits(
|
||||
self,
|
||||
@@ -193,39 +175,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
|
||||
# token_id: [0, 1, 2, 3, 4, 5, -1, -1]
|
||||
logits = logits[:, self.sharded_to_full_mapping_gpu]
|
||||
|
||||
lora_logits = torch.empty(
|
||||
self.embeddings_tensors.shape[0] + 1,
|
||||
self.embeddings_tensors.shape[1],
|
||||
hidden_states.shape[0],
|
||||
dtype=self.embeddings_tensors.dtype,
|
||||
device=self.embeddings_tensors.device,
|
||||
)
|
||||
torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1])
|
||||
|
||||
neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype)
|
||||
|
||||
lora_logits[-1] = neg_inf
|
||||
lora_logits = lora_logits.mT
|
||||
indices_padded = self.punica_wrapper.sampler_indices_padded
|
||||
|
||||
if current_platform.is_tpu() or current_platform.is_xpu():
|
||||
indices_padded = indices_padded[: logits.size(0)]
|
||||
|
||||
lora_logits = (
|
||||
lora_logits.reshape(
|
||||
lora_logits.shape[0] * lora_logits.shape[1],
|
||||
lora_logits.shape[2],
|
||||
)
|
||||
.index_select(0, indices_padded)
|
||||
.nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf)
|
||||
)
|
||||
|
||||
logits[
|
||||
:,
|
||||
self.base_layer.org_vocab_size : self.base_layer.org_vocab_size
|
||||
+ lora_logits.shape[1],
|
||||
] = lora_logits
|
||||
|
||||
lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits(
|
||||
logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0
|
||||
)
|
||||
|
||||
@@ -46,19 +46,10 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
|
||||
self.embeddings_slice = None
|
||||
self.embeddings_weights = None
|
||||
|
||||
self.embeddings_tensors = torch.zeros(
|
||||
(
|
||||
max_loras,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
self.base_layer.embedding_dim,
|
||||
),
|
||||
dtype=self.base_layer.weight.dtype,
|
||||
device=self.base_layer.weight.device,
|
||||
)
|
||||
self.lora_a_stacked = torch.zeros(
|
||||
(
|
||||
max_loras,
|
||||
self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size,
|
||||
self.base_layer.org_vocab_size,
|
||||
lora_config.max_lora_rank,
|
||||
),
|
||||
dtype=lora_config.lora_dtype,
|
||||
@@ -82,14 +73,12 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
|
||||
def reset_lora(self, index: int):
|
||||
self.lora_a_stacked[index] = 0
|
||||
self.lora_b_stacked[index] = 0
|
||||
self.embeddings_tensors[index] = 0
|
||||
|
||||
def set_lora(
|
||||
self,
|
||||
index: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None,
|
||||
):
|
||||
self.reset_lora(index)
|
||||
# NOTE self.lora_a_stacked is row-major, and lora_a is col-major,
|
||||
@@ -100,36 +89,18 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
|
||||
self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
|
||||
lora_b, non_blocking=True
|
||||
)
|
||||
if embeddings_tensor is not None:
|
||||
self.embeddings_tensors[
|
||||
index,
|
||||
: embeddings_tensor.shape[0],
|
||||
: embeddings_tensor.shape[1],
|
||||
].copy_(embeddings_tensor, non_blocking=True)
|
||||
if self.embeddings_slice is not None:
|
||||
# TODO(yard1): Optimize this copy, we don't need to copy
|
||||
# everything, just the modified part
|
||||
embeddings = self.embeddings_tensors.view(
|
||||
self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1],
|
||||
self.embeddings_tensors.shape[2],
|
||||
)[self.embeddings_slice[0] : self.embeddings_slice[1]]
|
||||
assert self.embeddings_weights is not None
|
||||
self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0)
|
||||
|
||||
# NB: Don't use torch.narrow here. torch.narrow triggers some
|
||||
# Dynamic Shape specialization in torch.compile
|
||||
num_tokens = x.shape[0]
|
||||
indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens]
|
||||
indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens]
|
||||
|
||||
full_lora_a_embeddings = F.embedding(
|
||||
x + indices_1,
|
||||
self.lora_a_stacked_2d,
|
||||
)
|
||||
full_output = self.base_layer.forward(x + (indices_0 * added_tokens_mask))
|
||||
full_output = self.base_layer.forward(x)
|
||||
|
||||
full_output_org = full_output
|
||||
if full_output.ndim == 3:
|
||||
|
||||
@@ -21,7 +21,6 @@ class LoRALayerWeights:
|
||||
lora_alpha: int,
|
||||
lora_a: torch.Tensor,
|
||||
lora_b: torch.Tensor,
|
||||
embeddings_tensor: torch.Tensor | None = None,
|
||||
scaling: float | None = None,
|
||||
) -> None:
|
||||
self.module_name = module_name
|
||||
@@ -29,7 +28,6 @@ class LoRALayerWeights:
|
||||
self.lora_alpha = lora_alpha
|
||||
self.lora_a = lora_a
|
||||
self.lora_b = lora_b
|
||||
self.embeddings_tensor = embeddings_tensor
|
||||
|
||||
if scaling is None:
|
||||
self.scaling = self.lora_alpha / self.rank
|
||||
@@ -56,18 +54,11 @@ class LoRALayerWeights:
|
||||
def is_packed(self) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def extra_vocab_size(self) -> int:
|
||||
return (
|
||||
self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_config(
|
||||
cls,
|
||||
module_name: str,
|
||||
peft_helper: PEFTHelper,
|
||||
embeddings_tensor: torch.Tensor | None = None,
|
||||
) -> "LoRALayerWeights":
|
||||
# lora_a and lora_b are set to None for config-based construction
|
||||
return cls(
|
||||
@@ -76,7 +67,6 @@ class LoRALayerWeights:
|
||||
peft_helper.lora_alpha,
|
||||
None,
|
||||
None,
|
||||
embeddings_tensor,
|
||||
peft_helper.vllm_lora_scaling_factor,
|
||||
)
|
||||
|
||||
@@ -89,7 +79,6 @@ class LoRALayerWeights:
|
||||
rank: int,
|
||||
dtype: torch.dtype,
|
||||
device: torch.types.Device,
|
||||
embeddings_tensor_dim: int | None = None,
|
||||
) -> "LoRALayerWeights":
|
||||
pin_memory = str(device) == "cpu" and is_pin_memory_available()
|
||||
lora_a = torch.zeros(
|
||||
@@ -99,24 +88,12 @@ class LoRALayerWeights:
|
||||
[output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory
|
||||
)
|
||||
|
||||
embeddings_tensor = (
|
||||
torch.rand(
|
||||
10,
|
||||
embeddings_tensor_dim,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
pin_memory=pin_memory,
|
||||
)
|
||||
if embeddings_tensor_dim
|
||||
else None
|
||||
)
|
||||
return cls(
|
||||
module_name,
|
||||
rank=rank,
|
||||
lora_alpha=1,
|
||||
lora_a=lora_a,
|
||||
lora_b=lora_b,
|
||||
embeddings_tensor=embeddings_tensor,
|
||||
)
|
||||
|
||||
|
||||
@@ -139,7 +116,6 @@ class PackedLoRALayerWeights(LoRALayerWeights):
|
||||
lora_a=lora_a,
|
||||
lora_b=lora_b,
|
||||
scaling=scaling, # type: ignore
|
||||
embeddings_tensor=None,
|
||||
)
|
||||
self.lora_alphas = lora_alphas
|
||||
if scaling is None:
|
||||
|
||||
@@ -21,6 +21,7 @@ from vllm.lora.utils import (
|
||||
from_layer,
|
||||
from_layer_logits_processor,
|
||||
get_supported_lora_modules,
|
||||
is_base_embeddding_weights,
|
||||
is_regex_target_modules,
|
||||
parse_fine_tuned_lora_name,
|
||||
process_packed_modules_mapping,
|
||||
@@ -93,14 +94,6 @@ class LoRAModel:
|
||||
loras=self.loras.copy(),
|
||||
)
|
||||
|
||||
@property
|
||||
def extra_vocab_size(self) -> int:
|
||||
return (
|
||||
max(lora.extra_vocab_size for lora in self.loras.values())
|
||||
if self.loras
|
||||
else 0
|
||||
)
|
||||
|
||||
def get_lora(self, module_name: str) -> LoRALayerWeights | None:
|
||||
"""Get LoRA for a given module by name"""
|
||||
return self.loras.get(module_name, None)
|
||||
@@ -117,7 +110,6 @@ class LoRAModel:
|
||||
peft_helper: PEFTHelper,
|
||||
device: str = "cuda",
|
||||
dtype: torch.dtype | None = None,
|
||||
embeddings: dict[str, torch.Tensor] | None = None,
|
||||
target_embedding_padding: int | None = None,
|
||||
embedding_modules: dict[str, str] | None = None,
|
||||
embedding_padding_modules: list[str] | None = None,
|
||||
@@ -127,24 +119,14 @@ class LoRAModel:
|
||||
pin_memory = str(device) == "cpu" and is_pin_memory_available()
|
||||
loras: dict[str, LoRALayerWeights] = {}
|
||||
for tensor_name, tensor in tensors.items():
|
||||
if is_base_embeddding_weights(tensor_name):
|
||||
continue
|
||||
module_name, is_lora_a = parse_fine_tuned_lora_name(
|
||||
tensor_name, weights_mapper
|
||||
)
|
||||
if module_name not in loras:
|
||||
lora_embeddings_tensor = None
|
||||
if embeddings:
|
||||
assert embedding_modules is not None
|
||||
embeddings_module = next(
|
||||
(k for k in embedding_modules if k in module_name), None
|
||||
)
|
||||
if embeddings_module:
|
||||
lora_embeddings_tensor = embeddings[
|
||||
embedding_modules[embeddings_module]
|
||||
].to(device=device, dtype=dtype)
|
||||
if pin_memory:
|
||||
lora_embeddings_tensor = lora_embeddings_tensor.pin_memory()
|
||||
loras[module_name] = LoRALayerWeights.from_config(
|
||||
module_name, peft_helper, lora_embeddings_tensor
|
||||
module_name, peft_helper
|
||||
)
|
||||
|
||||
if is_lora_a:
|
||||
@@ -206,15 +188,17 @@ class LoRAModel:
|
||||
lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors")
|
||||
lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin")
|
||||
lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt")
|
||||
new_embeddings_tensor_path = os.path.join(
|
||||
lora_dir, "new_embeddings.safetensors"
|
||||
)
|
||||
new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin")
|
||||
# new_embeddings_tensor_path = os.path.join(
|
||||
# lora_dir, "new_embeddings.safetensors"
|
||||
# )
|
||||
# new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin")
|
||||
tensors: dict[str, torch.Tensor] = {}
|
||||
unexpected_modules: list[list[str] | str] = []
|
||||
|
||||
def check_unexpected_modules(modules: dict):
|
||||
for lora_module in modules.keys(): # noqa
|
||||
if is_base_embeddding_weights(lora_module):
|
||||
continue
|
||||
module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper)
|
||||
# Handle FSDP file format where experts.base_layer is the
|
||||
# gate_up_proj and experts is the down_proj
|
||||
@@ -300,21 +284,12 @@ class LoRAModel:
|
||||
else:
|
||||
raise ValueError(f"{lora_dir} doesn't contain tensors")
|
||||
|
||||
embeddings = None
|
||||
if os.path.isfile(new_embeddings_tensor_path):
|
||||
embeddings = safetensors.torch.load_file(new_embeddings_tensor_path)
|
||||
elif os.path.isfile(new_embeddings_bin_file_path):
|
||||
embeddings = torch.load(
|
||||
new_embeddings_bin_file_path, map_location=device, weights_only=True
|
||||
)
|
||||
|
||||
return cls.from_lora_tensors(
|
||||
lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
|
||||
tensors=tensors,
|
||||
peft_helper=peft_helper,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
embeddings=embeddings,
|
||||
target_embedding_padding=target_embedding_padding,
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embedding_padding_modules,
|
||||
@@ -474,7 +449,6 @@ class LoRAModelManager:
|
||||
index,
|
||||
module_lora.lora_a,
|
||||
module_lora.lora_b,
|
||||
module_lora.embeddings_tensor,
|
||||
)
|
||||
else:
|
||||
module.reset_lora(index)
|
||||
@@ -505,7 +479,6 @@ class LoRAModelManager:
|
||||
self.lora_index_to_id,
|
||||
self.lora_slots + 1,
|
||||
self.vocab_size,
|
||||
self.lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
def remove_all_adapters(self):
|
||||
@@ -616,7 +589,6 @@ class LoRAModelManager:
|
||||
if parts[-1] in embedding_modules:
|
||||
input_dim = (
|
||||
module.base_layer.org_vocab_size
|
||||
+ self.lora_config.lora_extra_vocab_size
|
||||
if hasattr(module.base_layer, "org_vocab_size")
|
||||
else module.base_layer.weight.shape[1]
|
||||
)
|
||||
@@ -625,11 +597,6 @@ class LoRAModelManager:
|
||||
if hasattr(module.base_layer, "embedding_dim")
|
||||
else module.base_layer.weight.shape[0]
|
||||
)
|
||||
embeddings_tensor_dim = (
|
||||
module.base_layer.embedding_dim
|
||||
if hasattr(module.base_layer, "embedding_dim")
|
||||
else module.base_layer.weight.shape[1]
|
||||
)
|
||||
lora = LoRALayerWeights.create_dummy_lora_weights(
|
||||
module_name,
|
||||
input_dim,
|
||||
@@ -637,7 +604,6 @@ class LoRAModelManager:
|
||||
rank,
|
||||
module.lora_a_stacked[0].dtype,
|
||||
"cpu",
|
||||
embeddings_tensor_dim=embeddings_tensor_dim,
|
||||
)
|
||||
else:
|
||||
lora = LoRALayerWeights.create_dummy_lora_weights(
|
||||
|
||||
@@ -31,7 +31,6 @@ class PunicaWrapperABC(ABC):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -172,8 +171,11 @@ class PunicaWrapperBase(PunicaWrapperABC):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
):
|
||||
# NOTE: We have removed LoRA extra vocab support for now. So we set
|
||||
# extra_vocab_size always to 0, and extra_vocab_size will be removed.
|
||||
|
||||
extra_vocab_size = 0
|
||||
(
|
||||
base_indices,
|
||||
sampler_indices,
|
||||
@@ -285,12 +287,9 @@ class PunicaWrapperBase(PunicaWrapperABC):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
self._update_base_metadata(
|
||||
mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
|
||||
)
|
||||
self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size)
|
||||
|
||||
if mapping.is_prefill:
|
||||
# Update metadata required for prefill-related operators.
|
||||
|
||||
@@ -65,13 +65,10 @@ class PunicaWrapperGPU(PunicaWrapperBase):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
self.is_prefill = mapping.is_prefill
|
||||
self._update_base_metadata(
|
||||
mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
|
||||
)
|
||||
self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size)
|
||||
|
||||
# Prepare cuda kernel metadata tensors
|
||||
self.token_mapping_meta.prepare_tensors(self.token_lora_indices)
|
||||
|
||||
@@ -292,7 +292,6 @@ class PunicaWrapperTPU(PunicaWrapperBase):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
):
|
||||
# Make sure we don't accidentally collect outside operations
|
||||
torch_xla.sync()
|
||||
@@ -313,7 +312,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
|
||||
lora_index_to_id,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
extra_vocab_size,
|
||||
0, # extra_vocab_size
|
||||
"cpu",
|
||||
)
|
||||
self._token_lora_indices = self._pad_to_shape(
|
||||
|
||||
@@ -43,13 +43,10 @@ class PunicaWrapperXPU(PunicaWrapperBase):
|
||||
lora_index_to_id: list[int | None],
|
||||
max_loras: int,
|
||||
vocab_size: int,
|
||||
extra_vocab_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
self.is_prefill = mapping.is_prefill
|
||||
self._update_base_metadata(
|
||||
mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size
|
||||
)
|
||||
self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size)
|
||||
|
||||
def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor:
|
||||
return torch.narrow(self._token_lora_indices, 0, 0, x.size(0))
|
||||
|
||||
@@ -166,6 +166,16 @@ def parse_fine_tuned_lora_name(
|
||||
raise ValueError(f"{name} is unsupported LoRA weight")
|
||||
|
||||
|
||||
def is_base_embeddding_weights(name: str) -> bool:
    """Return whether ``name`` ends with a base embedding weight suffix.

    The check is a plain suffix match against the hardcoded names of the
    input embedding (``embed_tokens``) and output embedding (``lm_head``)
    base-layer weights.

    Args:
        name: Tensor name to test.

    Returns:
        True if ``name`` ends with ``.embed_tokens.base_layer.weight`` or
        ``.lm_head.base_layer.weight``, False otherwise.
    """
    # Hardcoded suffixes for the input & output embedding weights.
    # ``str.endswith`` accepts a tuple, so both are tested in a single call.
    base_embedding_suffixes = (
        ".embed_tokens.base_layer.weight",
        ".lm_head.base_layer.weight",
    )
    return name.endswith(base_embedding_suffixes)
|
||||
|
||||
|
||||
def is_regex_target_modules(
|
||||
load_modules: str | list[str], expected_lora_modules: list[str]
|
||||
) -> bool:
|
||||
|
||||
@@ -121,8 +121,7 @@ class WorkerLoRAManager:
|
||||
lora_model_id=lora_request.lora_int_id,
|
||||
device="cpu",
|
||||
dtype=self.lora_config.lora_dtype,
|
||||
target_embedding_padding=self.vocab_size
|
||||
+ self.lora_config.lora_extra_vocab_size,
|
||||
target_embedding_padding=self.vocab_size,
|
||||
embedding_modules=self.embedding_modules,
|
||||
embedding_padding_modules=self.embedding_padding_modules,
|
||||
tensorizer_config_dict=lora_request.tensorizer_config_dict,
|
||||
@@ -143,12 +142,6 @@ class WorkerLoRAManager:
|
||||
# For BadRequestError
|
||||
raise e
|
||||
|
||||
if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size:
|
||||
raise ValueError(
|
||||
f"LoRA added vocab size {lora.extra_vocab_size} "
|
||||
f"is greater than lora_extra_vocab_size "
|
||||
f"{self.lora_config.lora_extra_vocab_size}."
|
||||
)
|
||||
return lora
|
||||
|
||||
def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool:
|
||||
|
||||
@@ -46,7 +46,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
DEFAULT_VOCAB_PADDING_SIZE,
|
||||
ParallelLMHead,
|
||||
VocabParallelEmbedding,
|
||||
)
|
||||
@@ -261,29 +260,16 @@ class GraniteModel(nn.Module):
|
||||
config = vllm_config.model_config.hf_config
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
|
||||
self.config = config
|
||||
self.quant_config = quant_config
|
||||
lora_vocab = (
|
||||
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
|
||||
if lora_config
|
||||
else 0
|
||||
)
|
||||
self.vocab_size = config.vocab_size + lora_vocab
|
||||
self.org_vocab_size = config.vocab_size
|
||||
|
||||
if get_pp_group().is_first_rank or (
|
||||
config.tie_word_embeddings and get_pp_group().is_last_rank
|
||||
):
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
self.vocab_size,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
padding_size=DEFAULT_VOCAB_PADDING_SIZE
|
||||
# We need bigger padding if using lora for kernel
|
||||
# compatibility
|
||||
if not lora_config
|
||||
else lora_config.lora_vocab_padding_size,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
else:
|
||||
@@ -420,28 +406,18 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
|
||||
self.config = config
|
||||
self.lora_config = lora_config
|
||||
|
||||
self.quant_config = quant_config
|
||||
|
||||
self.model = GraniteModel(
|
||||
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
|
||||
)
|
||||
if get_pp_group().is_last_rank:
|
||||
self.unpadded_vocab_size = config.vocab_size
|
||||
if lora_config:
|
||||
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
|
||||
self.lm_head = ParallelLMHead(
|
||||
self.unpadded_vocab_size,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
padding_size=DEFAULT_VOCAB_PADDING_SIZE
|
||||
# We need bigger padding if using lora for kernel
|
||||
# compatibility
|
||||
if not lora_config
|
||||
else lora_config.lora_vocab_padding_size,
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "lm_head"),
|
||||
)
|
||||
@@ -453,7 +429,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
logit_scale /= config.logits_scaling
|
||||
|
||||
self.logits_processor = LogitsProcessor(
|
||||
self.unpadded_vocab_size, config.vocab_size, scale=logit_scale
|
||||
config.vocab_size, scale=logit_scale
|
||||
)
|
||||
else:
|
||||
self.lm_head = PPMissingLayer()
|
||||
|
||||
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
DEFAULT_VOCAB_PADDING_SIZE,
|
||||
ParallelLMHead,
|
||||
VocabParallelEmbedding,
|
||||
)
|
||||
@@ -368,24 +367,18 @@ class LlamaModel(nn.Module):
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
|
||||
self.config = config
|
||||
self.quant_config = quant_config
|
||||
lora_vocab = (
|
||||
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
|
||||
if lora_config
|
||||
else 0
|
||||
)
|
||||
self.vocab_size = config.vocab_size + lora_vocab
|
||||
self.org_vocab_size = config.vocab_size
|
||||
|
||||
self.vocab_size = config.vocab_size
|
||||
|
||||
if get_pp_group().is_first_rank or (
|
||||
config.tie_word_embeddings and get_pp_group().is_last_rank
|
||||
):
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
self.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
else:
|
||||
@@ -562,9 +555,7 @@ class LlamaForCausalLM(
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
self.config = config
|
||||
self.lora_config = lora_config
|
||||
|
||||
self.model = self._init_model(
|
||||
vllm_config=vllm_config,
|
||||
@@ -573,20 +564,9 @@ class LlamaForCausalLM(
|
||||
)
|
||||
|
||||
if get_pp_group().is_last_rank:
|
||||
self.unpadded_vocab_size = config.vocab_size
|
||||
if lora_config:
|
||||
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
|
||||
self.lm_head = ParallelLMHead(
|
||||
self.unpadded_vocab_size,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
padding_size=(
|
||||
DEFAULT_VOCAB_PADDING_SIZE
|
||||
# We need bigger padding if using lora for kernel
|
||||
# compatibility
|
||||
if not lora_config
|
||||
else lora_config.lora_vocab_padding_size
|
||||
),
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "lm_head"),
|
||||
)
|
||||
@@ -595,7 +575,7 @@ class LlamaForCausalLM(
|
||||
|
||||
logit_scale = getattr(config, "logit_scale", 1.0)
|
||||
self.logits_processor = LogitsProcessor(
|
||||
self.unpadded_vocab_size, config.vocab_size, logit_scale
|
||||
config.vocab_size, scale=logit_scale
|
||||
)
|
||||
else:
|
||||
self.lm_head = PPMissingLayer()
|
||||
|
||||
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
DEFAULT_VOCAB_PADDING_SIZE,
|
||||
ParallelLMHead,
|
||||
VocabParallelEmbedding,
|
||||
)
|
||||
@@ -301,23 +300,18 @@ class MixtralModel(nn.Module):
|
||||
config = vllm_config.model_config.hf_config
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
|
||||
self.config = config
|
||||
self.quant_config = quant_config
|
||||
lora_vocab = (
|
||||
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
|
||||
if lora_config
|
||||
else 0
|
||||
)
|
||||
self.vocab_size = config.vocab_size + lora_vocab
|
||||
|
||||
self.vocab_size = config.vocab_size
|
||||
self.org_vocab_size = config.vocab_size
|
||||
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
self.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
)
|
||||
|
||||
self.enable_eplb = parallel_config.enable_eplb
|
||||
@@ -508,34 +502,24 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
lora_config = vllm_config.lora_config
|
||||
|
||||
self.config = config
|
||||
self.lora_config = lora_config
|
||||
|
||||
self.quant_config = quant_config
|
||||
|
||||
self.model = MixtralModel(
|
||||
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
|
||||
)
|
||||
self.unpadded_vocab_size = config.vocab_size
|
||||
if lora_config:
|
||||
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
|
||||
|
||||
self.lm_head = ParallelLMHead(
|
||||
self.unpadded_vocab_size,
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
org_num_embeddings=config.vocab_size,
|
||||
padding_size=DEFAULT_VOCAB_PADDING_SIZE
|
||||
# We need bigger padding if using lora for kernel
|
||||
# compatibility
|
||||
if not lora_config
|
||||
else lora_config.lora_vocab_padding_size,
|
||||
quant_config=quant_config,
|
||||
prefix=maybe_prefix(prefix, "lm_head"),
|
||||
)
|
||||
if self.config.tie_word_embeddings:
|
||||
self.lm_head.weight = self.model.embed_tokens.weight
|
||||
self.logits_processor = LogitsProcessor(
|
||||
self.unpadded_vocab_size, config.vocab_size
|
||||
)
|
||||
self.logits_processor = LogitsProcessor(config.vocab_size)
|
||||
self.make_empty_intermediate_tensors = (
|
||||
self.model.make_empty_intermediate_tensors
|
||||
)
|
||||
|
||||
@@ -74,5 +74,5 @@ class TeleFLMForCausalLM(LlamaForCausalLM):
|
||||
self.output_mult = self.config.output_mult / self.mup_scale_factor
|
||||
logit_scale = self.output_mult
|
||||
self.logits_processor = LogitsProcessor(
|
||||
self.unpadded_vocab_size, self.config.vocab_size, logit_scale
|
||||
self.config.vocab_size, scale=logit_scale
|
||||
)
|
||||
|
||||
@@ -219,9 +219,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.hidden_size = model_config.get_hidden_size()
|
||||
self.vocab_size = model_config.get_vocab_size()
|
||||
|
||||
if self.lora_config is not None:
|
||||
self.vocab_size += self.lora_config.lora_extra_vocab_size
|
||||
|
||||
# Multi-modal data support
|
||||
self.mm_registry = MULTIMODAL_REGISTRY
|
||||
self.uses_mrope = model_config.uses_mrope
|
||||
|
||||
Reference in New Issue
Block a user