diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py
index 711d514a3..f4269750f 100644
--- a/tests/lora/test_gptoss_tp.py
+++ b/tests/lora/test_gptoss_tp.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest

@@ -84,14 +86,17 @@ def test_gpt_oss_lora(gptoss20b_lora_files):

 @multi_gpu_test(num_gpus=2)
-def test_gpt_oss_lora_tp2(gptoss20b_lora_files):
+@pytest.mark.parametrize("fully_sharded_loras", [False, True])
+def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
     llm = vllm.LLM(
         MODEL_PATH,
         max_model_len=1024,
         enable_lora=True,
         max_loras=2,
         max_lora_rank=8,
+        max_num_seqs=16,
         tensor_parallel_size=2,
+        fully_sharded_loras=fully_sharded_loras,
         compilation_config=vllm.config.CompilationConfig(
             # Avoid OOM
             cudagraph_specialize_lora=False,
         ),
diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py
index 8a4f5ff17..25364a588 100644
--- a/vllm/lora/layers/__init__.py
+++ b/vllm/lora/layers/__init__.py
@@ -11,7 +11,7 @@ from vllm.lora.layers.column_parallel_linear import (
     QKVParallelLinearWithLoRA,
     QKVParallelLinearWithShardedLoRA,
 )
-from vllm.lora.layers.fused_moe import FusedMoEWithLoRA
+from vllm.lora.layers.fused_moe import FusedMoE3DWithLoRA, FusedMoEWithLoRA
 from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA
 from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
 from vllm.lora.layers.row_parallel_linear import (
@@ -38,4 +38,5 @@ __all__ = [
     "ReplicatedLinearWithLoRA",
     "LoRAMapping",
     "FusedMoEWithLoRA",
+    "FusedMoE3DWithLoRA",
 ]
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
index 62326c05b..3bfb88c00 100644
--- a/vllm/lora/layers/base.py
+++ b/vllm/lora/layers/base.py
@@ -42,8 +42,8 @@ class BaseLayerWithLoRA(nn.Module):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         """Overwrites lora tensors at index."""
         ...
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index e85c5bd70..06ecc8d2f 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -94,13 +94,15 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         # Except for QKVParallelLinearWithLoRA and
         # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
         # store weights in a tuple of size 1. These two layers will
         # override this function.
+        assert isinstance(lora_a, torch.Tensor)
+        assert isinstance(lora_b, torch.Tensor)
         assert (
             len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
         )
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 273c4950e..3e21d426c 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -246,8 +246,8 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         self.reset_lora(index)

diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py
index 5aeaca8de..0eb6562be 100644
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -42,7 +42,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
         self.device = base_layer.w2_weight.device
-        self.w13_slices = 2
+        self._w13_slices = 2
         self._inject_lora_into_fused_moe()

     def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
@@ -160,7 +160,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 op_prefix="w13",
                 num_loras=self.max_loras,
                 rank=max_lora_rank,
-                num_slices=self.w13_slices,
+                num_slices=self._w13_slices,
                 M=M,
                 layer=layer,
                 top_k=top_k,
@@ -230,7 +230,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
             num_tokens = hidden_states.size(0)
             M = min(num_tokens, CHUNK_SIZE)
-            max_lora_rank = self.w2_lora_a_stacked.shape[-2]
+            max_lora_rank = self.w2_lora_a_stacked[0].shape[-2]
             shrink_config, expand_config = self._get_lora_moe_configs(
                 op_prefix="w2",
                 num_loras=self.max_loras,
@@ -258,8 +258,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             self.punica_wrapper.add_lora_fused_moe(
                 intermediate_cache3,
                 intermediate_cache2,
-                (self.w2_lora_a_stacked,),
-                (self.w2_lora_b_stacked,),
+                self.w2_lora_a_stacked,
+                self.w2_lora_b_stacked,
                 topk_weights,
                 sorted_token_ids_lora,
                 expert_ids_lora,
@@ -292,22 +292,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
             self.base_layer.quant_method, m_fused_moe_fn
         )

-    def create_lora_weights(
+    def _create_lora_a_weights(
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: PretrainedConfig | None = None,
-    ) -> None:
-        """Initializes lora matrices."""
-        assert self.w13_slices == 2
-        self.max_loras = lora_config.max_loras
-        self.fully_sharded = lora_config.fully_sharded_loras
-
-        self.adapter_enabled = torch.tensor(
-            [0] * (max_loras + 1), dtype=torch.int, device=self.device
-        )
-
-        self.w13_lora_a_stacked = tuple(
+    ):
+        self.w13_lora_a_stacked: tuple[torch.Tensor, ...] = tuple(
             torch.zeros(
                 (
                     max_loras,
@@ -320,10 +310,23 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 dtype=lora_config.lora_dtype,
                 device=self.device,
             )
-            for _ in range(self.w13_slices)
+            for _ in range(self._w13_slices)
+        )
+        self.w2_lora_a_stacked: tuple[torch.Tensor, ...] = (
+            torch.zeros(
+                (
+                    max_loras,
+                    self.base_layer.local_num_experts,
+                    lora_config.max_lora_rank,
+                    self.base_layer.intermediate_size_per_partition,
+                ),
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ),
         )

-        self.w13_lora_b_stacked = tuple(
+    def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig):
+        self.w13_lora_b_stacked: tuple[torch.Tensor, ...] = tuple(
             torch.zeros(
                 (
                     max_loras,
@@ -334,34 +337,42 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 dtype=lora_config.lora_dtype,
                 device=self.device,
             )
-            for _ in range(self.w13_slices)
+            for _ in range(self._w13_slices)
+        )
+        self.w2_lora_b_stacked: tuple[torch.Tensor, ...] = (
+            torch.zeros(
+                (
+                    max_loras,
+                    self.base_layer.local_num_experts,
+                    self.base_layer.hidden_size
+                    if not self.fully_sharded
+                    else divide(self.base_layer.hidden_size, self.tp_size),
+                    lora_config.max_lora_rank,
+                ),
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ),
         )
-        self.w2_lora_a_stacked = torch.zeros(
-            (
-                max_loras,
-                self.base_layer.local_num_experts,
-                lora_config.max_lora_rank,
-                self.base_layer.intermediate_size_per_partition,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        self.w2_lora_b_stacked = torch.zeros(
-            (
-                max_loras,
-                self.base_layer.local_num_experts,
-                self.base_layer.hidden_size
-                if not self.fully_sharded
-                else divide(self.base_layer.hidden_size, self.tp_size),
-                lora_config.max_lora_rank,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: PretrainedConfig | None = None,
+    ) -> None:
+        """Initializes lora matrices."""
+        self.max_loras = lora_config.max_loras
+        self.fully_sharded = lora_config.fully_sharded_loras
+
+        self.adapter_enabled = torch.tensor(
+            [0] * (max_loras + 1), dtype=torch.int, device=self.device
         )
+        self._create_lora_a_weights(max_loras, lora_config)
+        self._create_lora_b_weights(max_loras, lora_config)

         # They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
         # to create a dummy LoRA weights.
+        # TODO Optimize this section
         self.lora_a_stacked = []
         self.lora_b_stacked = []
         for lora_id in range(max_loras):
@@ -370,36 +381,43 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 self.lora_a_stacked.append(
                     self.w13_lora_a_stacked[0][lora_id][experts_id]
                 )
-                self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id])
                 self.lora_a_stacked.append(
-                    self.w13_lora_a_stacked[1][lora_id][experts_id]
+                    self.w2_lora_a_stacked[0][lora_id][experts_id]
                 )
                 self.lora_b_stacked.append(
                     self.w13_lora_b_stacked[0][lora_id][experts_id]
                 )
-                self.lora_b_stacked.append(self.w2_lora_b_stacked[lora_id][experts_id])
+                self.lora_b_stacked.append(
+                    self.w2_lora_b_stacked[0][lora_id][experts_id]
+                )
+
+                self.lora_a_stacked.append(
+                    self.w13_lora_a_stacked[1][lora_id][experts_id]
+                )
                 self.lora_b_stacked.append(
                     self.w13_lora_b_stacked[1][lora_id][experts_id]
                 )

     def reset_lora(self, index: int):
         """Resets the lora weights at index back to 0."""
-        for pos in range(self.w13_slices):
+        for pos in range(self._w13_slices):
             self.w13_lora_a_stacked[pos][index] = 0
             self.w13_lora_b_stacked[pos][index] = 0
-        self.w2_lora_a_stacked[index] = 0
-        self.w2_lora_b_stacked[index] = 0
+        self.w2_lora_a_stacked[0][index] = 0
+        self.w2_lora_b_stacked[0][index] = 0
         self.adapter_enabled[index] = 0

     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         """Overwrites lora tensors at index."""
+        assert isinstance(lora_a, list)
+        assert isinstance(lora_b, list)
         self.reset_lora(index)
         self.adapter_enabled[index] = 1
         for eid in range(len(lora_a) // 3):
@@ -432,7 +450,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :]
                 w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :]

-            w2_shard_size = self.w2_lora_b_stacked[index, eid].shape[0]
+            w2_shard_size = self.w2_lora_b_stacked[0][index, eid].shape[0]
             w2_start_idx = self.tp_rank * w2_shard_size
             w2_end_idx = (self.tp_rank + 1) * w2_shard_size
             w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :]
@@ -454,26 +472,14 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                 index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1]
             ].copy_(w3_lora_b, non_blocking=True)

-            self.w2_lora_a_stacked[
+            self.w2_lora_a_stacked[0][
                 index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1]
             ].copy_(w2_lora_a, non_blocking=True)
-            self.w2_lora_b_stacked[
+            self.w2_lora_b_stacked[0][
                 index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1]
             ].copy_(w2_lora_b, non_blocking=True)

-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: PretrainedConfig | None,
-    ) -> bool:
-        """Returns True if the layer can be replaced by this LoRA layer."""
-        # return type(source_layer) is FusedMoE
-        return isinstance(source_layer, FusedMoE)
-
     def forward(self, *args, **kwargs):
         return self.base_layer.forward(*args, **kwargs)

@@ -491,3 +497,220 @@
     @property
     def is_internal_router(self) -> bool:
         return self.base_layer.is_internal_router
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        """Returns True if the layer can be replaced by this LoRA layer."""
+        # return type(source_layer) is FusedMoE
+
+        return type(source_layer) is FusedMoE and len(packed_modules_list) == 2
+
+
+class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
+    def __init__(self, base_layer):
+        super().__init__(base_layer)
+        self._w13_slices = 1
+
+    def _create_lora_b_weights(self, max_loras, lora_config):
+        self.w13_lora_b_stacked: tuple[torch.Tensor] = tuple(
+            torch.zeros(
+                (
+                    max_loras,
+                    self.base_layer.local_num_experts,
+                    self.base_layer.intermediate_size_per_partition * 2,
+                    lora_config.max_lora_rank,
+                ),
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            )
+            for _ in range(self._w13_slices)
+        )
+        self.w2_lora_b_stacked: tuple[torch.Tensor] = (
+            torch.zeros(
+                (
+                    max_loras,
+                    self.base_layer.local_num_experts,
+                    self.base_layer.hidden_size
+                    if not self.fully_sharded
+                    else divide(self.base_layer.hidden_size, self.tp_size),
+                    lora_config.max_lora_rank,
+                ),
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ),
+        )
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: PretrainedConfig | None = None,
+    ) -> None:
+        """Initializes lora matrices."""
+        self.max_loras = lora_config.max_loras
+        self.fully_sharded = lora_config.fully_sharded_loras
+
+        self.adapter_enabled = torch.tensor(
+            [0] * (max_loras + 1), dtype=torch.int, device=self.device
+        )
+
+        self._create_lora_a_weights(max_loras, lora_config)
+        self._create_lora_b_weights(max_loras, lora_config)
+
+    def _slice_w13_a(self, w13_lora_a: torch.Tensor) -> torch.Tensor:
+        if self.tp_size == 1 or not self.fully_sharded:
+            return w13_lora_a
+
+        # w13_lora_a shape (num_experts,rank,input_size)
+        current_lora_rank = w13_lora_a.shape[1]
+        assert current_lora_rank % self.tp_size == 0
+
+        sliced_rank = current_lora_rank // self.tp_size
+        start_idx = self.tp_rank * sliced_rank
+        end_idx = (self.tp_rank + 1) * sliced_rank
+        return w13_lora_a[:, start_idx:end_idx, :]
+
+    def _slice_w13_b(self, w13_lora_b: torch.Tensor, is_interleave: bool = True):
+        if self.tp_size == 1:
+            return w13_lora_b
+
+        # w13_lora_b shape (num_experts,output_size,rank)
+        shard_size = self.base_layer.intermediate_size_per_partition
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        if is_interleave:
+            # For models like GPT-OSS, the weights of w1 (gate_proj) and w3 (up_proj)
+            # are stored in interleaved order, so the LoRA weights are sliced to match.
+            w1_lora_b = w13_lora_b[:, ::2, :]
+            w3_lora_b = w13_lora_b[:, 1::2, :]
+            sliced_w1_lora_b = w1_lora_b[:, start_idx:end_idx, :]
+            sliced_w3_lora_b = w3_lora_b[:, start_idx:end_idx, :]

+            return torch.stack([sliced_w1_lora_b, sliced_w3_lora_b], dim=2).flatten(
+                1, 2
+            )
+        else:
+            slice_size = w13_lora_b.shape[1] // 2
+            w1_lora_b = w13_lora_b[:, :slice_size, :]
+            w3_lora_b = w13_lora_b[:, slice_size:, :]
+            sliced_w1_lora_b = w1_lora_b[:, start_idx:end_idx, :]
+            sliced_w3_lora_b = w3_lora_b[:, start_idx:end_idx, :]
+
+            return torch.cat([sliced_w1_lora_b, sliced_w3_lora_b], dim=1)
+
+    def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor:
+        if self.tp_size == 1:
+            return w2_lora_a
+        # w2_lora_a shape (num_experts,rank,input_size)
+        shard_size = self.base_layer.intermediate_size_per_partition
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+
+        return w2_lora_a[:, :, start_idx:end_idx]
+
+    def _slice_w2_b(self, w2_lora_b: torch.Tensor) -> torch.Tensor:
+        if self.tp_size == 1 or not self.fully_sharded:
+            return w2_lora_b
+        # Based on S-LoRA, we slice W2 B along the hidden_size dim.
+        # w2_lora_b shape (num_experts,output_size,rank)
+        current_lora_size = w2_lora_b.shape[1]
+
+        sliced_size = current_lora_size // self.tp_size
+        start_idx = self.tp_rank * sliced_size
+        end_idx = (self.tp_rank + 1) * sliced_size
+        return w2_lora_b[:, start_idx:end_idx, :]
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
+    ):
+        """Overwrites lora tensors at index."""
+        # Make mypy happy
+        assert isinstance(lora_a, list)
+        assert isinstance(lora_b, list)
+        assert len(lora_a) == len(lora_b) == 2
+
+        self.reset_lora(index)
+        self.adapter_enabled[index] = 1
+
+        num_experts = self.w13_lora_a_stacked[0].shape[1]
+        w13_lora_a, w2_lora_a = lora_a
+        w13_lora_b, w2_lora_b = lora_b
+
+        # (num_experts,rank,input_size)
+        w13_lora_a = w13_lora_a.reshape(num_experts, -1, w13_lora_a.shape[-1])
+        w2_lora_a = w2_lora_a.reshape(num_experts, -1, w2_lora_a.shape[-1])
+        # (output_size,num_experts,rank)
+        w13_lora_b = w13_lora_b.reshape(w13_lora_b.shape[0], num_experts, -1)
+        w2_lora_b = w2_lora_b.reshape(w2_lora_b.shape[0], num_experts, -1)
+        # (num_experts,output_size,rank)
+        w13_lora_b = w13_lora_b.permute(1, 0, 2)
+        w2_lora_b = w2_lora_b.permute(1, 0, 2)
+
+        sliced_w13_lora_a = self._slice_w13_a(w13_lora_a)
+        sliced_w13_lora_b = self._slice_w13_b(w13_lora_b, is_interleave=True)
+
+        sliced_w2_lora_a = self._slice_w2_a(w2_lora_a)
+        sliced_w2_lora_b = self._slice_w2_b(w2_lora_b)
+
+        self.w13_lora_a_stacked[0][
+            index, :, : sliced_w13_lora_a.shape[1], : sliced_w13_lora_a.shape[2]
+        ].copy_(sliced_w13_lora_a, non_blocking=True)
+        self.w2_lora_a_stacked[0][
+            index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
+        ].copy_(sliced_w2_lora_a, non_blocking=True)
+
+        self.w13_lora_b_stacked[0][
+            index, :, : sliced_w13_lora_b.shape[1], : sliced_w13_lora_b.shape[2]
+        ].copy_(sliced_w13_lora_b, non_blocking=True)
+        self.w2_lora_b_stacked[0][
+            index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
+        ].copy_(sliced_w2_lora_b, non_blocking=True)
+
+    @property
+    def w13_input_size(self):
+        """
+        Full size
+        """
+        return self.w13_lora_a_stacked[0].shape[-1]
+
+    @property
+    def w13_output_size(self):
+        """
+        Full size
+        """
+        return self.w13_lora_b_stacked[0].shape[-2] * self.tp_size
+
+    @property
+    def w2_input_size(self):
+        """
+        Full size
+        """
+        return self.w2_lora_a_stacked[0].shape[-1] * self.tp_size
+
+    @property
+    def w2_output_size(self):
+        """
+        Full size
+        """
+        return self.w2_lora_a_stacked[0].shape[-2]
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        """Returns True if the layer can be replaced by this LoRA layer."""
+
+        return type(source_layer) is FusedMoE and len(packed_modules_list) == 1
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index 06f926520..c01984db4 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -128,9 +128,11 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
+        assert isinstance(lora_a, torch.Tensor)
+        assert isinstance(lora_b, torch.Tensor)
         self.reset_lora(index)
         self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
             lora_a, non_blocking=True
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index 5b1f7886b..c87ca9e24 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -77,12 +77,15 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
+        assert isinstance(lora_a, torch.Tensor)
+        assert isinstance(lora_b, torch.Tensor)
         self.reset_lora(index)
         # NOTE self.lora_a_stacked is row-major, and lora_a is col-major,
         # so we need transpose here
+
         self.lora_a_stacked[index, : lora_a.shape[1], : lora_a.shape[0]].copy_(
             lora_a.T, non_blocking=True
         )
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index eb11cd0af..636f062fe 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -22,11 +22,13 @@ from vllm.lora.utils import (
     from_layer_logits_processor,
     get_supported_lora_modules,
     is_base_embeddding_weights,
+    is_moe_model,
     is_regex_target_modules,
     parse_fine_tuned_lora_name,
     process_packed_modules_mapping,
     replace_submodule,
 )
+from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
@@ -356,7 +358,11 @@ class LoRAModelManager:
         self.modules: dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a set for compatibility with LRUCache.
         self._last_mapping: LoRAMapping | None = None
+        self._is_3d_moe_model = is_moe_model(self.model) and hasattr(
+            self.model, "is_3d_moe_weight"
+        )
         self._create_lora_modules()
+        self.model.lora_manager = self

     def __len__(self) -> int:
@@ -400,22 +406,36 @@ class LoRAModelManager:
         self.lora_index_to_id[index] = lora_model.id
         for module_name, module in self.modules.items():
             module_lora = self._get_lora_layer_weights(lora_model, module_name)
-            if module_lora:
-                # Note (gnovack) - If MOE lora weights are not split into
-                # num_experts chunks, we split them here
-                if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor(
-                    module_lora.lora_a
-                ):
-                    # Handle FSDP file format where experts.base_layer is the
-                    # gate_up_proj and experts is the down_proj
-                    gate_up_proj_lora = self._get_lora_layer_weights(
-                        lora_model, module_name + ".base_layer"
-                    )
-
-                    assert gate_up_proj_lora is not None
-                    assert module_lora is not None
-
-                    down_proj_lora = module_lora
+            if not module_lora:
+                module.reset_lora(index)
+                continue
+            # Note (gnovack) - If MOE lora weights are not split into
+            # num_experts chunks, we split them here
+            if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor(
+                module_lora.lora_a
+            ):
+                # Handle PEFT file format where experts.base_layer is the
+                # gate_up_proj and experts is the down_proj
+                gate_up_proj_lora = self._get_lora_layer_weights(
+                    lora_model, module_name + ".base_layer"
+                )
+                down_proj_lora = module_lora
+                # FIXME Edge case where LoRA is not added to gate_up_proj
+                # or down_proj
+                assert gate_up_proj_lora is not None
+                assert down_proj_lora is not None
+                if self._is_3d_moe_model:
+                    module_lora.lora_a = [
+                        gate_up_proj_lora.lora_a,
+                        down_proj_lora.lora_a,
+                    ]
+                    module_lora.lora_b = [
+                        gate_up_proj_lora.lora_b,
+                        down_proj_lora.lora_b,
+                    ]
+                else:
+                    # Some 3D MoE models haven't added the `is_3d_moe_weight`
+                    # attribute yet, so fallback here

                     num_experts = module_lora.lora_a.shape[0] // module_lora.rank
                     gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
@@ -444,14 +464,12 @@ class LoRAModelManager:

                     module_lora.lora_a = lora_a
                     module_lora.lora_b = lora_b
+            module.set_lora(
+                index,
+                module_lora.lora_a,
+                module_lora.lora_b,
+            )

-                module.set_lora(
-                    index,
-                    module_lora.lora_a,
-                    module_lora.lora_b,
-                )
-            else:
-                module.reset_lora(index)
         return True

     def _deactivate_adapter(self, lora_id: int):
@@ -512,6 +530,13 @@ class LoRAModelManager:
                 continue
             parts = module_name.split(".")[-1]
            packed_moduled_lst = self.packed_modules_mapping.get(parts, [])
+            if isinstance(module, FusedMoE):
+                # packed_moduled_lst is used here just to determine whether to
+                # instantiate FusedMoE3DWithLoRA or FusedMoEWithLoRA, and the
+                # difference between these two LoRA layers is whether the
+                # LoRA weights of w1 and w3 have already been fused on disk.
+
+                packed_moduled_lst = ["w13"] if self._is_3d_moe_model else ["w1", "w3"]
             new_module = replace_submodule(
                 self.model,
                 module_name,
@@ -560,6 +585,7 @@ class LoRAModelManager:
             self._register_packed_modules(module_name)
             # All lora layers share the same punica_wrapper based on reference.
             new_module.set_mapping(self.punica_wrapper)
+            pass

     def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
         assert isinstance(module, BaseLayerWithLoRA), (
@@ -605,6 +631,30 @@ class LoRAModelManager:
                         module.lora_a_stacked[0].dtype,
                         "cpu",
                     )
+                    model.loras[module_name] = lora
+                elif module.__class__.__name__ == "FusedMoE3DWithLoRA":
+                    # Case for 3D moe model
+                    # w2
+                    lora = LoRALayerWeights.create_dummy_lora_weights(
+                        module_name,
+                        module.w2_input_size,
+                        module.w2_output_size,
+                        rank * module.w2_lora_a_stacked[0].shape[1],  # rank*num_experts
+                        module.w2_lora_a_stacked[0].dtype,
+                        "cpu",
+                    )
+                    model.loras[module_name] = lora
+                    # w13
+                    lora = LoRALayerWeights.create_dummy_lora_weights(
+                        module_name,
+                        module.w13_input_size,
+                        module.w13_output_size,
+                        rank
+                        * module.w13_lora_a_stacked[0].shape[1],  # rank*num_experts
+                        module.w13_lora_a_stacked[0].dtype,
+                        "cpu",
+                    )
+                    model.loras[module_name + ".base_layer"] = lora
                 else:
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name,
@@ -614,6 +664,7 @@ class LoRAModelManager:
                         module.lora_a_stacked[0].dtype,
                         "cpu",
                     )
+                    model.loras[module_name] = lora
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
@@ -629,7 +680,7 @@ class LoRAModelManager:
                    )
                    subloras.append(lora)
                lora = PackedLoRALayerWeights.pack(subloras)
-            model.loras[module_name] = lora
+                model.loras[module_name] = lora
         return model

     def _match_target_modules(self, module_name: str):
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index a49a7d9d1..12524994d 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -23,6 +23,7 @@ from vllm.lora.layers import (
     BaseLayerWithLoRA,
     ColumnParallelLinearWithLoRA,
     ColumnParallelLinearWithShardedLoRA,
+    FusedMoE3DWithLoRA,
     FusedMoEWithLoRA,
     LogitsProcessorWithLoRA,
     MergedColumnParallelLinearWithLoRA,
@@ -62,6 +63,7 @@ _all_lora_classes: set[type[BaseLayerWithLoRA]] = {
     MergedQKVParallelLinearWithShardedLoRA,
     RowParallelLinearWithShardedLoRA,
     FusedMoEWithLoRA,
+    FusedMoE3DWithLoRA,
 }

@@ -288,10 +290,12 @@ def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]:
        # the expert indices are expanded based on the configured number
        # of routed experts.
        packed_modules_mapping = get_packed_modules_mapping(model)
-
-        packed_modules_mapping["experts"] = [
-            weight_name.rstrip(".") for _, weight_name, _, _ in moe_packed_mapping
-        ]
+        if not hasattr(model, "is_3d_moe_weight"):
+            # 3D MoE LoRA does not need `packed_modules_mapping`
+            packed_modules_mapping["experts"] = [
+                weight_name.rstrip(".")
+                for _, weight_name, _, _ in moe_packed_mapping
+            ]

        return packed_modules_mapping
    else:
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 8835acb8e..1bc0ad387 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -656,6 +656,7 @@ class GptOssModel(nn.Module):


 class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA):
+    is_3d_moe_weight: bool = True
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

     hf_to_vllm_mapper = WeightsMapper(
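
For reference, the per-rank slicing that FusedMoE3DWithLoRA._slice_w13_b applies to the fused, interleaved gate/up LoRA-B weights can be summarized by the standalone sketch below; the helper and argument names are illustrative only and are not part of vLLM's API.

import torch

def slice_interleaved_w13_b(
    w13_lora_b: torch.Tensor,  # (num_experts, 2 * intermediate_size, rank)
    tp_rank: int,
    tp_size: int,
    intermediate_size_per_partition: int,
) -> torch.Tensor:
    # Rows of w1 (gate_proj) and w3 (up_proj) are interleaved along dim 1.
    if tp_size == 1:
        return w13_lora_b
    start = tp_rank * intermediate_size_per_partition
    end = (tp_rank + 1) * intermediate_size_per_partition
    w1_shard = w13_lora_b[:, ::2, :][:, start:end, :]   # even rows: gate_proj
    w3_shard = w13_lora_b[:, 1::2, :][:, start:end, :]  # odd rows: up_proj
    # Re-interleave the two shards so the layout matches the sharded base w13 weight.
    return torch.stack([w1_shard, w3_shard], dim=2).flatten(1, 2)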