[Bugfix] Fix fully sharded LoRAs with Mixtral (#11390)
Signed-off-by: Jason Greene <jason.greene@redhat.com>
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 
 
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
     """This LoRA model has all supported Mixtral target modules"""
 
     if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
         max_loras=4,
         distributed_executor_backend="ray",
         tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
         max_lora_rank=32,
     )
 
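For context, a minimal sketch of how the fully sharded LoRA path exercised by this test is reached from user code. The model name, prompt, and adapter path below are placeholders, not values taken from this change:

    import vllm
    from vllm.lora.request import LoRARequest

    # Enable LoRA support and the fully sharded LoRA kernels on a
    # tensor-parallel Mixtral deployment (placeholder model name).
    llm = vllm.LLM(
        "mistralai/Mixtral-8x7B-v0.1",
        enable_lora=True,
        max_loras=4,
        max_lora_rank=32,
        tensor_parallel_size=4,
        fully_sharded_loras=True,  # the setting this bugfix covers
    )

    # Attach a LoRA adapter per request (placeholder adapter path).
    outputs = llm.generate(
        ["The capital of France is"],
        vllm.SamplingParams(temperature=0, max_tokens=32),
        lora_request=LoRARequest("mixtral-lora", 1, "/path/to/lora"),
    )
    print(outputs[0].outputs[0].text)

The new fully_shard parametrization runs the existing all-target-modules test both with and without this setting.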
@@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
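Why removing the decorator fixes the bug: _not_fully_sharded_can_replace gates can_replace_layer() so that, when fully sharded LoRAs are enabled, the non-sharded LoRA wrapper declines the replacement and a fully sharded variant of the layer is chosen instead. ReplicatedLinear has no fully sharded variant, so with the decorator in place the layer was never wrapped at all when fully_sharded_loras=True. A simplified sketch of that gating logic (an approximation, not the exact vLLM implementation):

    # Approximate shape of the gating decorator; the real vLLM code may
    # differ in detail.
    def _not_fully_sharded_can_replace(can_replace):
        def dec(*args, **kwargs):
            # Decline the replacement whenever fully sharded LoRAs are on,
            # so a fully sharded variant of the layer is used instead.
            not_fully_sharded = not kwargs["lora_config"].fully_sharded_loras
            return can_replace(*args, **kwargs) and not_fully_sharded
        return dec

Because ReplicatedLinear is replicated on every GPU, there is nothing to shard; dropping the decorator lets ReplicatedLinearWithLoRA replace the layer in both modes, as the new comment in the diff states.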