[Bugfix] Fix fully sharded LoRAs with Mixtral (#11390)
Signed-off-by: Jason Greene <jason.greene@redhat.com>
@@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
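
For readers outside the vLLM codebase, here is a minimal, self-contained sketch of what the removed decorator plausibly does and why dropping it matters for the replicated layer. The LoRAConfig field, the ReplicatedLinear stand-in, and the decorator body below are assumptions modeled on the names in the diff, not the project's actual implementation.

# Illustrative sketch only; names are modeled on the diff above, bodies are assumed.
from dataclasses import dataclass


@dataclass
class LoRAConfig:
    # Assumed field name mirroring the fully sharded LoRAs setting.
    fully_sharded_loras: bool = False


def _not_fully_sharded_can_replace(can_replace):
    """Assumed behavior of the decorator removed in this diff: veto the match
    when fully sharded LoRAs are enabled, so a fully sharded variant of the
    layer is selected instead."""

    def dec(cls, source_layer, lora_config, packed_modules_list, model_config):
        if getattr(lora_config, "fully_sharded_loras", False):
            return False
        return can_replace(cls, source_layer, lora_config,
                           packed_modules_list, model_config)

    return dec


class ReplicatedLinear:  # stand-in for vLLM's ReplicatedLinear
    pass


class ReplicatedLinearWithLoRA:
    # Without the decorator, the match holds even when fully sharded LoRAs
    # are enabled: there is no fully sharded replacement for a layer that is
    # already copied whole onto every GPU.
    @classmethod
    def can_replace_layer(cls, source_layer, lora_config,
                          packed_modules_list, model_config):
        return type(source_layer) is ReplicatedLinear


if __name__ == "__main__":
    cfg = LoRAConfig(fully_sharded_loras=True)
    gate = ReplicatedLinear()
    # With the fix, the replicated layer still matches under fully sharded LoRAs.
    assert ReplicatedLinearWithLoRA.can_replace_layer(gate, cfg, [], None)

With the decorator applied, enabling fully sharded LoRAs would have made can_replace_layer return False for ReplicatedLinear, and since no fully sharded variant of a replicated layer exists, the layer presumably lost its LoRA wrapper entirely; Mixtral, whose MoE gate is a ReplicatedLinear, is where that showed up.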