Add pipeline parallel support to TransformersModel (#12832)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Harry Mellor
2025-03-25 02:41:45 +00:00
committed by GitHub
parent 911c8eb000
commit 97cfa65df7
4 changed files with 244 additions and 87 deletions

View File

@@ -472,6 +472,16 @@ class PPMissingLayer(torch.nn.Identity):
def __init__(self, *args, **kwargs):
    """Initialize the placeholder layer.

    Accepts (and forwards nothing from) arbitrary args so it can stand
    in for any missing layer; only the ``return_tuple`` keyword is read.
    """
    # Whether forward() should wrap its passthrough value in a 1-tuple.
    want_tuple = kwargs.get("return_tuple", False)
    super().__init__()
    self.return_tuple = want_tuple
def forward(self, *args, **kwargs):
    """Pass the first input through unchanged.

    Returns the first positional argument, or — when no positional
    arguments are given — the first keyword-argument value. The result
    is wrapped in a 1-tuple when ``self.return_tuple`` is set, so
    callers that expect a tuple of outputs keep working.

    Raises:
        StopIteration: if called with no arguments at all (empty
            ``args`` and ``kwargs``) — presumably never happens in
            practice; confirm against callers.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    value = args[0] if args else next(iter(kwargs.values()))
    return (value,) if self.return_tuple else value
_CPU_OFFLOAD_BYTES = 0
@@ -650,4 +660,4 @@ def cast_overflow_tensors(
if tensors.isinf().any() or tensors.isnan().any():
clamp_value = torch.finfo(tensors.dtype).max - offset
tensors = torch.clamp(tensors, min=-clamp_value, max=clamp_value)
return tensors
return tensors