[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

2024-08-18 17:57:20 -07:00
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
@@ -0,0 +1,27 @@
+from array import array
+from typing import Any, Type
+
+from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
+
+
+def encode_hook(obj: Any) -> Any:
+    """Custom msgspec enc hook that supports array types.
+
+    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
+    """
+    if isinstance(obj, array):
+        assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
+            f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
+            f"Given array has a type code of {obj.typecode}.")
+        return obj.tobytes()
+
+
+def decode_hook(type: Type, obj: Any) -> Any:
+    """Custom msgspec dec hook that supports array types.
+
+    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
+    """
+    if type is array:
+        deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
+        deserialized.frombytes(obj)
+        return deserialized
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -4,9 +4,12 @@ from collections import defaultdict
 from itertools import islice, repeat
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

+import msgspec
+
 import vllm.envs as envs
 from vllm.executor.distributed_gpu_executor import (  # yapf: disable
    DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.msgspec_utils import encode_hook
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
@@ -60,6 +63,10 @@ class RayGPUExecutor(DistributedGPUExecutor):
        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)

+        self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+        self.output_decoder = msgspec.msgpack.Decoder(
+            Optional[List[SamplerOutput]])
+
    def shutdown(self) -> None:
        if hasattr(self, "forward_dag") and self.forward_dag is not None:
            self.forward_dag.teardown()
@@ -123,6 +130,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
                ray_remote_kwargs)

        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
+
        # Create the workers.
        driver_ip = get_ip()
        worker_wrapper_kwargs = self._get_worker_wrapper_args()
@@ -304,8 +312,10 @@ class RayGPUExecutor(DistributedGPUExecutor):
        if self.forward_dag is None:
            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)

-        outputs = ray.get(self.forward_dag.execute(execute_model_req))
-        return outputs[0]
+        serialized_data = self.input_encoder.encode(execute_model_req)
+        outputs = ray.get(self.forward_dag.execute(serialized_data))
+        output = self.output_decoder.decode(outputs[0])
+        return output

    def _run_workers(
        self,
@@ -475,9 +485,10 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
        if self.forward_dag is None:
            self.forward_dag = self._compiled_ray_dag(enable_asyncio=True)

-        dag_future = await self.forward_dag.execute_async(execute_model_req)
+        serialized_data = self.input_encoder.encode(execute_model_req)
+        dag_future = await self.forward_dag.execute_async(serialized_data)
        outputs = await dag_future
-        return outputs[0]
+        return self.output_decoder.decode(outputs[0])

    async def _driver_execute_model_async(
        self,
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,6 +1,9 @@
 from typing import List, Optional, Tuple, Union

+import msgspec
+
 from vllm.config import ParallelConfig
+from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
@@ -24,6 +27,10 @@ try:
            # that thread.
            self.compiled_dag_cuda_device_set = False

+            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
+                                                         dec_hook=decode_hook)
+            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
+
        def get_node_ip(self) -> str:
            return get_ip()

@@ -33,16 +40,26 @@ try:
            return node_id, gpu_ids

        def execute_model_spmd(
-            self, req_or_tuple: Union[ExecuteModelRequest,
-                                      Tuple[ExecuteModelRequest,
-                                            IntermediateTensors]]):
+            self, req_or_tuple: Union[bytes,
+                                      Tuple[bytes,
+                                            Optional[IntermediateTensors]]]
+        ) -> bytes:
            """Execute model in SPMD fashion: used only when SPMD worker and
            compiled DAG are both enabled.

            Args:
-                req_or_tuple: The request to execute the model, or a tuple
-                    containing the request and intermediate tensors.
+                req_or_tuple: A request or a tuple containing the
+                    request and intermediate tensors. Intermediate tensors are
+                    None unless if it is provided because it is > 0 pipeline
+                    stage. The request is serialized by msgspec.
            """
+            if isinstance(req_or_tuple, bytes):
+                serialized_req, intermediate_tensors = req_or_tuple, None
+            else:
+                serialized_req, intermediate_tensors = req_or_tuple
+
+            execute_model_req = self.input_decoder.decode(serialized_req)
+
            # TODO(swang): This is needed right now because Ray aDAG executes
            # on a background thread, so we need to reset torch's current
            # device.
@@ -51,16 +68,14 @@ try:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

-            if isinstance(req_or_tuple, tuple):
-                execute_model_req, intermediate_tensors = req_or_tuple
-            else:
-                execute_model_req = req_or_tuple
-                intermediate_tensors = None
-
            output = self.worker._execute_model_spmd(execute_model_req,
                                                     intermediate_tensors)
+            # Pipeline model request and output to the next pipeline stage.
            if isinstance(output, IntermediateTensors):
-                return execute_model_req, output
+                output = serialized_req, output
+            else:
+                output = self.output_encoder.encode(output)
+
            return output

    ray_import_err = None