[Core] Refactor Worker and ModelRunner to consolidate control plane communication (#5408)

Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu>
Signed-off-by: Stephanie <swang@anyscale.com>
Co-authored-by: Stephanie <swang@anyscale.com>
Stephanie Wang
2024-06-25 20:30:03 -07:00
committed by GitHub
parent 82079729cc
commit dda4811591
29 changed files with 1106 additions and 573 deletions

View File

@@ -64,8 +64,8 @@ class DistributedGPUExecutor(GPUExecutor):
                           num_cpu_blocks=num_cpu_blocks)

     def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Optional[List[SamplerOutput]]:
         if self.parallel_worker_tasks is None:
             self.parallel_worker_tasks = self._run_workers(
                 "start_worker_execution_loop",
@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
         if self.parallel_worker_tasks is None:
             return

-        self._driver_execute_model()
+        self._driver_execute_model(execute_model_req=None)
         parallel_worker_tasks = self.parallel_worker_tasks
         self.parallel_worker_tasks = None
         # Ensure that workers exit model loop cleanly
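The two hunks above are the two ends of the same control-plane protocol: "start_worker_execution_loop" parks each remote worker in a loop that keeps executing broadcast steps, and _driver_execute_model(execute_model_req=None) is the signal that breaks the workers out of that loop. A toy, single-process sketch of such a loop, with a plain queue standing in for the real broadcast channel (illustrative only, not vLLM's implementation):

import queue
from typing import Optional


class ExecuteModelRequest:  # stand-in for vLLM's ExecuteModelRequest
    pass


def start_worker_execution_loop(
        requests: "queue.Queue[Optional[ExecuteModelRequest]]") -> int:
    """Keep executing steps until a None request (the stop signal) arrives."""
    steps = 0
    while True:
        req = requests.get()  # stand-in for the real control-plane broadcast
        if req is None:
            return steps      # the driver asked the loop to exit
        steps += 1            # a real worker would run one model step here


channel: "queue.Queue[Optional[ExecuteModelRequest]]" = queue.Queue()
for _ in range(3):
    channel.put(ExecuteModelRequest())
channel.put(None)  # what _driver_execute_model(execute_model_req=None) amounts to
assert start_worker_execution_loop(channel) == 3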
@@ -123,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
     @abstractmethod
     def _driver_execute_model(
-            self,
-            execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.

-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
+        Passing None will cause the driver to stop the model execution loop
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
         """
         raise NotImplementedError
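The updated docstring pins down the new contract: a non-None request runs one model step and returns its output, while None is purely a stop signal and yields None. A minimal, self-contained sketch of that contract using toy stand-in classes (not vLLM's real implementation):

from typing import List, Optional


class ExecuteModelRequest:  # stand-in for vLLM's ExecuteModelRequest
    pass


class SamplerOutput:  # stand-in for vLLM's per-step sampler output
    pass


class ToyDriver:
    """Illustrates the new contract: a None request means stop, not run."""

    def _driver_execute_model(
        self, execute_model_req: Optional[ExecuteModelRequest]
    ) -> Optional[List[SamplerOutput]]:
        if execute_model_req is None:
            # Control-plane stop signal: no model step runs, so there is
            # no output to return.
            return None
        # A real executor would broadcast the request and run one step on
        # the driver worker; the toy just fabricates a single output.
        return [SamplerOutput()]


driver = ToyDriver()
assert driver._driver_execute_model(ExecuteModelRequest()) is not None
assert driver._driver_execute_model(None) is None  # stop signal -> no output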

View File

@@ -69,8 +69,8 @@ class ExecutorBase(ABC):
     @abstractmethod
     def execute_model(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        self, execute_model_req: ExecuteModelRequest
+    ) -> Optional[List[SamplerOutput]]:
         """Executes at least one model step on the given sequences."""
         raise NotImplementedError

View File

@@ -87,7 +87,7 @@ class GPUExecutor(ExecutorBase):
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> List[Union[SamplerOutput, PoolerOutput]]:
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
         output = self.driver_worker.execute_model(execute_model_req)
         return output
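On the single-GPU path the widened return type keeps the Union with PoolerOutput (the output type used by embedding models), so downstream code now has to tolerate both a missing result and mixed output types. A hedged caller-side sketch under those assumptions (stand-in classes, not the engine's real code):

from typing import List, Optional, Union


class SamplerOutput:  # stand-in for the generation output type
    pass


class PoolerOutput:  # stand-in for the embedding-model output type
    pass


def count_outputs(
        output: Optional[List[Union[SamplerOutput, PoolerOutput]]]) -> int:
    # None means no model step ran (the call only carried a control-plane
    # message), which is distinct from a step that produced an empty list.
    if output is None:
        return 0
    return len(output)


assert count_outputs(None) == 0
assert count_outputs([SamplerOutput(), PoolerOutput()]) == 2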

View File

@@ -78,16 +78,14 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
             worker_monitor.close()

     def _driver_execute_model(
-            self,
-            execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.

         Passing None will cause the driver to stop the model execution
         loop running in each of the remote workers.
         """
-        return self.driver_worker.execute_model(
-            execute_model_req=execute_model_req)
+        return self.driver_worker.execute_model(execute_model_req)

     def _run_workers(
         self,

View File

@@ -55,8 +55,7 @@ class NeuronExecutor(ExecutorBase):
         assert execute_model_req.num_lookahead_slots == 0, (
             "lookahead not supported for Neuron backend.")

-        output = self.driver_worker.execute_model(
-            execute_model_req.seq_group_metadata_list)
+        output = self.driver_worker.execute_model(execute_model_req)
         return output

     def add_lora(self, lora_request: LoRARequest) -> bool:
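Before this change the Neuron executor unpacked the request itself and handed only seq_group_metadata_list to the driver worker; now the worker receives the whole ExecuteModelRequest, matching the other backends. A hedged sketch of the worker-side unpacking, using a simplified stand-in dataclass rather than vLLM's real class:

from dataclasses import dataclass, field
from typing import Any, List


@dataclass
class ExecuteModelRequest:  # stand-in; vLLM's class carries more fields
    seq_group_metadata_list: List[Any] = field(default_factory=list)
    num_lookahead_slots: int = 0


def worker_execute_model(execute_model_req: ExecuteModelRequest) -> List[str]:
    # The worker, not the executor, now unpacks the request and pulls the
    # scheduling metadata out of it before running the step.
    metadata = execute_model_req.seq_group_metadata_list
    return [f"ran one step over {len(metadata)} sequence groups"]


req = ExecuteModelRequest(seq_group_metadata_list=[object(), object()])
print(worker_execute_model(req))  # ['ran one step over 2 sequence groups']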

View File

@@ -190,9 +190,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
                           max_parallel_loading_workers)

     def _driver_execute_model(
-            self,
-            execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
+        self, execute_model_req: Optional[ExecuteModelRequest]
+    ) -> Optional[List[SamplerOutput]]:
         """Run execute_model in the driver worker.

         Passing None will cause the driver to stop the model execution