[Core] Add MultiprocessingGPUExecutor (#4539)

Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com>
2024-05-14 10:38:59 -07:00
parent dc72402b57
commit 676a99982f
11 changed files with 225 additions and 39 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -34,6 +34,7 @@ class EngineArgs:
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
+    distributed_executor_backend: Optional[str] = None
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
@@ -221,10 +222,17 @@ class EngineArgs:
            ' Can be overridden per request via guided_decoding_backend'
            ' parameter.')
        # Parallel arguments
-        parser.add_argument('--worker-use-ray',
-                            action='store_true',
-                            help='Use Ray for distributed serving, will be '
-                            'automatically set when using more than 1 GPU.')
+        parser.add_argument(
+            '--distributed-executor-backend',
+            choices=['ray', 'mp'],
+            default=EngineArgs.distributed_executor_backend,
+            help='Backend to use for distributed serving. When more than 1 GPU '
+            'is used, will be automatically set to "ray" if installed '
+            'or "mp" (multiprocessing) otherwise.')
+        parser.add_argument(
+            '--worker-use-ray',
+            action='store_true',
+            help='Deprecated, use --distributed-executor-backend=ray.')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,