[Core] Improve choice of Python multiprocessing method (#8823)

Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: youkaichao <youkaichao@126.com>
2024-09-28 21:17:07 -04:00
parent cc276443b5
commit d1537039ce
4 changed files with 52 additions and 9 deletions
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -15,8 +15,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async,
+                        cuda_is_initialized, get_distributed_init_method,
+                        get_open_port, get_vllm_instance_id, make_async,
                        update_environment_variables)

 logger = init_logger(__name__)
@@ -122,6 +122,13 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
            })

+        if (cuda_is_initialized()
+                and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+            logger.warning("CUDA was previously initialized. We must use "
+                           "the `spawn` multiprocessing start method. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
        cuda_device_count = cuda_device_count_stateless()
        # Use confusing message for more common TP-only case.
        assert tensor_parallel_size <= cuda_device_count, (