[Core] Improve choice of Python multiprocessing method (#8823)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: youkaichao <youkaichao@126.com>
This commit is contained in:
Russell Bryant
2024-09-28 21:17:07 -04:00
committed by GitHub
parent cc276443b5
commit d1537039ce
4 changed files with 52 additions and 9 deletions

View File

@@ -15,8 +15,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.triton_utils import maybe_set_triton_cache_manager
from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-get_distributed_init_method, get_open_port,
-get_vllm_instance_id, make_async,
+cuda_is_initialized, get_distributed_init_method,
+get_open_port, get_vllm_instance_id, make_async,
update_environment_variables)
logger = init_logger(__name__)
@@ -122,6 +122,13 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
"CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
+if (cuda_is_initialized()
+and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+logger.warning("CUDA was previously initialized. We must use "
+"the `spawn` multiprocessing start method. Setting "
+"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
cuda_device_count = cuda_device_count_stateless()
# Use confusing message for more common TP-only case.
assert tensor_parallel_size <= cuda_device_count, (