[V1] Support MP Executor for multi node distributed inference (#23691)
Signed-off-by: Lu Fang <fanglu@fb.com> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Signed-off-by: Lucia Fang <fanglu@fb.com> Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -183,15 +183,19 @@ def set_device_control_env_var(
|
||||
for engine subprocess.
|
||||
"""
|
||||
world_size = vllm_config.parallel_config.world_size
|
||||
local_world_size = vllm_config.parallel_config.local_world_size
|
||||
evar = current_platform.device_control_env_var
|
||||
|
||||
value = get_device_indices(evar, local_dp_rank, world_size)
|
||||
value = get_device_indices(evar, local_dp_rank, world_size, local_world_size)
|
||||
with patch.dict(os.environ, values=((evar, value),)):
|
||||
yield
|
||||
|
||||
|
||||
def get_device_indices(
|
||||
device_control_env_var: str, local_dp_rank: int, world_size: int
|
||||
device_control_env_var: str,
|
||||
local_dp_rank: int,
|
||||
world_size: int,
|
||||
local_world_size: int | None = None,
|
||||
):
|
||||
"""
|
||||
Returns a comma-separated string of device indices for the specified
|
||||
@@ -200,10 +204,15 @@ def get_device_indices(
|
||||
For example, if world_size=2 and local_dp_rank=1, and there are 4 devices,
|
||||
this will select devices 2 and 3 for local_dp_rank=1.
|
||||
"""
|
||||
if local_world_size is None:
|
||||
local_world_size = world_size
|
||||
try:
|
||||
value = ",".join(
|
||||
str(current_platform.device_id_to_physical_device_id(i))
|
||||
for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)
|
||||
for i in range(
|
||||
local_dp_rank * world_size,
|
||||
local_dp_rank * world_size + local_world_size,
|
||||
)
|
||||
)
|
||||
except IndexError as e:
|
||||
raise Exception(
|
||||
|
||||
Reference in New Issue
Block a user