Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committer: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions

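Note for readers: nearly every hunk below is a mechanical reformat, not a behavior change. yapf aligned continuation lines under the opening bracket and split long string literals, while ruff's formatter (Black-style) breaks after an opening bracket, indents one level, and honors a "magic trailing comma" that keeps a call expanded. A minimal sketch of the convention change, illustrative only and not part of the commit (run_engine is a hypothetical stand-in for the real target function):

import multiprocessing


def run_engine(dp_rank: int = 0) -> None:
    # Hypothetical stand-in for the engine entry point used in the diff.
    print(f"engine starting with dp_rank={dp_rank}")


context = multiprocessing.get_context("spawn")

# yapf style (before): continuation lines aligned under the opening bracket.
proc_before = context.Process(target=run_engine,
                              name="EngineCore_DP0",
                              kwargs={"dp_rank": 0})

# ruff format (after): break after the bracket, indent one level; the magic
# trailing comma keeps the call expanded even when it would fit on one line.
proc_after = context.Process(
    target=run_engine,
    name="EngineCore_DP0",
    kwargs={"dp_rank": 0},
)

The same rule explains most of the other hunks: backslash continuations become parenthesized expressions, and implicitly concatenated string literals are joined whenever they fit within the line-length limit.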

@@ -70,6 +70,7 @@ class EngineHandshakeMetadata:
     including addresses of the front-end ZMQ queues that they should
     connect to.
    """
+
    addresses: EngineZmqAddresses
    parallel_config: dict[str, Union[int, str, list[int]]]
@@ -103,8 +104,7 @@ class CoreEngineProcManager:
         }
 
         if client_handshake_address:
-            common_kwargs[
-                "client_handshake_address"] = client_handshake_address
+            common_kwargs["client_handshake_address"] = client_handshake_address
 
         self.processes: list[BaseProcess] = []
         local_dp_ranks = []
@@ -115,21 +115,27 @@ class CoreEngineProcManager:
             # Start EngineCore in background process.
             local_dp_ranks.append(local_index)
             self.processes.append(
-                context.Process(target=target_fn,
-                                name=f"EngineCore_DP{global_index}",
-                                kwargs=common_kwargs | {
-                                    "dp_rank": global_index,
-                                    "local_dp_rank": local_index,
-                                }))
+                context.Process(
+                    target=target_fn,
+                    name=f"EngineCore_DP{global_index}",
+                    kwargs=common_kwargs
+                    | {
+                        "dp_rank": global_index,
+                        "local_dp_rank": local_index,
+                    },
+                )
+            )
 
         self._finalizer = weakref.finalize(self, shutdown, self.processes)
 
         data_parallel = vllm_config.parallel_config.data_parallel_size > 1
         try:
             for proc, local_dp_rank in zip(self.processes, local_dp_ranks):
-                with set_device_control_env_var(
-                        vllm_config, local_dp_rank) if (
-                            data_parallel) else contextlib.nullcontext():
+                with (
+                    set_device_control_env_var(vllm_config, local_dp_rank)
+                    if (data_parallel)
+                    else contextlib.nullcontext()
+                ):
                     proc.start()
         finally:
             # Kill other procs if not all are running.
@@ -151,13 +157,15 @@ class CoreEngineProcManager:
         """Returns dict of proc name -> exit code for any finished procs."""
         return {
             proc.name: proc.exitcode
-            for proc in self.processes if proc.exitcode is not None
+            for proc in self.processes
+            if proc.exitcode is not None
         }
 
 
 @contextlib.contextmanager
-def set_device_control_env_var(vllm_config: VllmConfig,
-                               local_dp_rank: int) -> Iterator[None]:
+def set_device_control_env_var(
+    vllm_config: VllmConfig, local_dp_rank: int
+) -> Iterator[None]:
     """
     Temporarily set CUDA_VISIBLE_DEVICES or equivalent
     for engine subprocess.
@@ -166,12 +174,13 @@ def set_device_control_env_var(vllm_config: VllmConfig,
     evar = current_platform.device_control_env_var
     value = get_device_indices(evar, local_dp_rank, world_size)
-    with patch.dict(os.environ, values=((evar, value), )):
+    with patch.dict(os.environ, values=((evar, value),)):
         yield
 
 
-def get_device_indices(device_control_env_var: str, local_dp_rank: int,
-                       world_size: int):
+def get_device_indices(
+    device_control_env_var: str, local_dp_rank: int, world_size: int
+):
     """
     Returns a comma-separated string of device indices for the specified
     data parallel rank.
@@ -182,14 +191,16 @@ def get_device_indices(device_control_env_var: str, local_dp_rank: int,
     try:
         value = ",".join(
             str(current_platform.device_id_to_physical_device_id(i))
-            for i in range(local_dp_rank * world_size, (local_dp_rank + 1) *
-                           world_size))
+            for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)
+        )
     except IndexError as e:
-        raise Exception(f"Error setting {device_control_env_var}: "
-                        f"local range: [{local_dp_rank * world_size}, "
-                        f"{(local_dp_rank + 1) * world_size}) "
-                        "base value: "
-                        f"\"{os.getenv(device_control_env_var)}\"") from e
+        raise Exception(
+            f"Error setting {device_control_env_var}: "
+            f"local range: [{local_dp_rank * world_size}, "
+            f"{(local_dp_rank + 1) * world_size}) "
+            "base value: "
+            f'"{os.getenv(device_control_env_var)}"'
+        ) from e
     return value
@@ -215,8 +226,7 @@ class CoreEngineActorManager:
         import ray
         from ray.runtime_env import RuntimeEnv
-        from ray.util.scheduling_strategies import (
-            PlacementGroupSchedulingStrategy)
+        from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
         from vllm.v1.engine.core import DPEngineCoreActor
@@ -225,8 +235,7 @@ class CoreEngineActorManager:
         env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor")
         self.env_vars_dict = {
-            name: os.environ[name]
-            for name in env_vars_list if name in os.environ
+            name: os.environ[name] for name in env_vars_list if name in os.environ
         }
         runtime_env = RuntimeEnv(env_vars=self.env_vars_dict)
@@ -234,37 +243,38 @@ class CoreEngineActorManager:
         self.executor_class = executor_class
         self.log_stats = log_stats
         dp_size = vllm_config.parallel_config.data_parallel_size
-        local_engine_count = \
-            vllm_config.parallel_config.data_parallel_size_local
+        local_engine_count = vllm_config.parallel_config.data_parallel_size_local
         world_size = vllm_config.parallel_config.world_size
 
         if ray.is_initialized():
-            logger.info(
-                "Ray is already initialized. Skipping Ray initialization.")
+            logger.info("Ray is already initialized. Skipping Ray initialization.")
         else:
             ray.init()
 
         if placement_groups is not None:
             assert local_dp_ranks is not None, (
-                "local_dp_ranks must be provided if "
-                "placement_groups is provided")
+                "local_dp_ranks must be provided if placement_groups is provided"
+            )
             assert len(placement_groups) == len(local_dp_ranks), (
-                "placement_groups and local_dp_ranks must "
-                "have the same length")
+                "placement_groups and local_dp_ranks must have the same length"
+            )
             logger.info("Using provided placement groups")
             # TODO(rui): validate passed-in placement groups
             self.created_placement_groups = []
         else:
-            placement_groups, local_dp_ranks = \
+            placement_groups, local_dp_ranks = (
                 CoreEngineActorManager.create_dp_placement_groups(vllm_config)
+            )
             self.created_placement_groups = placement_groups
         assert len(placement_groups) == dp_size, (
-            "Number of placement groups must match data parallel size")
+            "Number of placement groups must match data parallel size"
+        )
 
         self.placement_group_is_local = []
         refs = []
-        for index, local_index, pg in zip(range(dp_size), local_dp_ranks,
-                                          placement_groups):
+        for index, local_index, pg in zip(
+            range(dp_size), local_dp_ranks, placement_groups
+        ):
             dp_vllm_config = copy.deepcopy(vllm_config)
             dp_vllm_config.parallel_config.placement_group = pg
             local_client = index < local_engine_count
@@ -275,24 +285,32 @@ class CoreEngineActorManager:
             # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501
             if current_platform.is_xpu():
                 device_evar = current_platform.device_control_env_var
-                device_indices = get_device_indices(device_evar, local_index,
-                                                    world_size)
+                device_indices = get_device_indices(
+                    device_evar, local_index, world_size
+                )
                 actor_env_vars = self.env_vars_dict.copy()
                 actor_env_vars[device_evar] = device_indices
                 runtime_env = RuntimeEnv(env_vars=actor_env_vars)
 
-            actor = ray.remote(DPEngineCoreActor).options(
-                scheduling_strategy=PlacementGroupSchedulingStrategy(
-                    placement_group=pg,
-                    placement_group_bundle_index=world_size,
-                ),
-                runtime_env=runtime_env).remote(vllm_config=dp_vllm_config,
-                                                executor_class=executor_class,
-                                                log_stats=log_stats,
-                                                local_client=local_client,
-                                                addresses=addresses,
-                                                dp_rank=index,
-                                                local_dp_rank=local_index)
+            actor = (
+                ray.remote(DPEngineCoreActor)
+                .options(
+                    scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg,
+                        placement_group_bundle_index=world_size,
+                    ),
+                    runtime_env=runtime_env,
+                )
+                .remote(
+                    vllm_config=dp_vllm_config,
+                    executor_class=executor_class,
+                    log_stats=log_stats,
+                    local_client=local_client,
+                    addresses=addresses,
+                    dp_rank=index,
+                    local_dp_rank=local_index,
+                )
+            )
             if local_client:
                 self.local_engine_actors.append(actor)
             else:
@@ -307,7 +325,7 @@ class CoreEngineActorManager:
 
     @staticmethod
     def create_dp_placement_groups(
-        vllm_config: VllmConfig
+        vllm_config: VllmConfig,
     ) -> tuple[list["PlacementGroup"], list[int]]:
         """
         Create placement groups for data parallel.
@@ -317,23 +335,23 @@ class CoreEngineActorManager:
         from ray._private.state import available_resources_per_node
 
         logger.info("Creating placement groups for data parallel")
-        dp_master_ip = \
-            vllm_config.parallel_config.data_parallel_master_ip
+        dp_master_ip = vllm_config.parallel_config.data_parallel_master_ip
         num_pg_to_create = vllm_config.parallel_config.data_parallel_size
-        local_engine_count = \
-            vllm_config.parallel_config.data_parallel_size_local
+        local_engine_count = vllm_config.parallel_config.data_parallel_size_local
         available_resources = available_resources_per_node()
         world_size = vllm_config.parallel_config.world_size
         placement_groups: list[PlacementGroup] = []
         local_dp_ranks: list[int] = []
-        dp_master_ip_key = f'node:{dp_master_ip}'
-        nodes = sorted(available_resources.values(),
-                       key=lambda x: dp_master_ip_key not in x)
-        assert len(nodes) > 0, (
-            "No nodes with resources found in Ray cluster.")
+        dp_master_ip_key = f"node:{dp_master_ip}"
+        nodes = sorted(
+            available_resources.values(), key=lambda x: dp_master_ip_key not in x
+        )
+        assert len(nodes) > 0, "No nodes with resources found in Ray cluster."
         assert dp_master_ip_key in nodes[0], (
-            "The DP master node (ip: %s) is missing or dead", dp_master_ip)
+            "The DP master node (ip: %s) is missing or dead",
+            dp_master_ip,
+        )
         device_str = current_platform.ray_device_key
         for node_resources in nodes:
             if device_str not in node_resources:
@@ -341,19 +359,16 @@ class CoreEngineActorManager:
             # For now, each DP rank can only be assigned to one node
            # TODO(rui): support allocating a single DP rank
            # to multiple nodes
-            available_engine_count = int(
-                node_resources[device_str]) // world_size
+            available_engine_count = int(node_resources[device_str]) // world_size
             if dp_master_ip_key in node_resources:
                 assert available_engine_count >= local_engine_count, (
                     "Not enough resources to allocate DP ranks "
-                    f"on DP master node {dp_master_ip}")
+                    f"on DP master node {dp_master_ip}"
+                )
                 for i in range(local_engine_count):
-                    bundles = [{
-                        device_str: 1.0,
-                        "node:" + dp_master_ip: 0.001
-                    }] * world_size + [{
-                        "CPU": 1.0
-                    }]
+                    bundles = [
+                        {device_str: 1.0, "node:" + dp_master_ip: 0.001}
+                    ] * world_size + [{"CPU": 1.0}]
                     pg = ray.util.placement_group(
                         name=f"dp_rank_{len(placement_groups)}",
                         strategy="STRICT_PACK",
@@ -379,7 +394,8 @@ class CoreEngineActorManager:
                 "placement groups, only created "
                 f"{len(placement_groups)} placement groups. "
                 "Available resources: "
-                f"{available_resources}")
+                f"{available_resources}"
+            )
         return placement_groups, local_dp_ranks
 
     @staticmethod
@@ -390,8 +406,10 @@ class CoreEngineActorManager:
         Add placement groups for new data parallel size.
         """
         import ray
-        from ray._private.state import (available_resources_per_node,
-                                        total_resources_per_node)
+        from ray._private.state import (
+            available_resources_per_node,
+            total_resources_per_node,
+        )
         from ray.util.state import list_nodes
 
         old_dp_size = old_vllm_config.parallel_config.data_parallel_size
@@ -405,10 +423,10 @@ class CoreEngineActorManager:
 
         nodes = list_nodes()
         nodes = sorted(nodes, key=lambda node: node.node_ip != dp_master_ip)
-        assert nodes[0].node_ip == dp_master_ip, (
-            "The first node must be the head node")
+        assert nodes[0].node_ip == dp_master_ip, "The first node must be the head node"
         assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
-            "There can only be one head node")
+            "There can only be one head node"
+        )
 
         available_resources = available_resources_per_node()
         total_resources = total_resources_per_node()
@@ -446,12 +464,9 @@ class CoreEngineActorManager:
 
             # Create bundles with node constraint for master node
             if node_ip == dp_master_ip:
-                bundles = [{
-                    device_str: 1.0,
-                    "node:" + dp_master_ip: 0.001
-                }] * world_size + [{
-                    "CPU": 1.0
-                }]
+                bundles = [
+                    {device_str: 1.0, "node:" + dp_master_ip: 0.001}
+                ] * world_size + [{"CPU": 1.0}]
             else:
                 bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
 
@@ -470,69 +485,76 @@ class CoreEngineActorManager:
 
         return placement_groups, local_dp_ranks
 
-    def scale_up_elastic_ep(self, cur_vllm_config: VllmConfig,
-                            new_data_parallel_size: int) -> None:
+    def scale_up_elastic_ep(
+        self, cur_vllm_config: VllmConfig, new_data_parallel_size: int
+    ) -> None:
         import copy
 
         import ray
         from ray.runtime_env import RuntimeEnv
-        from ray.util.scheduling_strategies import (
-            PlacementGroupSchedulingStrategy)
+        from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
         from vllm.v1.engine.core import DPEngineCoreActor
 
-        cur_data_parallel_size = len(self.local_engine_actors) + \
-                                 len(self.remote_engine_actors)
+        cur_data_parallel_size = len(self.local_engine_actors) + len(
+            self.remote_engine_actors
+        )
 
         assert new_data_parallel_size > cur_data_parallel_size, (
             f"New data parallel size {new_data_parallel_size} must be greater "
             f"than current data parallel size {cur_data_parallel_size} "
-            "for scale up")
+            "for scale up"
+        )
 
-        placement_groups, local_dp_ranks = \
-            self.add_dp_placement_groups(
-                cur_vllm_config, new_data_parallel_size)
+        placement_groups, local_dp_ranks = self.add_dp_placement_groups(
+            cur_vllm_config, new_data_parallel_size
+        )
 
         world_size = cur_vllm_config.parallel_config.world_size
         dp_master_ip = cur_vllm_config.parallel_config.data_parallel_master_ip
         new_local_engines = 0
 
-        runtime_env = RuntimeEnv(env_vars=self.env_vars_dict
-                                 | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"})
-        for i, (pg,
-                local_rank) in enumerate(zip(placement_groups,
-                                             local_dp_ranks)):
+        runtime_env = RuntimeEnv(
+            env_vars=self.env_vars_dict | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"}
+        )
+        for i, (pg, local_rank) in enumerate(zip(placement_groups, local_dp_ranks)):
             rank = cur_data_parallel_size + i
             dp_vllm_config = copy.deepcopy(cur_vllm_config)
-            dp_vllm_config.parallel_config.data_parallel_size = \
-                new_data_parallel_size
+            dp_vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
             dp_vllm_config.parallel_config.placement_group = pg
 
             # Check if this placement group is on the head node
             local_client = any(
-                bundle.get("node:" + dp_master_ip, 0) > 0
-                for bundle in pg.bundle_specs)
+                bundle.get("node:" + dp_master_ip, 0) > 0 for bundle in pg.bundle_specs
+            )
             if local_client:
                 new_local_engines += 1
                 # Update data_parallel_size_local
                 dp_vllm_config.parallel_config.data_parallel_size_local = (
-                    cur_vllm_config.parallel_config.data_parallel_size_local +
-                    new_local_engines)
+                    cur_vllm_config.parallel_config.data_parallel_size_local
+                    + new_local_engines
+                )
 
-            actor = ray.remote(DPEngineCoreActor).options(
-                scheduling_strategy=PlacementGroupSchedulingStrategy(
-                    placement_group=pg,
-                    placement_group_bundle_index=world_size,
-                ),
-                runtime_env=runtime_env).remote(
+            actor = (
+                ray.remote(DPEngineCoreActor)
+                .options(
+                    scheduling_strategy=PlacementGroupSchedulingStrategy(
+                        placement_group=pg,
+                        placement_group_bundle_index=world_size,
+                    ),
+                    runtime_env=runtime_env,
+                )
+                .remote(
                     vllm_config=dp_vllm_config,
                     executor_class=self.executor_class,
                     log_stats=self.log_stats,
                     local_client=local_client,
                     addresses=self.addresses,
                     dp_rank=rank,
-                    local_dp_rank=local_rank)
+                    local_dp_rank=local_rank,
+                )
+            )
 
             if local_client:
                 self.local_engine_actors.append(actor)
@@ -541,37 +563,47 @@ class CoreEngineActorManager:
             self.created_placement_groups.append(pg)
             self.placement_group_is_local.append(local_client)
 
-        ray.get([
-            actor.wait_for_init.remote()
-            for actor in (self.local_engine_actors[-new_local_engines:]
-                          if new_local_engines > 0 else []) +
-            self.remote_engine_actors[-(len(placement_groups) -
-                                        new_local_engines):]
-        ])
+        ray.get(
+            [
+                actor.wait_for_init.remote()
+                for actor in (
+                    self.local_engine_actors[-new_local_engines:]
+                    if new_local_engines > 0
+                    else []
+                )
+                + self.remote_engine_actors[
+                    -(len(placement_groups) - new_local_engines) :
+                ]
+            ]
+        )
 
-        actors = (self.local_engine_actors[-new_local_engines:]
-                  if new_local_engines > 0 else []) + \
-                 self.remote_engine_actors[-(len(placement_groups) -
-                                             new_local_engines):]
+        actors = (
+            self.local_engine_actors[-new_local_engines:]
+            if new_local_engines > 0
+            else []
+        ) + self.remote_engine_actors[-(len(placement_groups) - new_local_engines) :]
 
         for actor in actors:
             self.run_refs.append(actor.run.remote())
 
-        cur_vllm_config.parallel_config.data_parallel_size = \
-            new_data_parallel_size
+        cur_vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
         # Update old_vllm_config with new data_parallel_size_local if any new
         # local engines were added
         if new_local_engines > 0:
-            cur_vllm_config.parallel_config.data_parallel_size_local += \
+            cur_vllm_config.parallel_config.data_parallel_size_local += (
                 new_local_engines
+            )
 
-    def scale_down_elastic_ep(self, cur_data_parallel_size: int,
-                              new_data_parallel_size: int) -> None:
+    def scale_down_elastic_ep(
+        self, cur_data_parallel_size: int, new_data_parallel_size: int
+    ) -> None:
         import ray
 
         assert cur_data_parallel_size > new_data_parallel_size, (
             f"cur_data_parallel_size {cur_data_parallel_size} must be greater "
             f"than new_data_parallel_size {new_data_parallel_size} "
-            "for scale down")
+            "for scale down"
+        )
         for _ in range(cur_data_parallel_size - new_data_parallel_size):
             pg = self.created_placement_groups.pop()
             is_local = self.placement_group_is_local.pop()
@@ -586,6 +618,7 @@ class CoreEngineActorManager:
 
     def close(self):
         import ray
+
         for actor in self.local_engine_actors + self.remote_engine_actors:
             ray.kill(actor)
         for pg in self.created_placement_groups:
@@ -598,11 +631,13 @@ def launch_core_engines(
     executor_class: type[Executor],
     log_stats: bool,
     num_api_servers: int = 1,
-) -> Iterator[tuple[
+) -> Iterator[
+    tuple[
         Optional[Union[CoreEngineProcManager, CoreEngineActorManager]],
         Optional[DPCoordinator],
         EngineZmqAddresses,
-]]:
+    ]
+]:
     """Launch engine and DP coordinator processes as needed."""
     parallel_config = vllm_config.parallel_config
@@ -611,8 +646,10 @@ def launch_core_engines(
     local_start_index = parallel_config.data_parallel_rank_local
     dp_rank = parallel_config.data_parallel_rank
     host = parallel_config.data_parallel_master_ip
-    local_engines_only = (parallel_config.data_parallel_hybrid_lb
-                          or parallel_config.data_parallel_external_lb)
+    local_engines_only = (
+        parallel_config.data_parallel_hybrid_lb
+        or parallel_config.data_parallel_external_lb
+    )
 
     # In offline mode there is an LLM instance per DP rank and
     # one core engine per LLM, see
@@ -621,8 +658,9 @@ def launch_core_engines(
 
     # client_local_only = True for cases where this front-end
     # sends requests only to colocated engines.
-    client_local_only = (offline_mode or local_engines_only
-                         or (local_engine_count == dp_size))
+    client_local_only = (
+        offline_mode or local_engines_only or (local_engine_count == dp_size)
+    )
 
     # Set up input and output addresses.
     addresses = EngineZmqAddresses(
@@ -644,12 +682,13 @@ def launch_core_engines(
         coordinator = DPCoordinator(parallel_config)
 
         addresses.coordinator_input, addresses.coordinator_output = (
-            coordinator.get_engine_socket_addresses())
+            coordinator.get_engine_socket_addresses()
+        )
         addresses.frontend_stats_publish_address = (
-            coordinator.get_stats_publish_address())
+            coordinator.get_stats_publish_address()
+        )
 
-        logger.info("Started DP Coordinator process (PID: %d)",
-                    coordinator.proc.pid)
+        logger.info("Started DP Coordinator process (PID: %d)", coordinator.proc.pid)
     else:
         coordinator = None
@@ -675,14 +714,14 @@ def launch_core_engines(
         # Note this also covers the case where we have zero local engines
         # and rank 0 is headless.
         engines_to_handshake = [
-            CoreEngine(index=i, local=(i < local_engine_count))
-            for i in range(dp_size)
+            CoreEngine(index=i, local=(i < local_engine_count)) for i in range(dp_size)
         ]
     else:
         # Rank > 0 handshakes with just the local cores it is managing.
         assert local_engines_only, (
             "Attempting to launch core_engines from dp_rank > 0, but "
-            "found internal DPLB, which is incompatible.")
+            "found internal DPLB, which is incompatible."
+        )
         engines_to_handshake = [
             CoreEngine(index=i, local=True)
             for i in range(dp_rank, dp_rank + local_engine_count)
@@ -695,7 +734,8 @@ def launch_core_engines(
     handshake_local_only = offline_mode or local_engine_count == dp_size
 
     handshake_address = get_engine_client_zmq_addr(
-        handshake_local_only, host, parallel_config.data_parallel_rpc_port)
+        handshake_local_only, host, parallel_config.data_parallel_rpc_port
+    )
 
     if local_engines_only and dp_rank > 0:
         assert not handshake_local_only
@@ -705,9 +745,9 @@ def launch_core_engines(
         local_handshake_address = handshake_address
         client_handshake_address = None
 
-    with zmq_socket_ctx(local_handshake_address, zmq.ROUTER,
-                        bind=True) as handshake_socket:
+    with zmq_socket_ctx(
+        local_handshake_address, zmq.ROUTER, bind=True
+    ) as handshake_socket:
         from vllm.v1.engine.core import EngineCoreProc
 
         # Start local engines.
@@ -722,7 +762,8 @@ def launch_core_engines(
                 local_client=True,
                 local_engine_count=local_engine_count,
                 start_index=dp_rank,
-                local_start_index=local_start_index or 0)
+                local_start_index=local_start_index or 0,
+            )
     else:
         local_engine_manager = None
@@ -757,8 +798,10 @@ def wait_for_engine_startup(
     poller = zmq.Poller()
     poller.register(handshake_socket, zmq.POLLIN)
 
-    remote_should_be_headless = not parallel_config.data_parallel_hybrid_lb \
+    remote_should_be_headless = (
+        not parallel_config.data_parallel_hybrid_lb
         and not parallel_config.data_parallel_external_lb
+    )
 
     if proc_manager is not None:
         for sentinel in proc_manager.sentinels():
@@ -770,67 +813,73 @@ def wait_for_engine_startup(
         if not events:
             if any(conn_pending):
                 logger.debug(
-                    "Waiting for %d local, %d remote core engine proc(s) "
-                    "to connect.", *conn_pending)
+                    "Waiting for %d local, %d remote core engine proc(s) to connect.",
+                    *conn_pending,
+                )
             if any(start_pending):
                 logger.debug(
-                    "Waiting for %d local, %d remote core engine proc(s) "
-                    "to start.", *start_pending)
+                    "Waiting for %d local, %d remote core engine proc(s) to start.",
+                    *start_pending,
+                )
             continue
         if len(events) > 1 or events[0][0] != handshake_socket:
             # One of the local core processes exited.
             finished = proc_manager.finished_procs() if proc_manager else {}
             if coord_process is not None and coord_process.exitcode is not None:
                 finished[coord_process.name] = coord_process.exitcode
-            raise RuntimeError("Engine core initialization failed. "
-                               "See root cause above. "
-                               f"Failed core proc(s): {finished}")
+            raise RuntimeError(
+                "Engine core initialization failed. "
+                "See root cause above. "
+                f"Failed core proc(s): {finished}"
+            )
 
         # Receive HELLO and READY messages from the input socket.
         eng_identity, ready_msg_bytes = handshake_socket.recv_multipart()
         eng_index = int.from_bytes(eng_identity, "little")
-        engine = next((e for e in core_engines if e.identity == eng_identity),
-                      None)
+        engine = next((e for e in core_engines if e.identity == eng_identity), None)
         if engine is None:
-            raise RuntimeError(f"Message from engine with unexpected data "
-                               f"parallel rank: {eng_index}")
+            raise RuntimeError(
+                f"Message from engine with unexpected data parallel rank: {eng_index}"
+            )
         msg = msgspec.msgpack.decode(ready_msg_bytes)
         status, local, headless = msg["status"], msg["local"], msg["headless"]
         if local != engine.local:
-            raise RuntimeError(f"{status} message from "
-                               f"{'local' if local else 'remote'} "
-                               f"engine {eng_index}, expected it to be "
-                               f"{'local' if engine.local else 'remote'}")
+            raise RuntimeError(
+                f"{status} message from "
+                f"{'local' if local else 'remote'} "
+                f"engine {eng_index}, expected it to be "
+                f"{'local' if engine.local else 'remote'}"
+            )
 
         # Remote engines must be headless iff we aren't in hybrid dp lb mode.
         if not local and headless != remote_should_be_headless:
             if headless:
-                raise RuntimeError(f"Remote engine {eng_index} must not use "
-                                   f"--headless in external or hybrid dp lb "
-                                   f"mode")
+                raise RuntimeError(
+                    f"Remote engine {eng_index} must not use "
+                    f"--headless in external or hybrid dp lb "
+                    f"mode"
+                )
             else:
-                raise RuntimeError(f"Remote engine {eng_index} must use "
-                                   f"--headless unless in external or hybrid "
-                                   f"dp lb mode")
+                raise RuntimeError(
+                    f"Remote engine {eng_index} must use "
+                    f"--headless unless in external or hybrid "
+                    f"dp lb mode"
+                )
 
         if status == "HELLO" and engine.state == CoreEngineState.NEW:
             # Send init message with DP config info.
             init_message = msgspec.msgpack.encode(
                 EngineHandshakeMetadata(
                     addresses=addresses,
                     parallel_config={
-                        "data_parallel_master_ip":
-                        parallel_config.data_parallel_master_ip,
-                        "data_parallel_master_port":
-                        parallel_config.data_parallel_master_port,
-                        "_data_parallel_master_port_list":
-                        parallel_config._data_parallel_master_port_list,
-                        "data_parallel_size":
-                        parallel_config.data_parallel_size,
-                    }))
-            handshake_socket.send_multipart((eng_identity, init_message),
-                                            copy=False)
+                        "data_parallel_master_ip": parallel_config.data_parallel_master_ip,
+                        "data_parallel_master_port": parallel_config.data_parallel_master_port,
+                        "_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list,
+                        "data_parallel_size": parallel_config.data_parallel_size,
+                    },
+                )
+            )
+            handshake_socket.send_multipart((eng_identity, init_message), copy=False)
             conn_pending[0 if local else 1] -= 1
             start_pending[0 if local else 1] += 1
             engine.state = CoreEngineState.CONNECTED
@@ -846,15 +895,20 @@ def wait_for_engine_startup(
             # one of the engine handshakes, and passed to the local
             # front-end process in the response from the other.
             if addresses.frontend_stats_publish_address is None:
-                addresses.frontend_stats_publish_address = msg.get(
-                    "dp_stats_address")
+                addresses.frontend_stats_publish_address = msg.get("dp_stats_address")
             start_pending[0 if local else 1] -= 1
             engine.state = CoreEngineState.READY
         else:
-            raise RuntimeError(f"Unexpected {status} message for "
-                               f"{'local' if local else 'remote'} engine "
-                               f"{eng_index} in {engine.state} state.")
+            raise RuntimeError(
+                f"Unexpected {status} message for "
+                f"{'local' if local else 'remote'} engine "
+                f"{eng_index} in {engine.state} state."
+            )
 
-        logger.debug("%s from %s core engine process %s.", status,
-                     "local" if local else "remote", eng_index)
+        logger.debug(
+            "%s from %s core engine process %s.",
+            status,
+            "local" if local else "remote",
+            eng_index,
+        )