[1/N] Elastic EP Milestone 2 (#34861)
Signed-off-by: Yongji Wu <wuyongji317@gmail.com>
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com>
Co-authored-by: Yongji Wu <wuyongji317@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
This commit is contained in:
@@ -71,6 +71,9 @@ class DPCoordinator:
|
||||
)
|
||||
|
||||
local_only_eng = dp_size == parallel_config.data_parallel_size_local
|
||||
# NOTE(yongji): with elastic EP, engines may scale from intra-node to inter-node, so never treat them as local-only
|
||||
if parallel_config.enable_elastic_ep:
|
||||
local_only_eng = False
|
||||
back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
|
||||
back_output_address = get_engine_client_zmq_addr(local_only_eng, host)
|
||||
|
||||
@@ -201,6 +204,7 @@ class DPCoordinatorProc:
|
||||
|
||||
poller = zmq.Poller()
|
||||
poller.register(publish_front, zmq.POLLIN)
|
||||
poller.register(publish_back, zmq.POLLIN)
|
||||
poller.register(output_back, zmq.POLLIN)
|
||||
last_publish_time = 0
|
||||
while True:
|
||||
@@ -231,6 +235,22 @@ class DPCoordinatorProc:
|
||||
events = dict(events)
|
||||
wave_state_changed = False
|
||||
|
||||
if publish_back in events:
|
||||
buffer = publish_back.recv()
|
||||
if buffer == b"\x01":
|
||||
# NOTE(yongji): a newly started engine has subscribed.
|
||||
# We need to send READY message here instead of receiving
|
||||
# SCALE_ELASTIC_EP notification from engine core client
|
||||
# as SCALE_ELASTIC_EP is only sent when
|
||||
# new engines finished initialization.
|
||||
# The subscription message, on the other hand, is sent
|
||||
# by each engine during initialization
|
||||
publish_back.send(b"READY")
|
||||
else:
|
||||
logger.error(
|
||||
"DP Coordinator receives unexpected message from engines"
|
||||
)
|
||||
|
||||
if publish_front in events:
|
||||
buffer = publish_front.recv()
|
||||
if buffer in (b"\x01", b"\x00"):
|
||||
@@ -259,7 +279,6 @@ class DPCoordinatorProc:
|
||||
# current_wave
|
||||
# we note that 0 is the wave number for the new
|
||||
# engine
|
||||
engines_running = False
|
||||
logger.info(
|
||||
"DPCoordinator scaled up from %s to %s engines",
|
||||
current_count,
|
||||
|
||||
Reference in New Issue
Block a user