fix[DP][v1]: Prevent hangs from mismatched worker configurations (#26218)

Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
This commit is contained in:
Ayush Satyam
2025-10-08 11:25:08 +05:30
committed by GitHub
parent 0d4f48fa10
commit 5e65d6b2ad
3 changed files with 46 additions and 11 deletions

View File

@@ -681,17 +681,21 @@ class EngineCoreProc(EngineCore):
# external LB case for our colocated front-end to use (coordinator
# only runs with rank 0).
dp_stats_address = self.frontend_stats_publish_address
handshake_socket.send(
msgspec.msgpack.encode(
{
"status": "READY",
"local": local_client,
"headless": headless,
"num_gpu_blocks": num_gpu_blocks,
"dp_stats_address": dp_stats_address,
}
# Build the READY message; when data_parallel_size > 1, also include the
# parallel config hash for DP configuration validation
ready_msg = {
"status": "READY",
"local": local_client,
"headless": headless,
"num_gpu_blocks": num_gpu_blocks,
"dp_stats_address": dp_stats_address,
}
if vllm_config.parallel_config.data_parallel_size > 1:
ready_msg["parallel_config_hash"] = (
vllm_config.parallel_config.compute_hash()
)
)
handshake_socket.send(msgspec.msgpack.encode(ready_msg))
@staticmethod
def startup_handshake(