[core][distributed] zmq fallback for broadcasting large objects (#6183)

[core][distributed] add zmq fallback for broadcasting large objects (#6183)
This commit is contained in:
youkaichao
2024-07-09 18:49:11 -07:00
committed by GitHub
parent 2416b26e11
commit da78caecfa
6 changed files with 274 additions and 80 deletions

View File

@@ -6,8 +6,7 @@ from typing import List
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import (
ShmRingBuffer, ShmRingBufferIO)
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.utils import update_environment_variables
@@ -56,8 +55,8 @@ def worker_fn_wrapper(fn):
@worker_fn_wrapper
def worker_fn():
writer_rank = 2
broadcaster = ShmRingBufferIO.create_from_process_group(
dist.group.WORLD, 1024 * 1024, 2, writer_rank)
broadcaster = MessageQueue.create_from_process_group(
dist.group.WORLD, 40 * 1024, 2, writer_rank)
if dist.get_rank() == writer_rank:
seed = random.randint(0, 1000)
dist.broadcast_object_list([seed], writer_rank)
@@ -87,13 +86,3 @@ def worker_fn():
def test_shm_broadcast():
    """Run ``worker_fn`` through ``distributed_run`` with an argument of 4.

    NOTE(review): the 4 is presumably the number of worker processes to
    launch -- confirm against ``distributed_run``'s signature.
    """
    num_workers = 4
    distributed_run(worker_fn, num_workers)
def test_singe_process():
    """Round-trip two items through one ring buffer inside a single process.

    Two ``ShmRingBufferIO`` handles are built over the same ``ShmRingBuffer``:
    one with ``reader_rank=0`` (used to dequeue) and one with
    ``reader_rank=-1`` (used to enqueue). The items enqueued through the
    second handle must be dequeued from the first in the same order.
    """
    # NOTE(review): the meaning of the positional arguments (1, 1024, 4) is
    # not visible from here -- confirm against ShmRingBuffer's signature.
    ring = ShmRingBuffer(1, 1024, 4)
    consumer = ShmRingBufferIO(ring, reader_rank=0)
    producer = ShmRingBufferIO(ring, reader_rank=-1)

    values = (0, 1)
    # Enqueue everything first, then drain, mirroring a FIFO order check.
    for value in values:
        producer.enqueue([value])
    for value in values:
        assert consumer.dequeue() == [value]