[Feat][RL][2/2] Native Weight Syncing API: IPC (#34171)
Signed-off-by: hao-aaron <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
This commit is contained in:
@@ -3,18 +3,26 @@
|
||||
"""Tests for weight transfer engine backends.
|
||||
|
||||
Unit tests for engine classes (parsing, validation, registry).
|
||||
Integration test for NCCL weight transfer between processes using Ray.
|
||||
Integration tests for NCCL and IPC weight transfer between processes using Ray.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import pickle
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
from torch.multiprocessing.reductions import reduce_tensor
|
||||
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.weight_transfer import WeightTransferConfig
|
||||
from vllm.distributed.weight_transfer import WeightTransferEngineFactory
|
||||
from vllm.distributed.weight_transfer.ipc_engine import (
|
||||
IPCWeightTransferEngine,
|
||||
IPCWeightTransferInitInfo,
|
||||
IPCWeightTransferUpdateInfo,
|
||||
)
|
||||
from vllm.distributed.weight_transfer.nccl_engine import (
|
||||
NCCLWeightTransferEngine,
|
||||
NCCLWeightTransferInitInfo,
|
||||
@@ -155,9 +163,29 @@ class TestEngineRegistry:
|
||||
engine = WeightTransferEngineFactory.create_engine(config, parallel_config)
|
||||
assert isinstance(engine, NCCLWeightTransferEngine)
|
||||
|
||||
def test_create_engine_ipc(self):
    """Factory should build an IPCWeightTransferEngine for backend='ipc'."""
    ipc_config = WeightTransferConfig(backend="ipc")
    mock_parallel = create_mock_parallel_config()
    created = WeightTransferEngineFactory.create_engine(ipc_config, mock_parallel)
    assert isinstance(created, IPCWeightTransferEngine)
|
||||
|
||||
def test_create_engine_invalid_backend(self):
    """Test factory raises for invalid backend.

    Pydantic validates the Literal `backend` field at construction time, so
    an invalid backend cannot reach the factory through normal construction.
    We therefore test two things: (1) construction itself raises, and
    (2) the factory's own error path, reached by corrupting a valid config
    with `object.__setattr__` to bypass Pydantic validation.
    """
    from pydantic import ValidationError

    # BUG FIX: previously this test constructed
    # `WeightTransferConfig(backend="invalid")` unguarded as its first
    # statement, which raises ValidationError immediately and fails the
    # test before any assertion runs. The construction now happens only
    # inside pytest.raises.
    with pytest.raises(ValidationError):
        WeightTransferConfig(backend="invalid")

    # Exercise the factory error: start from a valid backend, then
    # bypass Pydantic validation to corrupt it.
    config = WeightTransferConfig(backend="nccl")
    object.__setattr__(config, "backend", "invalid")
    parallel_config = create_mock_parallel_config()
    with pytest.raises(ValueError, match="Invalid weight transfer backend"):
        WeightTransferEngineFactory.create_engine(config, parallel_config)
|
||||
@@ -344,3 +372,426 @@ def test_nccl_weight_transfer_between_processes():
|
||||
f"Received shape: {result['received_shape']}, "
|
||||
f"Received sum: {result['received_sum']}"
|
||||
)
|
||||
|
||||
|
||||
# --- Unit Tests: IPCWeightTransferUpdateInfo Validation ---
|
||||
|
||||
|
||||
class TestIPCWeightTransferUpdateInfoValidation:
    """Test IPCWeightTransferUpdateInfo dataclass validation.

    The per-test GPU-skip + dummy-tensor + IPC-handle boilerplate that was
    duplicated across five tests is factored into `_make_ipc_handles`.
    """

    @staticmethod
    def _make_ipc_handles(num_handles: int) -> list[dict]:
        """Return `num_handles` IPC-handle dicts keyed by this GPU's UUID.

        Skips the calling test when no CUDA device is available, since
        IPC handles can only be produced from device memory. The handles
        are only passed around by these validation tests, never opened,
        so the backing tensor does not need to outlive this call.
        """
        if torch.cuda.device_count() < 1:
            pytest.skip("Need at least 1 GPU for this test")
        dummy_tensor = torch.ones(10, 10, device="cuda:0")
        ipc_handle = reduce_tensor(dummy_tensor)
        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
        return [{gpu_uuid: ipc_handle} for _ in range(num_handles)]

    def test_valid_update_info(self):
        """Test creating valid IPCWeightTransferUpdateInfo."""
        ipc_handles = self._make_ipc_handles(1)

        info = IPCWeightTransferUpdateInfo(
            names=["layer.weight"],
            dtype_names=["float32"],
            shapes=[[10, 10]],
            ipc_handles=ipc_handles,
        )
        assert info.names == ["layer.weight"]
        assert info.dtype_names == ["float32"]
        assert info.shapes == [[10, 10]]
        assert len(info.ipc_handles) == 1

    def test_mismatched_dtype_names_raises(self):
        """Test that mismatched dtype_names length raises ValueError."""
        ipc_handles = self._make_ipc_handles(2)

        with pytest.raises(ValueError, match="dtype_names"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32"],  # Only one dtype
                shapes=[[10, 10], [10]],
                ipc_handles=ipc_handles,
            )

    def test_mismatched_shapes_raises(self):
        """Test that mismatched shapes length raises ValueError."""
        ipc_handles = self._make_ipc_handles(2)

        with pytest.raises(ValueError, match="shapes"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32", "float32"],
                shapes=[[10, 10]],  # Only one shape
                ipc_handles=ipc_handles,
            )

    def test_mismatched_ipc_handles_raises(self):
        """Test that mismatched ipc_handles length raises ValueError."""
        ipc_handles = self._make_ipc_handles(1)  # Only one handle

        with pytest.raises(ValueError, match="ipc_handles"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight", "layer.bias"],
                dtype_names=["float32", "float32"],
                shapes=[[10, 10], [10]],
                ipc_handles=ipc_handles,
            )

    def test_valid_update_info_from_pickled(self):
        """Test creating IPCWeightTransferUpdateInfo from pickled handles."""
        ipc_handles = self._make_ipc_handles(1)

        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")

        info = IPCWeightTransferUpdateInfo(
            names=["layer.weight"],
            dtype_names=["float32"],
            shapes=[[10, 10]],
            ipc_handles_pickled=pickled,
        )
        # __post_init__ is expected to unpickle into ipc_handles and
        # clear the pickled field.
        assert info.ipc_handles == ipc_handles
        assert info.ipc_handles_pickled is None

    def test_both_handles_and_pickled_raises(self):
        """Test that providing both ipc_handles and ipc_handles_pickled raises."""
        ipc_handles = self._make_ipc_handles(1)

        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")

        with pytest.raises(ValueError, match="Cannot specify both"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight"],
                dtype_names=["float32"],
                shapes=[[10, 10]],
                ipc_handles=ipc_handles,
                ipc_handles_pickled=pickled,
            )

    def test_neither_handles_nor_pickled_raises(self):
        """Test that providing neither ipc_handles nor ipc_handles_pickled raises."""
        # No GPU needed: validation fails before any handle is touched.
        with pytest.raises(ValueError, match="must be provided"):
            IPCWeightTransferUpdateInfo(
                names=["layer.weight"],
                dtype_names=["float32"],
                shapes=[[10, 10]],
            )

    def test_empty_lists_valid(self):
        """Test that empty lists are valid."""
        info = IPCWeightTransferUpdateInfo(
            names=[],
            dtype_names=[],
            shapes=[],
            ipc_handles=[],
        )
        assert len(info.names) == 0
|
||||
|
||||
|
||||
# --- Unit Tests: IPC Engine Parsing ---
|
||||
|
||||
|
||||
class TestIPCEngineParsing:
    """Test IPCWeightTransferEngine parsing methods.

    The engine + dummy-handle setup duplicated across both tests is
    factored into `_make_engine_and_handles`.
    """

    @staticmethod
    def _make_engine_and_handles():
        """Create an IPC engine plus handles for two dummy CUDA tensors.

        Returns (engine, ipc_handles, gpu_uuid), where ipc_handles holds
        handles for a 100x100 tensor and a length-50 tensor, both keyed by
        this GPU's UUID. Skips the calling test when no GPU is available.
        The handles are only parsed, never opened, so the backing tensors
        do not need to outlive this call.
        """
        if torch.cuda.device_count() < 1:
            pytest.skip("Need at least 1 GPU for this test")

        config = WeightTransferConfig(backend="ipc")
        parallel_config = create_mock_parallel_config()
        engine = IPCWeightTransferEngine(config, parallel_config)

        dummy_tensor1 = torch.ones(100, 100, device="cuda:0")
        dummy_tensor2 = torch.ones(50, device="cuda:0")
        ipc_handle1 = reduce_tensor(dummy_tensor1)
        ipc_handle2 = reduce_tensor(dummy_tensor2)
        gpu_uuid = str(torch.cuda.get_device_properties(0).uuid)
        ipc_handles = [{gpu_uuid: ipc_handle1}, {gpu_uuid: ipc_handle2}]
        return engine, ipc_handles, gpu_uuid

    def test_parse_update_info_valid(self):
        """Test parsing valid update info dict."""
        engine, ipc_handles, _ = self._make_engine_and_handles()

        update_info = engine.parse_update_info(
            {
                "names": ["w1", "w2"],
                "dtype_names": ["float32", "bfloat16"],
                "shapes": [[100, 100], [50]],
                "ipc_handles": ipc_handles,
            }
        )

        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
        assert update_info.names == ["w1", "w2"]
        assert update_info.dtype_names == ["float32", "bfloat16"]
        assert update_info.shapes == [[100, 100], [50]]
        assert len(update_info.ipc_handles) == 2

    def test_parse_update_info_pickled(self):
        """Test parsing update info with pickled IPC handles (HTTP path)."""
        engine, ipc_handles, gpu_uuid = self._make_engine_and_handles()

        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")

        update_info = engine.parse_update_info(
            {
                "names": ["w1", "w2"],
                "dtype_names": ["float32", "bfloat16"],
                "shapes": [[100, 100], [50]],
                "ipc_handles_pickled": pickled,
            }
        )

        assert isinstance(update_info, IPCWeightTransferUpdateInfo)
        assert update_info.names == ["w1", "w2"]
        assert len(update_info.ipc_handles) == 2
        assert update_info.ipc_handles_pickled is None
        assert gpu_uuid in update_info.ipc_handles[0]
        assert gpu_uuid in update_info.ipc_handles[1]
|
||||
|
||||
|
||||
# --- Integration Test: IPC Weight Transfer Between Ray Tasks ---
|
||||
|
||||
|
||||
def get_physical_gpu_id(device_index: int = 0) -> str:
    """Return the physical GPU UUID string for the given CUDA device."""
    return str(torch.cuda.get_device_properties(device_index).uuid)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=0.5)
class TrainerActor:
    """Trainer actor that creates and holds CUDA IPC handles."""

    def __init__(self, tensor_shape: list[int], tensor_dtype: str):
        # Allocate the source tensor on the GPU. It is stored on self so it
        # stays alive for as long as the actor does — CUDA IPC handles are
        # only valid while the exporting tensor's memory is alive.
        torch_dtype = getattr(torch, tensor_dtype)
        self.tensor = torch.ones(tensor_shape, dtype=torch_dtype, device="cuda:0")
        # Fill with 42 so the receiver can verify the transfer was correct.
        self.tensor.fill_(42.0)

        handle = reduce_tensor(self.tensor)
        device_uuid = get_physical_gpu_id(0)

        torch.cuda.synchronize()

        self.ipc_handle_dict = {
            "ipc_handle": handle,
            "gpu_uuid": device_uuid,
            "shape": tensor_shape,
            "dtype": tensor_dtype,
        }

    def get_ipc_handle_dict(self) -> dict:
        """Return IPC handle dict. Tensor stays alive in this actor."""
        return self.ipc_handle_dict
|
||||
|
||||
|
||||
@ray.remote(num_gpus=0.5)
def inference_receive_ipc_tensor(
    ipc_handle_dict: dict,
    mode: str = "ray",
) -> dict:
    """Inference task that receives a tensor via IPCWeightTransferEngine.

    Args:
        ipc_handle_dict: Dict produced by TrainerActor with keys
            "ipc_handle", "gpu_uuid", "shape", and "dtype".
        mode: "ray" passes ipc_handles directly in the update dict;
            "http" pickles + base64-encodes them to exercise the
            ipc_handles_pickled path.

    Returns:
        Dict with "success" (bool), "received_shape", and "received_sum".
    """
    # All dependencies are imported locally so the task body is
    # self-contained when executed in a fresh Ray worker process.
    # FIX: base64, pickle, and IPCWeightTransferInitInfo were used below
    # but missing from these local imports (they were silently taken from
    # module scope); import them here for consistency.
    import base64
    import pickle
    from unittest.mock import MagicMock

    import torch

    from vllm.config.parallel import ParallelConfig
    from vllm.config.weight_transfer import WeightTransferConfig
    from vllm.distributed.weight_transfer.ipc_engine import (
        IPCWeightTransferEngine,
        IPCWeightTransferInitInfo,
    )

    # Create engine with mock parallel config (single-rank setup).
    config = WeightTransferConfig(backend="ipc")
    parallel_config = MagicMock(spec=ParallelConfig)
    parallel_config.rank = 0
    parallel_config.world_size = 1
    parallel_config.data_parallel_rank = 0

    engine = IPCWeightTransferEngine(config, parallel_config)

    # Initialize the engine (no-op for IPC).
    init_info = IPCWeightTransferInitInfo()
    engine.init_transfer_engine(init_info)

    # Receive weights with a no-op load_weights that captures the tensor.
    received_tensors = []

    def noop_load_weights(weights: list[tuple[str, torch.Tensor]]):
        for name, tensor in weights:
            # Clone tensor so it survives after the engine releases the
            # IPC-mapped memory.
            received_tensors.append((name, tensor.clone()))

    # Build the update dict and go through parse_update_info, which
    # exercises IPCWeightTransferUpdateInfo.__post_init__.
    ipc_handles = [{ipc_handle_dict["gpu_uuid"]: ipc_handle_dict["ipc_handle"]}]

    if mode == "ray":
        update_dict: dict = {
            "names": ["test.weight"],
            "dtype_names": [ipc_handle_dict["dtype"]],
            "shapes": [ipc_handle_dict["shape"]],
            "ipc_handles": ipc_handles,
        }
    elif mode == "http":
        pickled = base64.b64encode(pickle.dumps(ipc_handles)).decode("utf-8")
        update_dict = {
            "names": ["test.weight"],
            "dtype_names": [ipc_handle_dict["dtype"]],
            "shapes": [ipc_handle_dict["shape"]],
            "ipc_handles_pickled": pickled,
        }
    else:
        raise ValueError(f"Unknown mode: {mode}")

    update_info = engine.parse_update_info(update_dict)
    engine.receive_weights(update_info, noop_load_weights)
    torch.cuda.synchronize()

    # Verify we received exactly one tensor with the expected shape and
    # contents (the trainer fills it with 42s).
    success = False
    received_shape = None
    received_sum = None

    if len(received_tensors) == 1:
        name, tensor = received_tensors[0]
        received_shape = list(tensor.shape)
        received_sum = tensor.sum().item()
        if received_shape == ipc_handle_dict["shape"]:
            expected_sum = 42.0 * torch.tensor(ipc_handle_dict["shape"]).prod().item()
            if abs(received_sum - expected_sum) < 0.01:
                success = True

    engine.shutdown()

    return {
        "success": success,
        "received_shape": received_shape,
        "received_sum": received_sum,
    }
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    torch.cuda.device_count() < 1,
    reason="Need at least 1 GPU to run IPC weight transfer test.",
)
@pytest.mark.parametrize("mode", ["ray", "http"])
def test_ipc_weight_transfer_between_processes(mode: str):
    """Test IPC weight transfer from trainer to inference process using Ray.

    Parametrized over transport modes:
    - 'ray': ipc_handles passed directly.
    - 'http': ipc_handles pickled + base64-encoded, unpickled via __post_init__.

    IPC requires same-GPU access, so we use a placement group to co-locate
    the trainer actor and inference task on the same GPU.
    """
    from ray.util.placement_group import placement_group, remove_placement_group
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

    ray.init(ignore_reinit_error=True)

    # Create a placement group to ensure both processes are on the same GPU.
    # Use fractional GPUs so both tasks can share the same GPU bundle.
    pg = placement_group([{"GPU": 1, "CPU": 2}])
    ray.get(pg.ready())

    trainer_actor = None
    try:
        scheduling_strategy = PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        )

        # Tensor to transfer: 100x100 filled with 42s
        tensor_shape = [100, 100]
        tensor_dtype = "float32"

        # Create trainer actor that holds the tensor and IPC handle (stays alive)
        trainer_actor = TrainerActor.options(  # type: ignore[attr-defined]
            scheduling_strategy=scheduling_strategy
        ).remote(tensor_shape, tensor_dtype)

        # Get IPC handle dict (tensor stays alive in trainer actor)
        ipc_handle_dict = ray.get(trainer_actor.get_ipc_handle_dict.remote())

        # Receive tensor in inference process using IPC handles (on same GPU)
        # Trainer actor stays alive during this operation
        inference_result = ray.get(
            inference_receive_ipc_tensor.options(
                scheduling_strategy=scheduling_strategy
            ).remote(ipc_handle_dict, mode=mode)
        )

        assert inference_result["success"], (
            f"IPC weight transfer failed (mode={mode}). "
            f"Received shape: {inference_result['received_shape']}, "
            f"Received sum: {inference_result['received_sum']}"
        )
    finally:
        # FIX: previously neither the actor nor the placement group was
        # released, so the PG kept holding the whole GPU bundle. On a
        # single-GPU machine the second parametrized run would then hang
        # in pg.ready() waiting for a GPU that is never freed.
        if trainer_actor is not None:
            ray.kill(trainer_actor)
        remove_placement_group(pg)
|
||||
|
||||
|
||||
def test_ipc_receive_weights_missing_gpu_uuid_raises():
    """Test that receive_weights raises if GPU UUID not found in IPC handles."""
    if torch.cuda.device_count() < 1:
        pytest.skip("Need at least 1 GPU for this test")

    engine = IPCWeightTransferEngine(
        WeightTransferConfig(backend="ipc"), create_mock_parallel_config()
    )

    # Build an IPC handle keyed by a UUID that cannot match this GPU.
    source_tensor = torch.ones(10, 10, device="cuda:0")
    handle = reduce_tensor(source_tensor)
    bogus_handles = [{"wrong-uuid-12345": handle}]

    update_info = IPCWeightTransferUpdateInfo(
        names=["w"],
        dtype_names=["float32"],
        shapes=[[10, 10]],
        ipc_handles=bogus_handles,
    )

    with pytest.raises(ValueError, match="IPC handle not found"):
        engine.receive_weights(update_info, lambda x: None)
|
||||
|
||||
Reference in New Issue
Block a user