[AMD][CI] Fix test_custom_allreduce for A100 testgroup (#34735)

Signed-off-by: Ryan Rock <ryan.rock@amd.com>
Author: Ryan Rock
Date: 2026-02-20 15:33:04 -06:00 (committed by GitHub)
Parent: aaefc58ee0
Commit: 0632ed8778

@@ -33,6 +33,7 @@ def graph_allreduce(
 ):
     with monkeypatch.context() as m:
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
@@ -92,6 +93,7 @@ def eager_allreduce(
 ):
     with monkeypatch.context() as m:
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
         init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
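
For context, the change clears HIP_VISIBLE_DEVICES alongside CUDA_VISIBLE_DEVICES: on ROCm builds, HIP_VISIBLE_DEVICES masks GPUs the same way CUDA_VISIBLE_DEVICES does on NVIDIA, so a value leaked in from the CI environment could hide devices from the spawned worker ranks. A minimal sketch of the pattern under test, assuming pytest's monkeypatch fixture; the worker function name and signature here are hypothetical, not taken from the vLLM test file:

# Minimal sketch (hypothetical worker; only monkeypatch/torch usage
# mirrors the change above).
import torch


def worker(monkeypatch, rank: int) -> None:
    with monkeypatch.context() as m:
        # Clear both device masks so neither an NVIDIA (CUDA) nor an
        # AMD/ROCm (HIP) CI setting hides GPUs from this worker rank;
        # raising=False tolerates the variable being unset.
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
        # ROCm builds of PyTorch also expose devices under the "cuda"
        # namespace, so the same device string works on both vendors.
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)

Using monkeypatch.context() keeps the deletion scoped to the block, so the environment is restored when the test worker exits.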