[misc][distributed] improve libcudart.so finding (#7127)

This commit is contained in:
youkaichao
2024-08-04 11:31:51 -07:00
committed by GitHub
parent b1c9aa3daa
commit 16a1cc9bb2
2 changed files with 25 additions and 23 deletions

View File

@@ -145,6 +145,7 @@ def can_actually_p2p(
p_tgt.start()
p_src.join()
p_tgt.join()
+assert p_src.exitcode == 0 and p_tgt.exitcode == 0
result: List[bool] = []
for src, tgt in zip(batch_src, batch_tgt):
a = result_queue.get()
@@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
# wrap raised exception to provide more information
raise RuntimeError(
f"Error happened when batch testing "
-f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+f"{returned.stderr.decode()}") from e
result = pickle.loads(returned.stdout)
for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r