[CPU Backend] [Perf] Accelerate tensor-parallel/data-parallel inference across NUMA domains on Arm (#32792)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
Fadi Arafeh
2026-01-22 18:55:23 +00:00
committed by GitHub
parent 300622e609
commit 744ef30484
6 changed files with 164 additions and 6 deletions

View File

@@ -29,7 +29,10 @@ class CpuCommunicator(DeviceCommunicatorBase):
self.dist_module = torch.distributed
if (
- (current_platform.get_cpu_architecture() == CpuArchEnum.X86)
+ (
+ current_platform.get_cpu_architecture() == CpuArchEnum.X86
+ or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+ )
and hasattr(torch.ops._C, "init_shm_manager")
and (unique_name.startswith("tp") or unique_name.startswith("pp"))
):