[CPU Backend] [Perf] Accelerate tensor-parallel/data-parallel inference across NUMA domains on Arm (#32792)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
@@ -29,7 +29,10 @@ class CpuCommunicator(DeviceCommunicatorBase):
|
||||
self.dist_module = torch.distributed
|
||||
|
||||
if (
|
||||
- (current_platform.get_cpu_architecture() == CpuArchEnum.X86)
|
||||
+ (
|
||||
+ current_platform.get_cpu_architecture() == CpuArchEnum.X86
|
||||
+ or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
|
||||
+ )
|
||||
and hasattr(torch.ops._C, "init_shm_manager")
|
||||
and (unique_name.startswith("tp") or unique_name.startswith("pp"))
|
||||
):
|
||||
|
||||
@@ -66,6 +66,9 @@ class CPUWorker(Worker):
|
||||
self.local_omp_cpuid = self._get_autobind_cpu_ids(
|
||||
lambda cpus: cpus[-1:]
|
||||
)
|
||||
+ elif cpu_arch == CpuArchEnum.ARM:
|
||||
+ # For AArch64, no SMT
|
||||
+ self.local_omp_cpuid = self._get_autobind_cpu_ids(lambda cpus: cpus)
|
||||
else:
|
||||
self.local_omp_cpuid = "nobind"
|
||||
elif omp_cpuids == "nobind":
|
||||
|
||||
Reference in New Issue
Block a user