[CPU Backend] [Perf] Accelerate tensor-parallel/data-parallel inference across NUMA domains on Arm (#32792)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-01-22 18:55:23 +00:00
parent 300622e609
commit 744ef30484
6 changed files with 164 additions and 6 deletions
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -29,7 +29,10 @@ class CpuCommunicator(DeviceCommunicatorBase):
        self.dist_module = torch.distributed

        if (
-            (current_platform.get_cpu_architecture() == CpuArchEnum.X86)
+            (
+                current_platform.get_cpu_architecture() == CpuArchEnum.X86
+                or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+            )
            and hasattr(torch.ops._C, "init_shm_manager")
            and (unique_name.startswith("tp") or unique_name.startswith("pp"))
        ):