[perf][cpu] Accelerate paged attention GEMMs (QK, PV) on Arm CPUs with NEON (#29193)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-11-22 17:04:36 +00:00
parent f55c76c2b3
commit 730bd35378
5 changed files with 416 additions and 5 deletions
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -14,7 +14,7 @@
 #include "utils.hpp"

 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16 };
+enum class ISA { AMX, VEC, VEC16, NEON };

 template <ISA isa, typename scalar_t, int64_t head_dim>
 class AttentionImpl {};
@@ -143,6 +143,12 @@ struct AttentionMetadata {
      case ISA::VEC:
        ss << "VEC, ";
        break;
+      case ISA::VEC16:
+        ss << "VEC16, ";
+        break;
+      case ISA::NEON:
+        ss << "NEON, ";
+        break;
    }
    ss << "workitem_group_num: " << workitem_group_num
       << ", reduction_item_num: " << reduction_item_num