[perf][cpu] Accelerate paged attention GEMMs (QK, PV) on Arm CPUs with NEON (#29193)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
Fadi Arafeh
2025-11-22 17:04:36 +00:00
committed by GitHub
parent f55c76c2b3
commit 730bd35378
5 changed files with 416 additions and 5 deletions

View File

@@ -14,7 +14,7 @@
#include "utils.hpp"
namespace cpu_attention {
// Instruction-set variants; used as the `isa` template parameter of
// AttentionImpl below.
// NOTE(review): this looks like a diff hunk with its +/- markers stripped —
// presumably the first line is the old definition and the second is the new
// one that adds NEON, so only one of them exists in the real header. Confirm
// against the actual file.
enum class ISA { AMX, VEC, VEC16 };
enum class ISA { AMX, VEC, VEC16, NEON };
// Primary template, parameterized by the ISA variant, the element type, and
// the head dimension. It has no members here; presumably the usable
// implementations are specializations defined elsewhere — confirm.
template <ISA isa, typename scalar_t, int64_t head_dim>
class AttentionImpl {};
@@ -143,6 +143,12 @@ struct AttentionMetadata {
case ISA::VEC:
ss << "VEC, ";
break;
case ISA::VEC16:
ss << "VEC16, ";
break;
case ISA::NEON:
ss << "NEON, ";
break;
}
ss << "workitem_group_num: " << workitem_group_num
<< ", reduction_item_num: " << reduction_item_num