[perf][cpu] Accelerate paged attention GEMMs (QK, PV) on Arm CPUs with NEON (#29193)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
This commit is contained in:
Fadi Arafeh
2025-11-22 17:04:36 +00:00
committed by GitHub
parent f55c76c2b3
commit 730bd35378
5 changed files with 416 additions and 5 deletions

View File

@@ -1392,11 +1392,10 @@ class EngineArgs:
# Set default arguments for V1 Engine.
self._set_default_args(usage_context, model_config)
# Disable chunked prefill and prefix caching for:
-# POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
+# POWER (ppc64le)/s390x/RISCV CPUs in V1
if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
CpuArchEnum.POWERPC,
CpuArchEnum.S390X,
-CpuArchEnum.ARM,
CpuArchEnum.RISCV,
):
logger.info(

View File

@@ -25,7 +25,7 @@ from vllm.v1.kv_cache_interface import AttentionSpec
logger = init_logger(__name__)
-_CPU_ARCH_PREFER_MIXED_BATCH = (CpuArchEnum.X86,)
+_CPU_ARCH_PREFER_MIXED_BATCH = (CpuArchEnum.X86, CpuArchEnum.ARM)
class CPUAttentionBackend(AttentionBackend):
@@ -491,6 +491,9 @@ def _get_attn_isa(dtype: torch.dtype, block_size: int) -> str:
if supports_amx and dtype in (torch.bfloat16,) and block_size % 32 == 0:
return "amx"
elif block_size % 32 == 0:
-return "vec"
+if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+return "neon"
+else:
+return "vec"
else:
return "vec16"