[Kernel][Perf] fuse QK Norm and RoPE into one cuda kernel for Qwen Model (#27165)

Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
This commit is contained in:
zhrrr
2025-11-12 01:00:31 +08:00
committed by GitHub
parent a7ef3eb0cd
commit 68c09efc37
16 changed files with 1243 additions and 38 deletions

View File

@@ -44,6 +44,10 @@ def empty_i32(*args, **kwargs):
return torch.empty(*args, **kwargs, dtype=torch.int32, device="cuda")
def empty_i64(*args, **kwargs):
return torch.empty(*args, **kwargs, dtype=torch.int64, device="cuda")
RMS_OP = torch.ops._C.rms_norm.default
RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default