[ROCm] AITER fused RoPE+KVCache (#33443)

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Co-authored-by: charlifu <charlifu@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>

Author: Rohan Potdar
Date: 2026-02-23 21:06:00 -06:00 (committed by GitHub)
Commit: 2ff4e51152 (parent 95642441d0)
19 changed files with 1211 additions and 83 deletions


@@ -723,6 +723,33 @@ class AttentionImpl(AttentionImplBase[T], Generic[T]):
        """
        return False

    def fused_rope_kvcache_supported(self):
        """
        Does this attention implementation support RoPE+KVCache fusion.
        This is used by the RopeKVCacheFusionPass to only fuse the RoPE ops
        with the KV cache update for implementations that support it.
        """
        return False

    def do_rope_and_kv_cache_update(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        positions: torch.Tensor,
        cos_sin_cache: torch.Tensor,
        is_neox: bool,
        kv_cache: torch.Tensor,
        layer_slot_mapping: torch.Tensor,
    ):
        """
        If `fused_rope_kvcache_supported` returns True, this method will be
        called by torch.ops.vllm.fused_rope_and_unified_kv_cache_update
        to perform the inplace RoPE and KV cache update.
        """
        raise NotImplementedError


class MLAAttentionImpl(AttentionImplBase[T], Generic[T]):
    """MLA attention implementation with forward_mqa and forward_mha methods."""