[Performance] Split FlashAttn attention and cache update (#25954)

Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
This commit is contained in:
ElizaWszola
2026-01-24 02:28:06 +01:00
committed by GitHub
parent 0118cdcc02
commit a28b94e6ef
21 changed files with 458 additions and 68 deletions

View File

@@ -130,6 +130,18 @@ def try_get_attention_backend(
raise AssertionError("unreachable") from None
def try_backend_includes_kv_cache_update(
    backend: AttentionBackendEnum,
) -> bool:
    """Return whether the backend's forward pass performs the KV-cache update
    itself (its class attribute ``forward_includes_kv_cache_update``),
    skipping the current test if the backend cannot be imported."""
    try:
        backend_class = backend.get_class()
        return backend_class.forward_includes_kv_cache_update
    except ImportError as e:
        # Backend not installed/buildable in this environment: skip, don't fail.
        pytest.skip(f"{backend.name} not available: {e}")
    # pytest.skip() raises, so control never reaches here; this keeps the
    # declared `-> bool` contract explicit for type checkers.
    raise AssertionError("unreachable") from None
def create_standard_kv_cache_spec(vllm_config: VllmConfig) -> FullAttentionSpec:
"""Create a FullAttentionSpec from ModelParams only."""
return FullAttentionSpec(