[Performance] Split FlashAttn attention and cache update (#25954)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Luka Govedič <luka.govedic@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
@@ -130,6 +130,18 @@ def try_get_attention_backend(
     raise AssertionError("unreachable") from None
 
 
+def try_backend_includes_kv_cache_update(
+    backend: AttentionBackendEnum,
+) -> bool:
+    """Return whether the backend's forward pass includes the KV cache update."""
+    try:
+        backend_class = backend.get_class()
+        return backend_class.forward_includes_kv_cache_update
+    except ImportError as e:
+        pytest.skip(f"{backend.name} not available: {e}")
+    raise AssertionError("unreachable") from None
+
+
 def create_standard_kv_cache_spec(vllm_config: VllmConfig) -> FullAttentionSpec:
     """Create a FullAttentionSpec from ModelParams only."""
     return FullAttentionSpec(
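For context, a minimal sketch of how a test might consume the new helper. Only try_backend_includes_kv_cache_update and the forward_includes_kv_cache_update flag come from this diff; the test name, the parametrization, the FLASH_ATTN member, and the import path are assumptions for illustration, not part of the commit.

# Sketch only: import path and enum member below are assumed, not from this diff.
import pytest

from vllm.attention.backends.registry import AttentionBackendEnum


@pytest.mark.parametrize("backend", [AttentionBackendEnum.FLASH_ATTN])
def test_separate_kv_cache_update(backend: AttentionBackendEnum):
    if try_backend_includes_kv_cache_update(backend):
        # forward() already writes the KV cache, so there is no split path to test.
        pytest.skip(f"{backend.name} updates the KV cache inside forward()")
    # ... exercise the attention path with a separately issued cache update ...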