[Core] Subclass ModelRunner to support cross-attention & encoder sequences (towards eventual encoder/decoder model support) (#4942)

Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-06 16:51:47 -04:00
parent 660470e5a3
commit fd95e026e0
33 changed files with 3957 additions and 333 deletions
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -211,5 +211,5 @@ def test_varlen_with_paged_kv(
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    assert torch.allclose(output, ref_output, atol=2e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"