[V1] V1 FlashInfer Attention (#16684)

Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Aurick Qiao <qiao@aurick.net>
Author: Michael Goin
Date: 2025-04-21 18:38:41 -06:00
Committed by: GitHub
Parent: 210207525e
Commit: 986537f1c3

7 changed files with 668 additions and 13 deletions

tests/v1/e2e/test_cascade_attention.py

@@ -1,13 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
+import pytest

 from vllm import LLM, SamplingParams

+from ...utils import fork_new_process_for_each_test

-def test_cascade_attention(example_system_message, monkeypatch):
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("attn_backend",
+                         ["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
+def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
     prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
         sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
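
For readers who want to try the new backend outside the test suite, here is a minimal standalone sketch of the same selection mechanism the test exercises: forcing the V1 engine and the FlashInfer backend through environment variables before constructing the LLM. It assumes a CUDA build of vLLM with FlashInfer installed; the model, prompt, and sampling parameters mirror the test above.

# Minimal sketch of selecting the V1 FlashInfer attention backend.
# Assumes a CUDA build of vLLM with FlashInfer available.
import os

# Both variables must be set before the engine is constructed so they
# take effect; the test does the same via monkeypatch.setenv.
os.environ["VLLM_USE_V1"] = "1"
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER_VLLM_V1"

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)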