[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
tests/v1/e2e/general/test_cascade_attention.py (new file, 36 additions)
@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import LLM, SamplingParams

from ....utils import create_new_process_for_each_test


@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, attn_backend):
    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

    if attn_backend == "FLASHINFER":
        pytest.skip(
            "This test is failing with FlashInfer backend and "
            "needs investigation. See issue #25679."
        )

    llm = LLM(
        model="Qwen/Qwen2-1.5B-Instruct", attention_config={"backend": attn_backend}
    )
    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

    # No cascade attention.
    single_prompt = [example_system_message + prompt]
    responses = llm.generate(single_prompt, sampling_params)
    ref_output = responses[0].outputs[0].text

    # (Probably) Use cascade attention.
    prompts = [example_system_message + prompt] * 64
    responses = llm.generate(prompts, sampling_params)
    for response in responses:
        assert response.outputs[0].text == ref_output
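
Note: the test relies on the example_system_message pytest fixture, which is provided elsewhere in the vLLM test suite's conftest and is not part of this diff. As a rough, hypothetical sketch only (the real fixture's contents may differ), the shape the test depends on is a long system prompt shared by all requests, which is the condition under which the scheduler is likely to use cascade attention:

# Hypothetical stand-in for the example_system_message fixture used above.
# The real fixture lives in the vLLM test suite's conftest; this sketch only
# illustrates the shape the test relies on: a long system prompt shared by
# all 64 requests, so their KV-cache prefixes are identical.
import pytest


@pytest.fixture
def example_system_message() -> str:
    # Any sufficiently long shared prefix serves the purpose of this sketch.
    return (
        "<System>: You are a helpful coding assistant. "
        "Answer concisely and include runnable code where appropriate.\n"
    ) * 20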