[V1] [Hybrid] Enable piecewise CUDA Graph for mamba layers (#21194)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-07-19 21:27:21 +02:00
parent 9f414a12ad
commit 881e3cbe3b
10 changed files with 100 additions and 31 deletions
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -104,7 +104,6 @@ def test_models(
                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
            with vllm_runner(model,
                             max_num_seqs=MAX_NUM_SEQS,
-                             enforce_eager=True,
                             enable_prefix_caching=False) as vllm_model:
                vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                    example_prompts, max_tokens, num_logprobs)