[Core] Allow full cudagraph with separate attention routines and orthogonal to compilation, add support for FA2 and FlashInfer (#20059)

Signed-off-by: fhl <2410591650@qq.com> Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2025-08-15 22:01:39 +08:00
parent a0632a3e03
commit 74f441f4b5
34 changed files with 1839 additions and 597 deletions
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -18,9 +18,9 @@ from torch.library import Library

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
-                         set_current_vllm_config)
-from vllm.forward_context import set_forward_context
+from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
+                         VllmConfig, set_current_vllm_config)
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import direct_register_custom_op

 # create a library to hold the custom op
@@ -276,9 +276,11 @@ def run_model(llama_config,
        )
        if split_attn:
            compilation_config.splitting_ops = ["silly.attention"]
+        cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
    else:
        compilation_config = CompilationConfig(
            level=CompilationLevel.NO_COMPILATION, )
+        cudagraph_runtime_mode = CUDAGraphMode.NONE

    vllm_config = VllmConfig(compilation_config=compilation_config,
                             additional_config=llama_config)
@@ -287,17 +289,37 @@ def run_model(llama_config,
                           vllm_config=vllm_config,
                           prefix="").eval().cuda()

-    with set_forward_context({}, vllm_config=vllm_config):
+    with set_forward_context({},
+                             vllm_config=vllm_config):  # background context
        B = 16  # max batch size
        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
        positions = torch.arange(B).cuda()

+        # warmup for the model with cudagraph_mode NONE
        model(input_ids, positions)
-        model(input_ids[:2], positions[:2])
-        model(input_ids[:1], positions[:1])
+
+        # simulate cudagraphs capturing
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            model(input_ids[:2], positions[:2])
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=1, )):
+            model(input_ids[:1], positions[:1])

        input_ids[:2].zero_()
-        output = model(input_ids[:2], positions[:2])
+        # simulate cudagraphs replay
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            output = model(input_ids[:2], positions[:2])

        output = output.cpu()