[Encoder decoder] Add cuda graph support during decoding for encoder-decoder models (#7631)

sroy745
2024-09-17 07:35:01 -07:00
committed by GitHub
parent 1b6de8352b
commit 1009e93c5d
15 changed files with 525 additions and 111 deletions

```diff
@@ -88,7 +88,9 @@ class LLM:
             to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
-            to eager mode.
+            to eager mode. Additionally for encoder-decoder models, if the
+            sequence length of the encoder input is larger than this, we fall
+            back to the eager mode.
         disable_custom_all_reduce: See ParallelConfig
         **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
             :ref:`engine_args`)
```
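The docstring change above can be summarized as a simple predicate. The sketch below is illustrative only (the helper name and signature are not vLLM's actual code): a captured CUDA graph serves a sequence only if the decoder context length fits within `max_seq_len_to_capture`, and, for encoder-decoder models, the encoder input length fits as well; otherwise execution falls back to eager mode.

```python
from typing import Optional


def use_cuda_graph(context_len: int,
                   max_seq_len_to_capture: int,
                   encoder_input_len: Optional[int] = None) -> bool:
    """Illustrative sketch: can a captured CUDA graph serve this sequence?

    Pass encoder_input_len=None for decoder-only models; for
    encoder-decoder models, the encoder input must also fit within the
    captured size.
    """
    if context_len > max_seq_len_to_capture:
        return False  # decoder context too long -> eager mode
    if (encoder_input_len is not None
            and encoder_input_len > max_seq_len_to_capture):
        return False  # encoder input too long -> eager mode
    return True
```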
```diff
@@ -137,9 +139,7 @@ class LLM:
             LLM constructor.
         Note: if enforce_eager is unset (enforce_eager is None)
-        it defaults to False for decoder-only models and True
-        for encoder/decoder models, since encoder/decoder models
-        do not currently support CUDAGraph.
+        it defaults to False.
         '''
         if "disable_log_stats" not in kwargs:
```
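The second hunk changes how an unset `enforce_eager` is resolved. A minimal sketch of the before/after behavior, with a hypothetical helper name (not vLLM's actual code): before this commit, `None` resolved to `True` for encoder/decoder models because they had no CUDA graph support; after it, `None` uniformly resolves to `False`.

```python
from typing import Optional


def resolve_enforce_eager(enforce_eager: Optional[bool],
                          is_encoder_decoder: bool,
                          after_this_commit: bool = True) -> bool:
    """Illustrative: resolve the LLM constructor's enforce_eager default."""
    if enforce_eager is not None:
        return enforce_eager  # explicit user choice always wins
    if after_this_commit:
        return False  # CUDA graphs now supported for all model types
    # Old behavior: force eager mode for encoder/decoder models
    return is_encoder_decoder
```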