[Encoder decoder] Add cuda graph support during decoding for encoder-decoder models (#7631)

This commit is contained in:
sroy745
2024-09-17 07:35:01 -07:00
committed by GitHub
parent 1b6de8352b
commit 1009e93c5d
15 changed files with 525 additions and 111 deletions

View File

@@ -472,7 +472,10 @@ class EngineArgs:
default=EngineArgs.max_seq_len_to_capture,
help='Maximum sequence length covered by CUDA '
'graphs. When a sequence has context length '
-'larger than this, we fall back to eager mode.')
+'larger than this, we fall back to eager mode. '
+'Additionally for encoder-decoder models, if the '
+'sequence length of the encoder input is larger '
+'than this, we fall back to the eager mode.')
parser.add_argument('--disable-custom-all-reduce',
action='store_true',
default=EngineArgs.disable_custom_all_reduce,