Optimize model execution with CUDA graph (#1926)

Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Author: Woosuk Kwon
Date: 2023-12-16 21:12:08 -08:00
Committed by: GitHub
Parent: eed74a558f
Commit: 37ca558103
34 changed files with 557 additions and 254 deletions

@@ -33,6 +33,8 @@ class EngineArgs:
     revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    enforce_eager: bool = False
+    max_context_len_to_capture: int = 8192
 
     def __post_init__(self):
         if self.tokenizer is None:
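
The new `enforce_eager` and `max_context_len_to_capture` fields control whether decoding runs through captured CUDA graphs. For background, a CUDA graph records a fixed sequence of kernel launches once and replays it with near-zero CPU launch overhead. A minimal, generic PyTorch capture/replay sketch (not this commit's capture code; the model and shapes are placeholders):

```python
import torch

# Placeholder model and static input buffer. CUDA graphs require fixed
# shapes and stable tensor addresses between capture and replay.
model = torch.nn.Linear(4096, 4096).cuda().eval()
static_input = torch.zeros(8, 4096, device="cuda")

# Warm up on a side stream before capture, per PyTorch's recommendation.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    with torch.no_grad():
        static_output = model(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one forward pass into a graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    with torch.no_grad():
        static_output = model(static_input)

# Replay: copy new data into the captured input buffer, then replay the
# recorded kernels without re-launching them one by one from Python.
static_input.copy_(torch.randn(8, 4096, device="cuda"))
graph.replay()
result = static_output.clone()
```

The static-shape constraint is why capture is limited to a bounded context length: graphs can only be pre-captured for a fixed set of shapes.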
@@ -182,6 +184,17 @@ class EngineArgs:
                             choices=['awq', 'gptq', 'squeezellm', None],
                             default=None,
                             help='Method used to quantize the weights')
+        parser.add_argument('--enforce-eager',
+                            action='store_true',
+                            help='Always use eager-mode PyTorch. If False, '
+                            'will use eager mode and CUDA graph in hybrid '
+                            'for maximal performance and flexibility.')
+        parser.add_argument('--max-context-len-to-capture',
+                            type=int,
+                            default=EngineArgs.max_context_len_to_capture,
+                            help='maximum context length covered by CUDA '
+                            'graphs. When a sequence has context length '
+                            'larger than this, we fall back to eager mode.')
         return parser
 
     @classmethod
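
Assuming the `vllm.LLM` entry point forwards these engine args (as the `EngineArgs` fields suggest), the new knobs could be exercised as follows; the model name and values are illustrative:

```python
from vllm import LLM, SamplingParams

# Hybrid eager + CUDA-graph execution is the default (enforce_eager=False).
# Sequences whose context length exceeds max_context_len_to_capture fall
# back to eager mode instead of graph replay.
llm = LLM(
    model="facebook/opt-125m",      # illustrative model
    enforce_eager=False,
    max_context_len_to_capture=8192,
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```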
@@ -200,7 +213,8 @@ class EngineArgs:
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
                                    self.tokenizer_revision, self.max_model_len,
-                                   self.quantization)
+                                   self.quantization, self.enforce_eager,
+                                   self.max_context_len_to_capture)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space,
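
The help text above implies a simple dispatch rule at runtime. A hypothetical sketch of that rule (`can_use_cuda_graph` is an illustrative name, not this commit's internal API):

```python
def can_use_cuda_graph(context_len: int,
                       enforce_eager: bool,
                       max_context_len_to_capture: int) -> bool:
    """Decide whether a captured CUDA graph may serve this sequence."""
    if enforce_eager:
        return False  # --enforce-eager disables graph replay entirely.
    # Graphs are only captured up to a maximum context length; longer
    # sequences run in eager mode.
    return context_len <= max_context_len_to_capture
```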