[Misc] Remove deprecated arg for cuda graph capture (#9864)
Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -126,7 +126,6 @@ class EngineArgs:
|
||||
tokenizer_revision: Optional[str] = None
|
||||
quantization: Optional[str] = None
|
||||
enforce_eager: Optional[bool] = None
|
||||
-max_context_len_to_capture: Optional[int] = None
|
||||
max_seq_len_to_capture: int = 8192
|
||||
disable_custom_all_reduce: bool = False
|
||||
tokenizer_pool_size: int = 0
|
||||
@@ -504,14 +503,6 @@ class EngineArgs:
|
||||
help='Always use eager-mode PyTorch. If False, '
|
||||
'will use eager mode and CUDA graph in hybrid '
|
||||
'for maximal performance and flexibility.')
|
||||
-parser.add_argument('--max-context-len-to-capture',
|
||||
-type=int,
|
||||
-default=EngineArgs.max_context_len_to_capture,
|
||||
-help='Maximum context length covered by CUDA '
|
||||
-'graphs. When a sequence has context length '
|
||||
-'larger than this, we fall back to eager mode. '
|
||||
-'(DEPRECATED. Use --max-seq-len-to-capture instead'
|
||||
-')')
|
||||
parser.add_argument('--max-seq-len-to-capture',
|
||||
type=int,
|
||||
default=EngineArgs.max_seq_len_to_capture,
|
||||
@@ -939,7 +930,6 @@ class EngineArgs:
|
||||
quantization=self.quantization,
|
||||
quantization_param_path=self.quantization_param_path,
|
||||
enforce_eager=self.enforce_eager,
|
||||
-max_context_len_to_capture=self.max_context_len_to_capture,
|
||||
max_seq_len_to_capture=self.max_seq_len_to_capture,
|
||||
max_logprobs=self.max_logprobs,
|
||||
disable_sliding_window=self.disable_sliding_window,
|
||||
|
||||
Reference in New Issue
Block a user