[V0 Deprecation] Remove max_seq_len_to_capture (#25543)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -177,11 +177,6 @@ class ModelConfig:
     graph and always execute the model in eager mode. If False, we will use
     CUDA graph and eager execution in hybrid for maximal performance and
     flexibility."""
-    max_seq_len_to_capture: int = 8192
-    """Maximum sequence len covered by CUDA graphs. When a sequence has context
-    length larger than this, we fall back to eager mode. Additionally for
-    encoder-decoder models, if the sequence length of the encoder input is
-    larger than this, we fall back to the eager mode."""
     max_logprobs: int = 20
     """Maximum number of log probabilities to return when `logprobs` is
     specified in `SamplingParams`. The default value comes from the default for the
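
For context, the removed field was also a user-facing engine argument. The sketch below shows how it was used before this change; the `compilation_config` route shown afterwards is an assumption about the V1 engine (where CUDA graph capture is keyed on batch size rather than sequence length), not something this diff introduces, and the model name is an arbitrary example.

from vllm import LLM

# Before this change (pre-#25543 signature):
#     llm = LLM(model="facebook/opt-125m", max_seq_len_to_capture=4096)
# capped CUDA graph capture at 4096 tokens; longer sequences ran eagerly.

# After this change the knob is gone. Disabling CUDA graphs entirely
# still works:
llm = LLM(model="facebook/opt-125m", enforce_eager=True)

# Assumed V1-era replacement: tune the capture sizes (batch sizes, not
# sequence lengths) through the compilation config.
llm = LLM(
    model="facebook/opt-125m",
    compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8]},
)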
@@ -1024,21 +1019,8 @@ class ModelConfig:
         current_platform.verify_quantization(self.quantization)

-    def _verify_cuda_graph(self) -> None:
-        # The `max_seq_len_to_capture` was incorrectly
-        # based on the encoder's input length (448)
-        # but not the decoder's larger input length (1500).
-        # This change ensures the CUDA Graph captures the correct,
-        # larger sequence length, allowing it to work as intended.
-        effective_max_seq_len = self.max_model_len
-        if self.is_encoder_decoder:
-            effective_max_seq_len = max(
-                effective_max_seq_len,
-                getattr(self.hf_config, "max_source_positions", 0))
-        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
-                                          effective_max_seq_len)
         # CUDAGraph capture not supported for encoder-decoder models on ROCm
         unsupported_rocm = self.is_encoder_decoder

         if (unsupported_rocm and not self.enforce_eager
                 and current_platform.is_rocm()):
             logger.warning(
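
The deleted lines clamped the CUDA graph capture length against the model's effective maximum. A self-contained sketch of that rule, with a hypothetical free function name (`clamp_capture_len`) introduced only for illustration:

def clamp_capture_len(max_seq_len_to_capture: int,
                      max_model_len: int,
                      is_encoder_decoder: bool,
                      max_source_positions: int = 0) -> int:
    """Clamp the capture length as the removed code did: never capture
    beyond the model's max length, but let encoder-decoder models also
    account for hf_config.max_source_positions."""
    effective_max_seq_len = max_model_len
    if is_encoder_decoder:
        # e.g. a model whose max_source_positions (1500) exceeds its
        # max_model_len (448): graphs must cover up to 1500, not 448.
        effective_max_seq_len = max(effective_max_seq_len,
                                    max_source_positions)
    return min(max_seq_len_to_capture, effective_max_seq_len)

assert clamp_capture_len(8192, 4096, is_encoder_decoder=False) == 4096
assert clamp_capture_len(8192, 448, is_encoder_decoder=True,
                         max_source_positions=1500) == 1500

With the field removed, this clamping has no remaining consumer, which is why only the ROCm fallback warning survives as context in the hunk above.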