[V0 Deprecation] Remove max_seq_len_to_capture (#25543)

commit 2e19a848d4 (parent 77a7fce1bb)
Author: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: 2025-09-24 01:51:39 -07:00
Committed by: GitHub
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

7 changed files with 2 additions and 48 deletions
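For context on what is going away: the `_verify_cuda_graph` logic deleted in the second hunk below clamped the capture threshold so CUDA graphs never targeted lengths the model could not reach. The following standalone sketch (not vLLM code, not part of this commit) replays that now-removed computation, using the illustrative numbers (448 and 1500) cited in the deleted comment:

# Standalone sketch of the clamp this commit deletes from
# ModelConfig._verify_cuda_graph. The numbers are the ones cited in the
# removed comment; treat them as illustrative only.
max_model_len = 448            # e.g. a short decoder context
max_source_positions = 1500    # e.g. a longer encoder input
max_seq_len_to_capture = 8192  # the removed field's default
is_encoder_decoder = True

effective_max_seq_len = max_model_len
if is_encoder_decoder:
    # Widen the bound so the encoder input is also covered.
    effective_max_seq_len = max(effective_max_seq_len, max_source_positions)

# Never capture graphs for lengths the model cannot actually see.
max_seq_len_to_capture = min(max_seq_len_to_capture, effective_max_seq_len)
print(max_seq_len_to_capture)  # -> 1500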

@@ -177,11 +177,6 @@ class ModelConfig:
     graph and always execute the model in eager mode. If False, we will use
     CUDA graph and eager execution in hybrid for maximal performance and
     flexibility."""
-    max_seq_len_to_capture: int = 8192
-    """Maximum sequence len covered by CUDA graphs. When a sequence has context
-    length larger than this, we fall back to eager mode. Additionally for
-    encoder-decoder models, if the sequence length of the encoder input is
-    larger than this, we fall back to the eager mode."""
     max_logprobs: int = 20
     """Maximum number of log probabilities to return when `logprobs` is
     specified in `SamplingParams`. The default value comes the default for the
@@ -1024,21 +1019,8 @@ class ModelConfig:
         current_platform.verify_quantization(self.quantization)
 
     def _verify_cuda_graph(self) -> None:
-        # The `max_seq_len_to_capture` was incorrectly
-        # based on the encoder's input length (448)
-        # but not the decoder's larger input length (1500).
-        # This change ensures the CUDA Graph captures the correct,
-        # larger sequence length, allowing it to work as intended.
-        effective_max_seq_len = self.max_model_len
-        if self.is_encoder_decoder:
-            effective_max_seq_len = max(
-                effective_max_seq_len,
-                getattr(self.hf_config, "max_source_positions", 0))
-        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
-                                          effective_max_seq_len)
         # CUDAGraph capture not supported for encoder-decoder models on ROCm
         unsupported_rocm = self.is_encoder_decoder
         if (unsupported_rocm and not self.enforce_eager
                 and current_platform.is_rocm()):
             logger.warning(
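
Downstream, the removal means `max_seq_len_to_capture` is no longer an accepted engine argument. A minimal usage sketch, assuming the `LLM` entrypoint forwards keyword arguments into the `ModelConfig` fields shown above (the model name is a placeholder):

from vllm import LLM

# V0-era construction: the removed kwarg capped CUDA-graph capture at
# 8192 tokens, with eager fallback for longer sequences. After this
# commit, passing it is rejected as an unknown argument.
# llm = LLM(model="facebook/opt-125m", max_seq_len_to_capture=8192)

# Post-removal: no per-length capture cap remains on ModelConfig;
# `enforce_eager` (documented in the context lines above) is still the
# switch for skipping CUDA graphs entirely.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)

In the V1 engine, CUDA-graph capture sizes are governed by the compilation config rather than a per-sequence-length field on the model config, which is why the knob could be dropped outright rather than migrated (hence the "[V0 Deprecation]" framing in the title).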