Add --max-model-len auto to auto-fit context to available memory (#29431)

Signed-off-by: mgoin <mgoin64@gmail.com>
Author: Michael Goin (committed by GitHub)
Date: 2025-12-24 00:37:14 -05:00
Parent: d7e05ac743
Commit: 8ee90c83f8
7 changed files with 313 additions and 31 deletions
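For context, a minimal usage sketch. The -1 sentinel and the CLI spelling --max-model-len auto come from this commit; the model name below is an illustrative assumption, and this assumes the offline Python API accepts the same sentinel the config check in this diff handles.

# Offline API sketch: pass -1 to auto-fit, mirroring `--max-model-len auto`.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model
    max_model_len=-1,  # auto-fit: full context if it fits, else the largest that does
)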


@@ -172,7 +172,10 @@ class ModelConfig:
     format. Examples:\n
     - 1k -> 1000\n
     - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
+    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
+    GPU memory. This will use the model's maximum context length if it fits,
+    otherwise it will find the largest length that can be accommodated."""
     spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
     quantization: QuantizationMethods | str | None = None
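To make the documented format concrete, here is a small illustrative parser (not vLLM's actual code) covering exactly the cases the docstring lists: lowercase k as decimal, uppercase K as binary, and -1/'auto' as the auto-fit sentinel.

def parse_max_model_len(value: str) -> int:
    # Illustrative sketch, not vLLM's implementation.
    value = value.strip()
    if value.lower() == "auto":
        return -1  # sentinel meaning "auto-fit to available GPU memory"
    multipliers = {"k": 1_000, "K": 1_024}  # only k/K appear in the docstring examples
    if value and value[-1] in multipliers:
        return int(float(value[:-1]) * multipliers[value[-1]])
    return int(value)  # plain integers, including -1

assert parse_max_model_len("1k") == 1_000
assert parse_max_model_len("1K") == 1_024
assert parse_max_model_len("25.6k") == 25_600
assert parse_max_model_len("auto") == -1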
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
     if encoder_config and "max_seq_length" in encoder_config:
         derived_max_model_len = encoder_config["max_seq_length"]
-    # If the user didn't specify `max_model_len`, then use that derived from
-    # the model config as a default value.
-    if max_model_len is None:
+    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
+    # then use that derived from the model config as a default value.
+    # When -1 is specified, the engine will later auto-fit to available memory.
+    if max_model_len is None or max_model_len == -1:
         # For LongRoPE, default to original_max_position_embeddings to avoid
         # performance degradation for shorter sequences
         if rope_parameters is not None and any(
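The new comment defers the actual fitting to a later step in engine startup. As a rough sketch of what that step must compute (names and signature are illustrative, not vLLM internals): divide the KV-cache memory budget by the per-token KV footprint and cap the result at the model's derived maximum, matching the docstring's "use the full context if it fits, otherwise the largest length that can be accommodated".

def auto_fit_max_model_len(
    derived_max_model_len: int,
    kv_cache_bytes_available: int,
    kv_cache_bytes_per_token: int,
) -> int:
    # Illustrative only: largest token count whose KV cache fits the budget,
    # capped at the model's own maximum context length.
    fits = kv_cache_bytes_available // kv_cache_bytes_per_token
    return min(derived_max_model_len, max(fits, 1))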