Add --max-model-len auto to auto-fit context to available memory (#29431)

Signed-off-by: mgoin <mgoin64@gmail.com>
Author: Michael Goin (committed by GitHub)
Date: 2025-12-24 00:37:14 -05:00
Parent: d7e05ac743
Commit: 8ee90c83f8
7 changed files with 313 additions and 31 deletions
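For context, a minimal usage sketch. The -1 sentinel and the CLI spelling --max-model-len auto come from this commit; the model name below is an illustrative assumption, and this assumes the offline Python API accepts the same sentinel the config check in this diff handles.

# Offline API sketch: pass -1 to auto-fit, mirroring `--max-model-len auto`.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model
    max_model_len=-1,  # auto-fit: full context if it fits, else the largest that does
)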


@@ -172,7 +172,10 @@ class ModelConfig:
     format. Examples:\n
     - 1k -> 1000\n
     - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
+    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
+    GPU memory. This will use the model's maximum context length if it fits,
+    otherwise it will find the largest length that can be accommodated."""
     spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
     quantization: QuantizationMethods | str | None = None
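To make the documented format concrete, here is a small illustrative parser (not vLLM's actual code) covering exactly the cases the docstring lists: lowercase k as decimal, uppercase K as binary, and -1/'auto' as the auto-fit sentinel.

def parse_max_model_len(value: str) -> int:
    # Illustrative sketch, not vLLM's implementation.
    value = value.strip()
    if value.lower() == "auto":
        return -1  # sentinel meaning "auto-fit to available GPU memory"
    multipliers = {"k": 1_000, "K": 1_024}  # only k/K appear in the docstring examples
    if value and value[-1] in multipliers:
        return int(float(value[:-1]) * multipliers[value[-1]])
    return int(value)  # plain integers, including -1

assert parse_max_model_len("1k") == 1_000
assert parse_max_model_len("1K") == 1_024
assert parse_max_model_len("25.6k") == 25_600
assert parse_max_model_len("auto") == -1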
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
     if encoder_config and "max_seq_length" in encoder_config:
         derived_max_model_len = encoder_config["max_seq_length"]
-    # If the user didn't specify `max_model_len`, then use that derived from
-    # the model config as a default value.
-    if max_model_len is None:
+    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
+    # then use that derived from the model config as a default value.
+    # When -1 is specified, the engine will later auto-fit to available memory.
+    if max_model_len is None or max_model_len == -1:
         # For LongRoPE, default to original_max_position_embeddings to avoid
         # performance degradation for shorter sequences
         if rope_parameters is not None and any(
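The new comment defers the actual fitting to a later step in engine startup. As a rough sketch of what that step must compute (names and signature are illustrative, not vLLM internals): divide the KV-cache memory budget by the per-token KV footprint and cap the result at the model's derived maximum, matching the docstring's "use the full context if it fits, otherwise the largest length that can be accommodated".

def auto_fit_max_model_len(
    derived_max_model_len: int,
    kv_cache_bytes_available: int,
    kv_cache_bytes_per_token: int,
) -> int:
    # Illustrative only: largest token count whose KV cache fits the budget,
    # capped at the model's own maximum context length.
    fits = kv_cache_bytes_available // kv_cache_bytes_per_token
    return min(derived_max_model_len, max(fits, 1))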