Add --max-model-len auto to auto-fit context to available memory (#29431)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -172,7 +172,10 @@ class ModelConfig:
|
||||
format. Examples:\n
|
||||
- 1k -> 1000\n
|
||||
- 1K -> 1024\n
|
||||
- - 25.6k -> 25,600"""
|
||||
+ - 25.6k -> 25,600\n
|
||||
+ - -1 or 'auto' -> Automatically choose the maximum model length that fits in
|
||||
+ GPU memory. This will use the model's maximum context length if it fits,
|
||||
+ otherwise it will find the largest length that can be accommodated."""
|
||||
spec_target_max_model_len: int | None = None
|
||||
"""Specify the maximum length for spec decoding draft models."""
|
||||
quantization: QuantizationMethods | str | None = None
|
||||
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
|
||||
if encoder_config and "max_seq_length" in encoder_config:
|
||||
derived_max_model_len = encoder_config["max_seq_length"]
|
||||
|
||||
- # If the user didn't specify `max_model_len`, then use that derived from
|
||||
- # the model config as a default value.
|
||||
- if max_model_len is None:
|
||||
+ # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
|
||||
+ # then use that derived from the model config as a default value.
|
||||
+ # When -1 is specified, the engine will later auto-fit to available memory.
|
||||
+ if max_model_len is None or max_model_len == -1:
|
||||
# For LongRoPE, default to original_max_position_embeddings to avoid
|
||||
# performance degradation for shorter sequences
|
||||
if rope_parameters is not None and any(
|
||||
|
||||
Reference in New Issue
Block a user