Make max_model_len configurable (#972)

2023-09-12 16:29:19 -07:00
parent d6545ad22e
commit 0bb1e885a0
2 changed files with 22 additions and 1 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,6 +18,7 @@ class EngineArgs:
    load_format: str = 'auto'
    dtype: str = 'auto'
    seed: int = 0
+    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
@@ -89,6 +90,11 @@ class EngineArgs:
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
+        parser.add_argument('--max-model-len',
+                            type=int,
+                            default=None,
+                            help='model context length. If unspecified, '
+                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
@@ -153,7 +159,7 @@ class EngineArgs:
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
-                                   self.dtype, self.seed)
+                                   self.dtype, self.seed, self.max_model_len)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space)