fix RAM OOM when loading large models in tensor parallel mode. (#1395)

Co-authored-by: ran_lin <rlin@thoughtworks.com>
boydfd authored 2023-11-21 11:02:42 +08:00; committed by GitHub
parent 819b18e7ba
commit 4bb6b67188
4 changed files with 52 additions and 7 deletions

vllm/engine/arg_utils.py

@@ -22,6 +22,7 @@ class EngineArgs:
     worker_use_ray: bool = False
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
+    max_parallel_loading_workers: Optional[int] = None
     block_size: int = 16
     swap_space: int = 4  # GiB
     gpu_memory_utilization: float = 0.90
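
The new Optional[int] field needs a matching parameter on ParallelConfig, which is changed in one of the other files of this commit (not shown in this view). A hedged sketch of what that counterpart plausibly looks like, inferred from the call site in the last hunk below; anything beyond storing the value is guesswork:

```python
from typing import Optional

class ParallelConfig:
    """Sketch of the assumed counterpart change in vllm/config.py."""

    def __init__(self,
                 pipeline_parallel_size: int,
                 tensor_parallel_size: int,
                 worker_use_ray: bool,
                 max_parallel_loading_workers: Optional[int] = None) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray
        # None keeps the old behavior: no cap on concurrent weight loading.
        self.max_parallel_loading_workers = max_parallel_loading_workers
```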
@@ -128,6 +129,12 @@ class EngineArgs:
                             type=int,
                             default=EngineArgs.tensor_parallel_size,
                             help='number of tensor parallel replicas')
+        parser.add_argument(
+            '--max-parallel-loading-workers',
+            type=int,
+            help='load the model sequentially in multiple batches, '
+            'to avoid RAM OOM when using tensor parallelism '
+            'with large models')
         # KV cache arguments
         parser.add_argument('--block-size',
                             type=int,
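
For reference, a hypothetical invocation of the new option; the model name is illustrative, and only tensor_parallel_size and max_parallel_loading_workers come from this diff. The CLI equivalent would be `--tensor-parallel-size 4 --max-parallel-loading-workers 2`.

```python
from vllm.engine.arg_utils import EngineArgs

# Shard the model across 4 GPUs, but let at most 2 workers stage
# weights in host RAM at any one time.
args = EngineArgs(model="meta-llama/Llama-2-70b-hf",
                  tensor_parallel_size=4,
                  max_parallel_loading_workers=2)
```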
@@ -195,7 +202,8 @@
             getattr(model_config.hf_config, 'sliding_window', None))
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                          self.tensor_parallel_size,
-                                         self.worker_use_ray)
+                                         self.worker_use_ray,
+                                         self.max_parallel_loading_workers)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
                                            model_config.max_model_len,
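
The hunk above only threads the setting through to ParallelConfig; the actual serialization of loading happens in the engine's worker-dispatch code, in one of the other changed files. A minimal, self-contained sketch of the batching idea, with made-up names (load_in_batches, the worker lambdas), not the real vLLM implementation:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Optional

def load_in_batches(load_fns: List[Callable[[], None]],
                    max_parallel: Optional[int]) -> None:
    """Run per-worker load functions in sequential batches.

    With max_parallel=None every worker loads at once (the old
    behavior); otherwise at most max_parallel weight shards are
    staged in host RAM at any moment.
    """
    if not load_fns:
        return
    batch_size = max_parallel if max_parallel is not None else len(load_fns)
    for start in range(0, len(load_fns), batch_size):
        batch = load_fns[start:start + batch_size]
        # Workers within a batch load concurrently; batches run
        # strictly one after another, capping peak RAM use.
        with ThreadPoolExecutor(max_workers=len(batch)) as pool:
            futures = [pool.submit(fn) for fn in batch]
            for future in futures:
                future.result()  # propagate any loading error

# Example: 4 tensor-parallel workers, only 2 loading at a time.
if __name__ == "__main__":
    workers = [lambda i=i: print(f"worker {i} loading weights")
               for i in range(4)]
    load_in_batches(workers, max_parallel=2)
```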