Implement custom all reduce kernels (#2192)
This commit is contained in:
@@ -35,6 +35,7 @@ class EngineArgs:
     quantization: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: int = 8192
+    disable_custom_all_reduce: bool = False
     enable_lora: bool = False
     max_loras: int = 1
     max_lora_rank: int = 16
@@ -208,6 +209,10 @@ class EngineArgs:
                             help='maximum context length covered by CUDA '
                             'graphs. When a sequence has context length '
                             'larger than this, we fall back to eager mode.')
+        parser.add_argument('--disable-custom-all-reduce',
+                            action='store_true',
+                            default=EngineArgs.disable_custom_all_reduce,
+                            help='See ParallelConfig')
         # LoRA related configs
         parser.add_argument('--enable-lora',
                             action='store_true',
@@ -269,7 +274,8 @@ class EngineArgs:
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                          self.tensor_parallel_size,
                                          self.worker_use_ray,
-                                         self.max_parallel_loading_workers)
+                                         self.max_parallel_loading_workers,
+                                         self.disable_custom_all_reduce)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
                                            model_config.max_model_len,
Reference in New Issue
Block a user