Make it easy to profile workers with nsight (#3162)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-03-03 16:19:13 -08:00
parent 996d095c54
commit 17c3103c56
4 changed files with 34 additions and 2 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -46,6 +46,7 @@ class EngineArgs:
    lora_dtype = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
+    ray_workers_use_nsight: bool = False

    def __post_init__(self):
        if self.tokenizer is None:
@@ -168,6 +169,10 @@ class EngineArgs:
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models')
+        parser.add_argument(
+            '--ray-workers-use-nsight',
+            action='store_true',
+            help='If specified, use nsight to profile ray workers')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
@@ -305,7 +310,8 @@ class EngineArgs:
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers,
-                                         self.disable_custom_all_reduce)
+                                         self.disable_custom_all_reduce,
+                                         self.ray_workers_use_nsight)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,