[Core] add an option to log every function call for debugging hang/crash in distributed inference (#4079)

Co-authored-by: Simon Mo <simon.mo@hey.com>
This commit is contained in:
youkaichao
2024-04-18 16:15:12 -07:00
committed by GitHub
parent 8f9c28fd40
commit 8a7a3e4436
7 changed files with 120 additions and 8 deletions

View File

@@ -1,12 +1,15 @@
import datetime
import importlib
import os
import tempfile
import threading
from abc import ABC, abstractmethod
from typing import Dict, List, Set, Tuple
from vllm.logger import init_logger
from vllm.logger import enable_trace_function_call, init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import update_environment_variables
from vllm.utils import get_vllm_instance_id, update_environment_variables
logger = init_logger(__name__)
@@ -115,9 +118,20 @@ class WorkerWrapperBase:
def init_worker(self, *args, **kwargs):
    """
    Actual initialization of the worker class, and set up
    function tracing if required.
    Arguments are passed to the worker class constructor.

    Function tracing is turned on when the environment variable
    ``VLLM_TRACE_FUNCTION`` holds a non-zero integer. An unset or empty
    variable leaves tracing off. When enabled, a log path of the form
    ``<tempdir>/vllm/<instance id>/<file name>`` is built, its directory
    is created if missing, and the path is handed to
    ``enable_trace_function_call``.

    Raises:
        ValueError: if ``VLLM_TRACE_FUNCTION`` is set to a non-empty
            value that ``int()`` cannot parse.
    """
    # `or "0"` also covers a variable that is set but empty; int("") would
    # otherwise raise and abort worker start-up.
    if int(os.getenv("VLLM_TRACE_FUNCTION") or "0"):
        tmp_dir = tempfile.gettempdir()
        # The pid, thread id and timestamp go into the file name; str() of
        # a datetime contains a space, which is swapped for "_".
        filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
                    f"_thread_{threading.get_ident()}_"
                    f"at_{datetime.datetime.now()}.log").replace(" ", "_")
        log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(),
                                filename)
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        enable_trace_function_call(log_path)

    # Import the worker module by name and instantiate the named class,
    # forwarding the caller's arguments to its constructor.
    mod = importlib.import_module(self.worker_module_name)
    worker_class = getattr(mod, self.worker_class_name)
    self.worker = worker_class(*args, **kwargs)