diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index efb234a2d..c06b76be5 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -85,7 +85,6 @@ start_server() { # Each argument and its value are separate elements. local common_args_array=( "$MODEL" - "--disable-log-requests" "--port" "8004" "--host" "$HOSTNAME" "--gpu-memory-utilization" "$gpu_memory_utilization" diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index b0be1e3a6..fa3fa0513 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -7,7 +7,7 @@ First start serving your model ```bash export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ -vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests +vllm serve $MODEL_PATH --served-model-name Llama ``` The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface). diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 64b505a1d..c4d3c039a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -2187,14 +2187,10 @@ class AsyncEngineArgs(EngineArgs): "--enable-log-requests", action=argparse.BooleanOptionalAction, default=AsyncEngineArgs.enable_log_requests, - help="Enable logging requests.", - ) - parser.add_argument( - "--disable-log-requests", - action=argparse.BooleanOptionalAction, - default=not AsyncEngineArgs.enable_log_requests, - help="[DEPRECATED] Disable logging requests.", - deprecated=True, + help="Enable logging request information, dependent on log level:\n" + "- INFO: Request ID, parameters and LoRA request.\n" + "- DEBUG: Prompt inputs (e.g. text, token IDs).\n" + "You can set the minimum log level via `VLLM_LOGGING_LEVEL`.", ) current_platform.pre_register_and_update(parser) return parser diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index c9e809353..c2a77fbb4 100644 --- a/vllm/entrypoints/logger.py +++ 
b/vllm/entrypoints/logger.py @@ -18,6 +18,20 @@ class RequestLogger: def __init__(self, *, max_log_len: int | None) -> None: self.max_log_len = max_log_len + if not logger.isEnabledFor(logging.INFO): + logger.warning_once( + "`--enable-log-requests` is set but " + "the minimum log level is higher than INFO. " + "No request information will be logged." + ) + elif not logger.isEnabledFor(logging.DEBUG): + logger.info_once( + "`--enable-log-requests` is set but " + "the minimum log level is higher than DEBUG. " + "Only limited information will be logged to minimize overhead. " + "To view more details, set `VLLM_LOGGING_LEVEL=DEBUG`." + ) + def log_inputs( self, request_id: str, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index eac581e5d..5655491fd 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -143,7 +143,8 @@ class BaseFrontendArgs: templates and other tokenizer configuration.""" enable_log_outputs: bool = False """If set to True, log model outputs (generations). - Requires --enable-log-requests.""" + Requires `--enable-log-requests`. As with `--enable-log-requests`, + information is only logged at INFO level at maximum.""" enable_log_deltas: bool = True """If set to False, output deltas will not be logged. Relevant only if --enable-log-outputs is set.