diff --git a/docs/usage/security.md b/docs/usage/security.md index 0a54221ec..bb920ff43 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -178,7 +178,7 @@ These endpoints are **only available when the environment variable `VLLM_SERVER_ - `/is_sleeping` - Check if engine is sleeping - `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous) -**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` are set):** +**Profiler endpoints (only when profiling is enabled via `--profiler-config`):** These endpoints are only available when profiling is enabled and should only be used for local development: @@ -207,7 +207,7 @@ An attacker who can reach the vLLM HTTP server can: - Cache manipulation that can disrupt service - Detailed server configuration disclosure -Similarly, never enable profiler endpoints (`VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production. +Similarly, never enable profiler endpoints in production. **Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies. diff --git a/vllm/envs.py b/vllm/envs.py index f9aaa4f38..caddf0b76 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -855,53 +855,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv( "VLLM_LORA_RESOLVER_HF_REPO_LIST", None ), - # Enables torch CUDA profiling if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"), - # Enables torch profiler if set. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"), - # Enable torch profiler to record shapes if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES") - ), - # Enable torch profiler to profile memory if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY") - ), - # Enable torch profiler to profile stack if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_WITH_STACK": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_WITH_STACK") - ), - # Enable torch profiler to profile flops if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS") - ), - # Disable torch profiling of the AsyncLLMEngine process if set to 1. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM") - ), - # Delay number of iterations before starting profiling when using - # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. - # Deprecated, see profiler_config. - "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")), - # Maximum number of iterations to profile when using the torch/torch CUDA profiler. - # If set to 0, will not limit the number of iterations. - "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"), - # Control whether torch profiler gzip-compresses profiling files. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"), - # Control whether torch profiler dumps the self_cuda_time_total table. - # Set to 0 to disable dumping the table. - # Deprecated, see profiler_config. - "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: ( - os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") - ), # If set, vLLM will use Triton implementations of AWQ. "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), # If set, allow loading or unloading lora adapters in runtime,