[Core] add and implement VLLM_LOGITS_PROCESSOR_THREADS (#12368)

Signed-off-by: Aviv Keshet <akeshet@scaledcognition.com>
2025-02-04 18:46:26 -08:00
parent 75e94309e8
commit b3a0d01e45
2 changed files with 44 additions and 11 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
    VLLM_LOGGING_LEVEL: str = "INFO"
    VLLM_LOGGING_PREFIX: str = ""
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
@@ -282,6 +283,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_LOGGING_PREFIX":
    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),

+    # if set, vllm will call logits processors in a thread pool with this many
+    # threads. This is useful when using custom logits processors that either
+    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
+    # while not holding the python GIL, or both.
+    "VLLM_LOGITS_PROCESSOR_THREADS":
+    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
+    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
+
    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging