[Frontend][V1] Online serving performance improvements (#12287)

Author: Nick Hill
Date: 2025-01-22 14:22:12 -08:00
Committed by: GitHub
Parent: 7206ce4ce1
Commit: aea94362c9
7 changed files with 100 additions and 44 deletions

vllm/envs.py

@@ -73,6 +73,7 @@ if TYPE_CHECKING:
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
     VLLM_SERVER_DEV_MODE: bool = False
+    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 
 def get_default_cache_root():
@@ -474,6 +475,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # e.g. `/reset_prefix_cache`
     "VLLM_SERVER_DEV_MODE":
     lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. It is applicable when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
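
For context, a minimal sketch of the chunking pattern this variable controls, under stated assumptions: per-request output handling is fanned out across asyncio tasks in chunks of at most VLLM_V1_OUTPUT_PROC_CHUNK_SIZE requests. Only the environment variable name and its default of 128 come from the diff; the helper names (process_outputs, _process_chunk) and the request_output.queue / new_tokens attributes are hypothetical stand-ins, not vLLM's actual internals.

import asyncio
import os

# Default mirrors the diff above.
OUTPUT_PROC_CHUNK_SIZE = int(
    os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128"))


async def _process_chunk(chunk):
    # Hypothetical per-chunk worker: push each request's newly
    # generated tokens onto that request's output queue.
    for request_output in chunk:
        await request_output.queue.put(request_output.new_tokens)


async def process_outputs(outputs):
    # One asyncio task per chunk. Larger chunks mean fewer tasks and
    # less scheduling overhead (better TTFT and throughput), but each
    # task holds the event loop longer between yields, raising the
    # variance of inter-message latencies for concurrent streams.
    tasks = [
        asyncio.create_task(
            _process_chunk(outputs[i:i + OUTPUT_PROC_CHUNK_SIZE]))
        for i in range(0, len(outputs), OUTPUT_PROC_CHUNK_SIZE)
    ]
    await asyncio.gather(*tasks)

To experiment with the trade-off described in the comment, the variable can be set in the server's environment before launch, e.g. VLLM_V1_OUTPUT_PROC_CHUNK_SIZE=64 to favor lower inter-message latency variance under heavy streaming concurrency.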