[Frontend][V1] Online serving performance improvements (#12287)

Author: Nick Hill
Date: 2025-01-22 14:22:12 -08:00
Committed by: GitHub
Parent: 7206ce4ce1
Commit: aea94362c9
7 changed files with 100 additions and 44 deletions

vllm/envs.py

@@ -73,6 +73,7 @@ if TYPE_CHECKING:
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
     VLLM_SERVER_DEV_MODE: bool = False
+    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 
 def get_default_cache_root():
@@ -474,6 +475,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # e.g. `/reset_prefix_cache`
     "VLLM_SERVER_DEV_MODE":
     lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. It is applicable when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
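
For context, a minimal sketch of the chunking pattern this variable controls, under stated assumptions: per-request output handling is fanned out across asyncio tasks in chunks of at most VLLM_V1_OUTPUT_PROC_CHUNK_SIZE requests. Only the environment variable name and its default of 128 come from the diff; the helper names (process_outputs, _process_chunk) and the request_output.queue / new_tokens attributes are hypothetical stand-ins, not vLLM's actual internals.

import asyncio
import os

# Default mirrors the diff above.
OUTPUT_PROC_CHUNK_SIZE = int(
    os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128"))


async def _process_chunk(chunk):
    # Hypothetical per-chunk worker: push each request's newly
    # generated tokens onto that request's output queue.
    for request_output in chunk:
        await request_output.queue.put(request_output.new_tokens)


async def process_outputs(outputs):
    # One asyncio task per chunk. Larger chunks mean fewer tasks and
    # less scheduling overhead (better TTFT and throughput), but each
    # task holds the event loop longer between yields, raising the
    # variance of inter-message latencies for concurrent streams.
    tasks = [
        asyncio.create_task(
            _process_chunk(outputs[i:i + OUTPUT_PROC_CHUNK_SIZE]))
        for i in range(0, len(outputs), OUTPUT_PROC_CHUNK_SIZE)
    ]
    await asyncio.gather(*tasks)

To experiment with the trade-off described in the comment, the variable can be set in the server's environment before launch, e.g. VLLM_V1_OUTPUT_PROC_CHUNK_SIZE=64 to favor lower inter-message latency variance under heavy streaming concurrency.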