[Frontend][V1] Online serving performance improvements (#12287)
1 changed file: vllm/envs.py (+11)
@@ -73,6 +73,7 @@ if TYPE_CHECKING:
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
     VLLM_SERVER_DEV_MODE: bool = False
+    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
 
 
 def get_default_cache_root():
@@ -474,6 +475,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # e.g. `/reset_prefix_cache`
     "VLLM_SERVER_DEV_MODE":
     lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. It is applicable when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
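For context, a minimal sketch of the chunking pattern this variable controls. This is not vLLM's actual AsyncLLM output handler; the helper names (`process_outputs`, `handle_all_outputs`) and the toy string payloads are illustrative assumptions, showing only the general technique of bounding how many requests one asyncio task processes.

```python
# Sketch only, not vLLM internals: per-token outputs for a batch of
# streaming requests are processed in chunks of up to CHUNK_SIZE
# requests per asyncio task, rather than one task per request.
import asyncio
import os
from typing import List

CHUNK_SIZE = int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128"))


async def process_outputs(outputs: List[str]) -> None:
    # Placeholder for per-request output work (detokenization,
    # pushing tokens onto each request's streaming queue, etc.).
    for _out in outputs:
        await asyncio.sleep(0)  # yield control back to the event loop


async def handle_all_outputs(outputs: List[str]) -> None:
    # One task per chunk: larger chunks mean fewer tasks (less
    # scheduling overhead, better TTFT/throughput) but longer runs
    # between event-loop yields, which raises the variance of
    # inter-message latencies for streaming clients.
    tasks = [
        asyncio.create_task(process_outputs(outputs[i:i + CHUNK_SIZE]))
        for i in range(0, len(outputs), CHUNK_SIZE)
    ]
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(handle_all_outputs([f"req-{i}" for i in range(1000)]))
```

With the default of 128, the 1000 toy requests above are split across 8 tasks; tuning the variable moves the trade-off between per-task scheduling overhead and responsiveness, as described in the comment block in the diff.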