[Core] Remove busy loop from idle buffer readers (#28053)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -179,7 +179,6 @@ if TYPE_CHECKING:
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
|
||||
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
|
||||
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
|
||||
VLLM_SLEEP_WHEN_IDLE: bool = False
|
||||
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
|
||||
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
|
||||
VLLM_KV_CACHE_LAYOUT: Literal["NHD", "HND"] | None = None
|
||||
@@ -1338,9 +1337,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
|
||||
os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
|
||||
),
|
||||
# Reduce CPU usage when vLLM is idle. Enabling this will incur a small
|
||||
# latency penalty when a request eventually comes.
|
||||
"VLLM_SLEEP_WHEN_IDLE": lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
|
||||
# Control the max chunk bytes (in MB) for the rpc message queue.
|
||||
# Objects larger than this threshold will be broadcast to worker
|
||||
# processes via zmq.
|
||||
@@ -1751,7 +1747,6 @@ def compile_factors() -> dict[str, object]:
|
||||
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE",
|
||||
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS",
|
||||
"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH",
|
||||
"VLLM_SLEEP_WHEN_IDLE",
|
||||
"VLLM_IMAGE_FETCH_TIMEOUT",
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT",
|
||||
"VLLM_AUDIO_FETCH_TIMEOUT",
|
||||
|
||||
Reference in New Issue
Block a user