Add --max-model-len auto to auto-fit context to available memory (#29431)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-12-24 00:37:14 -05:00
committed by GitHub
parent d7e05ac743
commit 8ee90c83f8
7 changed files with 313 additions and 31 deletions

View File

@@ -387,6 +387,19 @@ class Worker(WorkerBase):
def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
return self.model_runner.get_kv_cache_spec()
def update_max_model_len(self, max_model_len: int) -> None:
"""Update max_model_len after auto-fit to GPU memory.
This is called when max_model_len=-1 is used and the engine
automatically determines the maximum context length that fits
in GPU memory. Workers need to update their cached max_model_len
to match the engine's decision.
"""
self.model_config.max_model_len = max_model_len
if self.model_runner is not None:
self.model_runner.max_model_len = max_model_len
logger.debug("Updated max_model_len to %d", max_model_len)
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""