Allow users to specify kv cache memory size (#21489)

Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Boyuan Feng
2025-09-11 06:41:07 -07:00
committed by GitHub
parent fd1ce98cdd
commit 94e6b2d55f
10 changed files with 236 additions and 47 deletions

View File

@@ -227,8 +227,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
elif contains_type(type_hints, int):
kwargs[name]["type"] = int
# Special case: large integer options are parsed with human_readable_int,
# and its docstring is appended to their help text
if name in {"max_model_len", "max_num_batched_tokens"}:
human_readable_ints = {
"max_model_len",
"max_num_batched_tokens",
"kv_cache_memory_bytes",
}
if name in human_readable_ints:
kwargs[name]["type"] = human_readable_int
kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
elif contains_type(type_hints, float):
kwargs[name]["type"] = float
elif (contains_type(type_hints, dict)
@@ -335,6 +341,7 @@ class EngineArgs:
swap_space: float = CacheConfig.swap_space
cpu_offload_gb: float = CacheConfig.cpu_offload_gb
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
max_num_batched_tokens: Optional[
int] = SchedulerConfig.max_num_batched_tokens
max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
@@ -734,6 +741,8 @@ class EngineArgs:
cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
cache_group.add_argument("--gpu-memory-utilization",
**cache_kwargs["gpu_memory_utilization"])
cache_group.add_argument("--kv-cache-memory-bytes",
**cache_kwargs["kv_cache_memory_bytes"])
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument("--kv-cache-dtype",
**cache_kwargs["cache_dtype"])
@@ -1174,6 +1183,7 @@ class EngineArgs:
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=self.kv_cache_dtype,
is_attention_free=model_config.is_attention_free,