[V1][Core] Autotune encoder cache budget (#11895)

Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
Roger Wang
2025-01-15 11:29:00 -08:00
committed by GitHub
parent edce722eaa
commit 70755e819e
6 changed files with 166 additions and 49 deletions

View File

@@ -1387,13 +1387,15 @@ class SchedulerConfig:
is_multimodal_model: bool = False
# FIXME(woosuk & ywang96): Below are placeholder values. We need to
# calculate the actual values from the configurations.
# Multimodal encoder run compute budget, only used in V1
max_num_encoder_input_tokens = 16384
# NOTE: The following multimodal encoder budgets are initialized to
# max_num_batched_tokens and overridden if the maximum multimodal
# embedding size is larger.
# TODO (ywang96): Make these configurable.
# Multimodal encoder compute budget, only used in V1
max_num_encoder_input_tokens: int = field(default=None) # type: ignore
# Multimodal encoder cache size, only used in V1
encoder_cache_size = 16384
encoder_cache_size: int = field(default=None) # type: ignore
# Whether to perform preemption by swapping or
# recomputation. If not specified, we determine the mode as follows:
@@ -1467,6 +1469,9 @@ class SchedulerConfig:
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
)
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
self.encoder_cache_size = self.max_num_batched_tokens
if self.enable_chunked_prefill:
logger.info(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",