[V1][Core] Autotune encoder cache budget (#11895)
Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -1387,13 +1387,15 @@ class SchedulerConfig:
|
||||
|
||||
is_multimodal_model: bool = False
|
||||
|
||||
# FIXME(woosuk & ywang96): Below are placeholder values. We need to
|
||||
# calculate the actual values from the configurations.
|
||||
# Multimodal encoder run compute budget, only used in V1
|
||||
max_num_encoder_input_tokens = 16384
|
||||
# NOTE: The following multimodal encoder budgets will be initialized to
|
||||
# max_num_batched_tokens and overridden if the max multimodal embedding
|
||||
# size is larger.
|
||||
# TODO (ywang96): Make these configurable.
|
||||
# Multimodal encoder compute budget, only used in V1
|
||||
max_num_encoder_input_tokens: int = field(default=None) # type: ignore
|
||||
|
||||
# Multimodal encoder cache size, only used in V1
|
||||
encoder_cache_size = 16384
|
||||
encoder_cache_size: int = field(default=None) # type: ignore
|
||||
|
||||
# Whether to perform preemption by swapping or
|
||||
# recomputation. If not specified, we determine the mode as follows:
|
||||
@@ -1467,6 +1469,9 @@ class SchedulerConfig:
|
||||
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
)
|
||||
|
||||
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
|
||||
self.encoder_cache_size = self.max_num_batched_tokens
|
||||
|
||||
if self.enable_chunked_prefill:
|
||||
logger.info(
|
||||
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
|
||||
|
||||
Reference in New Issue
Block a user