[V1][Core] Autotune encoder cache budget (#11895)

Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-15 11:29:00 -08:00
parent edce722eaa
commit 70755e819e
6 changed files with 166 additions and 49 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1387,13 +1387,15 @@ class SchedulerConfig:

    is_multimodal_model: bool = False

-    # FIXME(woosuk & ywang96): Below are placeholder values. We need to
-    # calculate the actual values from the configurations.
-    # Multimodal encoder run compute budget, only used in V1
-    max_num_encoder_input_tokens = 16384
+    # NOTE: The multimodal encoder budgets below are initialized to
+    # max_num_batched_tokens and are overridden if the maximum multimodal
+    # embedding size is larger.
+    # TODO (ywang96): Make these configurable.
+    # Multimodal encoder compute budget, only used in V1
+    max_num_encoder_input_tokens: int = field(default=None)  # type: ignore

    # Multimodal encoder cache size, only used in V1
-    encoder_cache_size = 16384
+    encoder_cache_size: int = field(default=None)  # type: ignore

    # Whether to perform preemption by swapping or
    # recomputation. If not specified, we determine the mode as follows:
@@ -1467,6 +1469,9 @@ class SchedulerConfig:
                    _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                )

+        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
+        self.encoder_cache_size = self.max_num_batched_tokens
+
        if self.enable_chunked_prefill:
            logger.info(
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",