[Bugfix] Fix scheduling when repeated images in one request (#23544)

Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
2025-08-26 02:46:28 -07:00
parent 9b5f64238f
commit b5d34af328
3 changed files with 96 additions and 39 deletions
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -99,8 +99,9 @@ class EncoderCacheManager:
        self.cached[mm_hash].add(request.request_id)
        return True

-    def try_allocate(self, request: Request, input_id: int,
-                     encoder_budget: int) -> bool:
+    def can_allocate(self, request: Request, input_id: int,
+                     encoder_compute_budget: int,
+                     num_tokens_to_schedule: int) -> bool:
        """Check if there's sufficient cache space for a multimodal input. 
        If there is, return True and update EncoderCacheManager state.

@@ -116,6 +117,10 @@ class EncoderCacheManager:
        Args:
            request: The request containing the multimodal input.
            input_id: Index of the multimodal input within the request.
+            encoder_compute_budget: Number of encoder tokens allowed to be 
+                computed when this method is invoked.
+            num_tokens_to_schedule: Number of tokens already scheduled to be 
+                allocated with cache space when this method is invoked.

        Returns:
            True if there's enough capacity to hold the encoder output for this
@@ -128,13 +133,13 @@ class EncoderCacheManager:
        num_tokens = request.get_num_encoder_tokens(input_id)

        # Not enough compute budget
-        if num_tokens > encoder_budget:
+        if num_tokens > encoder_compute_budget:
            return False

+        num_tokens += num_tokens_to_schedule
+
        # Enough free slots
        if num_tokens <= self.num_free_slots:
-            self.num_free_slots -= num_tokens
-            self.num_freeable_slots -= num_tokens
            return True

        # Not enough reclaimable slots
@@ -149,8 +154,6 @@ class EncoderCacheManager:
            del self.cached[mm_hash]
            self.freed.append(mm_hash)
            self.num_free_slots += num_free_token
-        self.num_free_slots -= num_tokens
-        self.num_freeable_slots -= num_tokens
        return True

    def allocate(self, request: Request, input_id: int) -> None:
@@ -161,19 +164,24 @@ class EncoderCacheManager:
        the model runner; this method updates the manager's bookkeeping.

        Note:
-            This method assumes try_allocate() returned True for the same input.
+            This method assumes can_allocate() returned True for the same input.
        """
-        # Encoder cache space budget should be already updated for the
-        # multimodal input and non-negative after try_allocate() is called.
-        assert self.num_free_slots >= 0
-        assert self.num_freeable_slots >= 0

        mm_hash = request.mm_hashes[input_id]
        request_id = request.request_id
        if mm_hash not in self.cached:
            self.cached[mm_hash] = set()

+        num_encoder_tokens = request.get_num_encoder_tokens(input_id)
+
+        # NOTE: Encoder cache should always have enough space for encoder inputs
+        # that are scheduled since eviction takes place at can_allocate().
+        assert self.num_free_slots >= num_encoder_tokens
+        assert self.num_freeable_slots >= num_encoder_tokens
+
        self.cached[mm_hash].add(request_id)
+        self.num_free_slots -= num_encoder_tokens
+        self.num_freeable_slots -= num_encoder_tokens

    def get_cached_input_ids(self, request: Request) -> set[int]:
        """Get all cached multimodal input IDs for a request.