[Bugfix] Fix scheduling when repeated images in one request (#23544)
Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
This commit is contained in:
@@ -99,8 +99,9 @@ class EncoderCacheManager:
|
||||
self.cached[mm_hash].add(request.request_id)
|
||||
return True
|
||||
|
||||
def try_allocate(self, request: Request, input_id: int,
|
||||
encoder_budget: int) -> bool:
|
||||
def can_allocate(self, request: Request, input_id: int,
|
||||
encoder_compute_budget: int,
|
||||
num_tokens_to_schedule: int) -> bool:
|
||||
"""Check if there's sufficient cache space for a multimodal input.
|
||||
If there is, return True and update EncoderCacheManager state.
|
||||
|
||||
@@ -116,6 +117,10 @@ class EncoderCacheManager:
|
||||
Args:
|
||||
request: The request containing the multimodal input.
|
||||
input_id: Index of the multimodal input within the request.
|
||||
encoder_compute_budget: Number of encoder tokens allowed to be
|
||||
computed when this method is invoked.
|
||||
num_tokens_to_schedule: Number of tokens already scheduled to be
|
||||
allocated with cache space when this method is invoked.
|
||||
|
||||
Returns:
|
||||
True if there's enough capacity to hold the encoder output for this
|
||||
@@ -128,13 +133,13 @@ class EncoderCacheManager:
|
||||
num_tokens = request.get_num_encoder_tokens(input_id)
|
||||
|
||||
# Not enough compute budget
|
||||
if num_tokens > encoder_budget:
|
||||
if num_tokens > encoder_compute_budget:
|
||||
return False
|
||||
|
||||
num_tokens += num_tokens_to_schedule
|
||||
|
||||
# Enough free slots
|
||||
if num_tokens <= self.num_free_slots:
|
||||
self.num_free_slots -= num_tokens
|
||||
self.num_freeable_slots -= num_tokens
|
||||
return True
|
||||
|
||||
# Not enough reclaimable slots
|
||||
@@ -149,8 +154,6 @@ class EncoderCacheManager:
|
||||
del self.cached[mm_hash]
|
||||
self.freed.append(mm_hash)
|
||||
self.num_free_slots += num_free_token
|
||||
self.num_free_slots -= num_tokens
|
||||
self.num_freeable_slots -= num_tokens
|
||||
return True
|
||||
|
||||
def allocate(self, request: Request, input_id: int) -> None:
|
||||
@@ -161,19 +164,24 @@ class EncoderCacheManager:
|
||||
the model runner; this method updates the manager's bookkeeping.
|
||||
|
||||
Note:
|
||||
This method assumes try_allocate() returned True for the same input.
|
||||
This method assumes can_allocate() returned True for the same input.
|
||||
"""
|
||||
# Encoder cache space budget should be already updated for the
|
||||
# multimodal input and non-negative after try_allocate() is called.
|
||||
assert self.num_free_slots >= 0
|
||||
assert self.num_freeable_slots >= 0
|
||||
|
||||
mm_hash = request.mm_hashes[input_id]
|
||||
request_id = request.request_id
|
||||
if mm_hash not in self.cached:
|
||||
self.cached[mm_hash] = set()
|
||||
|
||||
num_encoder_tokens = request.get_num_encoder_tokens(input_id)
|
||||
|
||||
# NOTE: Encoder cache should always have enough space for encoder inputs
|
||||
# that are scheduled since eviction takes place at can_allocate().
|
||||
assert self.num_free_slots >= num_encoder_tokens
|
||||
assert self.num_freeable_slots >= num_encoder_tokens
|
||||
|
||||
self.cached[mm_hash].add(request_id)
|
||||
self.num_free_slots -= num_encoder_tokens
|
||||
self.num_freeable_slots -= num_encoder_tokens
|
||||
|
||||
def get_cached_input_ids(self, request: Request) -> set[int]:
|
||||
"""Get all cached multimodal input IDs for a request.
|
||||
|
||||
Reference in New Issue
Block a user