[Core][Multimodal] Track encode cache entries by mm_hash and enable embedding sharing between requests (#22711)
Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Mapping
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
@@ -31,34 +33,52 @@ class EncoderCacheManager:
|
||||
within requests, allowing for fine-grained memory management and enabling
|
||||
chunked processing of multimodal inputs.
|
||||
|
||||
Note that no caching is shared between requests at this time. If the same
|
||||
input is used across multiple requests, it will be reprocessed for each
|
||||
request.
|
||||
The cache shares embeddings of the same multimodal data
|
||||
item (identified by their hash value) between different requests,
|
||||
and eviction takes place at allocation time when there's no free
|
||||
space for new embeddings.
|
||||
The oldest cached embeddings not referenced by any request are evicted first.
|
||||
|
||||
Args:
|
||||
cache_size: Limit the size of the cache, measured by the number of
|
||||
tokens from the input sequence.
|
||||
|
||||
Attributes:
|
||||
cache_size: Total cache capacity in encoder tokens
|
||||
num_free_slots: Current available cache capacity in encoder tokens
|
||||
cached: Mapping from request_id to set of cached input_ids for that
|
||||
request
|
||||
freed: List of (request_id, input_id) pairs that were recently freed.
|
||||
This is cleared after every call to get_freed_ids().
|
||||
cache_size: Total cache capacity in encoder tokens.
|
||||
num_free_slots: Current available cache capacity in encoder tokens.
|
||||
num_freeable_slots: Capacity that can be immediately reclaimed by
|
||||
evicting entries with zero references (in encoder tokens).
|
||||
cached: Mapping from mm_hash to a set of request IDs that currently
|
||||
reference the cached entry. If the set is empty, the entry exists
|
||||
but is not referenced by any request and is eligible for
|
||||
reclamation.
|
||||
freeable: Ordered mapping (mm_hash -> num_tokens) representing entries
|
||||
that no currently running request needs and that can be freed to
|
||||
make space when needed.
|
||||
freed: List of mm_hash strings that were actually evicted since the
|
||||
last call to get_freed_mm_hashes(). This list is cleared on return.
|
||||
"""
|
||||
|
||||
def __init__(self, cache_size: int):
|
||||
self.cache_size = cache_size
|
||||
self.num_free_slots = cache_size
|
||||
# req_id -> cached input ids
|
||||
self.cached: dict[str, set[int]] = {}
|
||||
# list of [req_id, input_id]
|
||||
self.freed: list[tuple[str, int]] = []
|
||||
self.num_freeable_slots = cache_size
|
||||
|
||||
def has_cache(self, request: Request, input_id: int) -> bool:
|
||||
# mm_hash of mm_data => ids of requests that reference the mm_data
|
||||
self.cached: dict[str, set[str]] = {}
|
||||
|
||||
# mm_hash of mm_data => num_encoder_tokens of the mm_data
|
||||
self.freeable: OrderedDict[str, int] = OrderedDict()
|
||||
self.freed: list[str] = []
|
||||
|
||||
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
|
||||
"""Check if encoder output for a specific multimodal input is cached.
|
||||
|
||||
If the encoder output is cached, update `cached` to add the request id
|
||||
to the set of request ids that reference the cached encoder output.
|
||||
If the encoder output was previously not referenced by any request,
|
||||
update `freeable` and `num_freeable_slots` accordingly.
|
||||
|
||||
Args:
|
||||
request: The request containing the multimodal input
|
||||
input_id: Index of the multimodal input within the request
|
||||
@@ -66,103 +86,151 @@ class EncoderCacheManager:
|
||||
Returns:
|
||||
True if the encoder output for this input is already cached
|
||||
"""
|
||||
req_id = request.request_id
|
||||
return req_id in self.cached and input_id in self.cached[req_id]
|
||||
mm_hash = request.mm_hashes[input_id]
|
||||
# Not cached at all
|
||||
if mm_hash not in self.cached:
|
||||
return False
|
||||
|
||||
def can_allocate(self, request: Request, input_id: int) -> bool:
|
||||
"""Check if there's sufficient cache space for a multimodal input.
|
||||
# Cached but currently not referenced by any request
|
||||
if not self.cached[mm_hash]:
|
||||
num_tokens = self.freeable.pop(mm_hash)
|
||||
self.num_freeable_slots -= num_tokens
|
||||
|
||||
self.cached[mm_hash].add(request.request_id)
|
||||
return True
|
||||
|
||||
def try_allocate(self, request: Request, input_id: int,
|
||||
encoder_budget: int) -> bool:
|
||||
"""Check if there's sufficient cache space for a multimodal input.
|
||||
If there is, return True and update EncoderCacheManager state.
|
||||
|
||||
If there is not enough free space in `num_free_slots` but there is
|
||||
enough reclaimable space in `num_freeable_slots`, entries will be
|
||||
evicted from `freeable` (their mm_hash appended to `freed`) until
|
||||
enough space is available, and then this method returns True.
|
||||
Older entries are evicted first.
|
||||
|
||||
Returns False if the number of tokens exceeds `encoder_budget` or exceeds
|
||||
the free and reclaimable capacities combined.
|
||||
|
||||
Args:
|
||||
request: The request containing the multimodal input
|
||||
input_id: Index of the multimodal input within the request
|
||||
request: The request containing the multimodal input.
|
||||
input_id: Index of the multimodal input within the request.
|
||||
|
||||
Returns:
|
||||
True if there's enough free cache space to store the encoder output
|
||||
for this multimodal input
|
||||
True if there's enough capacity to hold the encoder output for this
|
||||
input (possibly after reclaiming `freeable` entries); otherwise
|
||||
False.
|
||||
|
||||
Note: This method does not allocate physical memory for the encoder
|
||||
output; it only updates the state of EncoderCacheManager.
|
||||
"""
|
||||
num_tokens = request.get_num_encoder_tokens(input_id)
|
||||
return num_tokens <= self.num_free_slots
|
||||
|
||||
# Not enough compute budget
|
||||
if num_tokens > encoder_budget:
|
||||
return False
|
||||
|
||||
# Enough free slots
|
||||
if num_tokens <= self.num_free_slots:
|
||||
self.num_free_slots -= num_tokens
|
||||
self.num_freeable_slots -= num_tokens
|
||||
return True
|
||||
|
||||
# Not enough reclaimable slots
|
||||
if num_tokens > self.num_freeable_slots:
|
||||
return False
|
||||
|
||||
# Not enough free slots but enough reclaimable slots
|
||||
# NOTE: Eviction takes place here, but physical memory is not freed
|
||||
# until model runner is notified by the scheduler output.
|
||||
while num_tokens > self.num_free_slots:
|
||||
mm_hash, num_free_token = self.freeable.popitem(last=False)
|
||||
del self.cached[mm_hash]
|
||||
self.freed.append(mm_hash)
|
||||
self.num_free_slots += num_free_token
|
||||
self.num_free_slots -= num_tokens
|
||||
self.num_freeable_slots -= num_tokens
|
||||
return True
|
||||
|
||||
def allocate(self, request: Request, input_id: int) -> None:
|
||||
"""Allocate cache space for a multimodal input's encoder output.
|
||||
|
||||
This method reserves cache space for storing the encoder output of
|
||||
the specified multimodal input. The actual encoder output storage
|
||||
happens in the model runner, but this method ensures the cache
|
||||
manager tracks the allocation.
|
||||
|
||||
Args:
|
||||
request: The request containing the multimodal input
|
||||
input_id: Index of the multimodal input within the request
|
||||
This reserves cache space for storing the encoder output of the
|
||||
specified multimodal input. The actual encoder output storage happens in
|
||||
the model runner; this method updates the manager's bookkeeping.
|
||||
|
||||
Note:
|
||||
This method assumes can_allocate() returned True for the same
|
||||
request and input_id. It will reduce available cache space.
|
||||
This method assumes try_allocate() returned True for the same input.
|
||||
"""
|
||||
req_id = request.request_id
|
||||
if req_id not in self.cached:
|
||||
self.cached[req_id] = set()
|
||||
self.cached[req_id].add(input_id)
|
||||
self.num_free_slots -= request.get_num_encoder_tokens(input_id)
|
||||
# Encoder cache space budget should be already updated for the
|
||||
# multimodal input and non-negative after try_allocate() is called.
|
||||
assert self.num_free_slots >= 0
|
||||
assert self.num_freeable_slots >= 0
|
||||
|
||||
mm_hash = request.mm_hashes[input_id]
|
||||
request_id = request.request_id
|
||||
if mm_hash not in self.cached:
|
||||
self.cached[mm_hash] = set()
|
||||
|
||||
self.cached[mm_hash].add(request_id)
|
||||
|
||||
def get_cached_input_ids(self, request: Request) -> set[int]:
|
||||
"""Get all cached multimodal input IDs for a request.
|
||||
|
||||
Args:
|
||||
request: The request to query
|
||||
|
||||
Returns:
|
||||
Set of input_ids that have cached encoder outputs for this request.
|
||||
Returns empty set if no inputs are cached for this request.
|
||||
Returns the set of input IDs whose `mm_hash` exists in the cache map.
|
||||
This includes entries that are currently unreferenced (and thus present
|
||||
in `freeable`); for such entries, freeing for this request will be a
|
||||
no-op.
|
||||
"""
|
||||
return self.cached.get(request.request_id, set())
|
||||
return {
|
||||
input_id
|
||||
for input_id in range(len(request.mm_hashes))
|
||||
if request.mm_hashes[input_id] in self.cached
|
||||
}
|
||||
|
||||
def free_encoder_input(self, request: Request, input_id: int) -> None:
|
||||
"""Free cache space for a single multimodal input's encoder output.
|
||||
"""Free the request's reference to the encoder input (`mm_data`)
|
||||
|
||||
This method is called when:
|
||||
- The encoder output has been fully consumed by the decoder and is
|
||||
no longer needed (e.g., in vision-language models after image
|
||||
tokens are processed)
|
||||
- A request is being cancelled or aborted
|
||||
When the reference set for the corresponding `mm_hash` becomes empty,
|
||||
the entry is appended to `freeable` and `num_freeable_slots` is
|
||||
increased by the number of encoder tokens for that input.
|
||||
|
||||
Args:
|
||||
request: The request containing the multimodal input
|
||||
input_id: Index of the multimodal input to free from cache
|
||||
The entry is NOT physically freed until capacity is needed (e.g., by
|
||||
`try_allocate`).
|
||||
"""
|
||||
req_id = request.request_id
|
||||
if req_id not in self.cached:
|
||||
mm_hash = request.mm_hashes[input_id]
|
||||
# The mm_hash not in cache or the req_id set is empty
|
||||
if not self.cached.get(mm_hash, None):
|
||||
return
|
||||
|
||||
self.cached[req_id].discard(input_id)
|
||||
if len(self.cached[req_id]) == 0:
|
||||
del self.cached[req_id]
|
||||
self.num_free_slots += request.get_num_encoder_tokens(input_id)
|
||||
self.freed.append((req_id, input_id))
|
||||
self.cached[mm_hash].discard(req_id)
|
||||
if not self.cached[mm_hash]:
|
||||
num_tokens = request.get_num_encoder_tokens(input_id)
|
||||
self.freeable[mm_hash] = num_tokens
|
||||
self.num_freeable_slots += num_tokens
|
||||
|
||||
def free(self, request: Request) -> None:
|
||||
"""Free all cached encoder outputs for a request.
|
||||
"""Free all encoder input cache references held by *request*.
|
||||
|
||||
This method is typically called when a request is finished, cancelled,
|
||||
or aborted, and all its encoder outputs should be freed from cache.
|
||||
For each cached input ID, `free_encoder_input` is invoked.
|
||||
The data stays in memory until eviction is triggered by a future
|
||||
allocation attempt made by `try_allocate`.
|
||||
|
||||
Args:
|
||||
request: The request whose encoder outputs should be freed
|
||||
Typically called when a request is finished, cancelled, or aborted.
|
||||
"""
|
||||
input_ids = self.get_cached_input_ids(request).copy()
|
||||
for input_id in input_ids:
|
||||
self.free_encoder_input(request, input_id)
|
||||
|
||||
def get_freed_ids(self) -> list[tuple[str, int]]:
|
||||
def get_freed_mm_hashes(self) -> list[str]:
|
||||
"""Get and clear the list of recently freed encoder cache entries.
|
||||
|
||||
This method returns all encoder cache entries that were freed since
|
||||
the last call to this method. It's used by the scheduler to notify
|
||||
workers about which encoder outputs can be removed from their caches.
|
||||
|
||||
Returns:
|
||||
List of (request_id, input_id) tuples that were freed since the
|
||||
last call. The internal freed list is cleared after this call.
|
||||
List of mm_hash strings that were actually evicted since the last
|
||||
call to be used by the scheduler to notify workers about which
|
||||
encoder outputs can be removed from their caches. The internal
|
||||
list is cleared after this call.
|
||||
"""
|
||||
freed = self.freed
|
||||
self.freed = []
|
||||
@@ -177,16 +245,11 @@ def compute_encoder_budget(
|
||||
"""Compute the encoder cache budget based on the model and scheduler
|
||||
configurations.
|
||||
|
||||
Args:
|
||||
model_config: Model configuration.
|
||||
scheduler_config: Scheduler configuration.
|
||||
mm_registry: Provides information about the token cost.
|
||||
|
||||
Returns:
|
||||
- Compute budget for encoder execution, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Space budget for encoder cache size, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Compute budget for encoder execution, measured in number of tokens
|
||||
from the input sequence.
|
||||
- Space budget for encoder cache size, measured in number of tokens
|
||||
from the input sequence.
|
||||
"""
|
||||
if mm_registry.supports_multimodal_inputs(model_config):
|
||||
max_tokens_by_modality = mm_registry \
|
||||
@@ -231,10 +294,10 @@ def compute_mm_encoder_budget(
|
||||
non-text modality.
|
||||
|
||||
Returns:
|
||||
- Compute budget for encoder execution, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Space budget for encoder cache size, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Compute budget for encoder execution, measured in number of tokens
|
||||
from the input sequence.
|
||||
- Space budget for encoder cache size, measured in number of tokens
|
||||
from the input sequence.
|
||||
"""
|
||||
|
||||
if not max_tokens_by_modality:
|
||||
|
||||
Reference in New Issue
Block a user