[Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
||||
def __init__(self, cache_size: int):
|
||||
self.cache_size = cache_size
|
||||
self.num_free_slots = cache_size
|
||||
self.freed: list[str] = []
|
||||
self.allocated: list[str] = []
|
||||
self.to_free: list[str] = []
|
||||
|
||||
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
|
||||
return False
|
||||
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
||||
self.num_free_slots -= num_encoder_embeds
|
||||
|
||||
mm_hash = request.mm_features[input_id].identifier
|
||||
self.freed.append(mm_hash)
|
||||
self.allocated.append(mm_hash)
|
||||
|
||||
def free(self, request: Request) -> None:
|
||||
for input_id in range(len(request.mm_features)):
|
||||
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
||||
return set(range(len(request.mm_features)))
|
||||
|
||||
def get_freed_mm_hashes(self) -> list[str]:
|
||||
freed = self.freed
|
||||
self.freed = []
|
||||
return freed
|
||||
# As encoder cache is not used for enc-dec models, we can free the entries here
|
||||
# The actual free happens in the runner, *before* the model is executed.
|
||||
# Therefore, `freeable` acts as a buffer to free the entries only after the
|
||||
# model is executed, mimicking the state transition of `EncoderCacheManager`.
|
||||
to_free = self.to_free
|
||||
self.to_free = self.allocated
|
||||
self.allocated = []
|
||||
return to_free
|
||||
|
||||
def free_encoder_input(self, request: Request, input_id: int) -> None:
|
||||
num_encoder_embeds = request.get_num_encoder_embeds(input_id)
|
||||
|
||||
Reference in New Issue
Block a user