[Core][MM] Optimize encoder cache manager by operating with embeddings only (#30475)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Sun Kim <sunytokki@gmail.com>
This commit is contained in:
Roger Wang
2025-12-16 14:18:17 -08:00
committed by GitHub
parent 9fec0e13d5
commit f5f51e5931
14 changed files with 306 additions and 130 deletions

View File

@@ -209,10 +209,10 @@ class Request:
# Report why this request finished by delegating to
# RequestStatus.get_finished_reason with the request's current status.
# Per the return annotation, the result may be None.
def get_finished_reason(self) -> FinishReason | None:
return RequestStatus.get_finished_reason(self.status)
# NOTE(review): this hunk appears to show the removed and added lines
# interleaved (the diff +/- markers and indentation were lost): the
# `*_tokens` lines look like the pre-change version and the `*_embeds`
# lines the post-change version — confirm against the original diff.
def get_num_encoder_tokens(self, input_id: int) -> int:
def get_num_encoder_embeds(self, input_id: int) -> int:
# Guard: input_id must be less than the number of entries in self.mm_features.
assert input_id < len(self.mm_features)
# Pre-change lines (presumably): return the `length` of the item's mm_position.
num_tokens = self.mm_features[input_id].mm_position.length
return num_tokens
# Post-change lines (presumably): `get_num_embeds` is read without call
# parentheses — presumably a property on mm_position; TODO confirm.
num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
return num_embeds
def record_event(
self,