[Refactor] Simplify code for MM budget (#23310)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-21 16:00:16 +08:00
committed by GitHub
parent 2e2000f352
commit 0c6e40bbaa
4 changed files with 59 additions and 70 deletions

View File

@@ -341,10 +341,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     self.model_config,
     self.scheduler_config,
     self.mm_registry,
-    max_model_len=self.max_model_len,
-    max_num_reqs=self.max_num_reqs,
-) if self.supports_mm_inputs \
-    else None)
+) if self.supports_mm_inputs else None)
self.reorder_batch_threshold: Optional[int] = None
@@ -669,7 +666,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 mm_budget = self.mm_budget
 assert mm_budget is not None
-dummy_modality, _ = mm_budget.get_modality_with_max_tokens()
+dummy_modality = mm_budget.get_modality_with_max_tokens()
 return self._get_mm_dummy_batch(dummy_modality, num_seqs)
@@ -2595,14 +2592,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 # NOTE: Currently model is profiled with a single non-text
 # modality with the max possible input tokens even when
 # it supports multiple.
-(
-    dummy_modality,
-    max_tokens,
-) = mm_budget.get_modality_with_max_tokens()
-(
-    max_mm_items_per_prompt,
-    max_mm_items_per_batch,
-) = mm_budget.get_max_items(dummy_modality, max_tokens)
+dummy_modality = mm_budget.get_modality_with_max_tokens()
+max_mm_items_per_batch = mm_budget \
+    .max_items_per_batch_by_modality[dummy_modality]
logger.info(
"Encoder cache will be initialized with a budget of "