[Refactor] Move profiling methods to MM budget (#33559)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -19,6 +19,7 @@ from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.multimodal.budget import MultiModalBudget
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFeatureSpec,
|
||||
@@ -34,7 +35,6 @@ from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.metrics.stats import MultiModalCacheStats
|
||||
from vllm.v1.structured_output.backend_guidance import (
|
||||
@@ -59,32 +59,30 @@ class InputProcessor:
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
) -> None:
|
||||
self.vllm_config = vllm_config
|
||||
self.model_config = vllm_config.model_config
|
||||
self.model_config = model_config = vllm_config.model_config
|
||||
self.cache_config = vllm_config.cache_config
|
||||
self.lora_config = vllm_config.lora_config
|
||||
self.scheduler_config = vllm_config.scheduler_config
|
||||
self.structured_outputs_config = vllm_config.structured_outputs_config
|
||||
self.observability_config = vllm_config.observability_config
|
||||
|
||||
self.generation_config_fields = self.model_config.try_get_generation_config()
|
||||
self.generation_config_fields = model_config.try_get_generation_config()
|
||||
|
||||
self.mm_registry = mm_registry
|
||||
self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config)
|
||||
self.mm_encoder_cache_size = None
|
||||
if (
|
||||
self.mm_registry.supports_multimodal_inputs(self.model_config)
|
||||
and not self.model_config.skip_tokenizer_init
|
||||
):
|
||||
with set_default_torch_num_threads():
|
||||
max_tokens_by_modality = (
|
||||
mm_registry.get_max_tokens_per_item_by_modality(self.model_config)
|
||||
)
|
||||
|
||||
_, self.mm_encoder_cache_size = compute_mm_encoder_budget(
|
||||
self.vllm_config.scheduler_config, max_tokens_by_modality
|
||||
)
|
||||
self.mm_encoder_cache_size: int | None = None
|
||||
if (
|
||||
mm_registry.supports_multimodal_inputs(model_config)
|
||||
and not model_config.skip_tokenizer_init
|
||||
):
|
||||
mm_budget = MultiModalBudget(vllm_config, mm_registry)
|
||||
self.mm_encoder_cache_size = mm_budget.encoder_cache_size
|
||||
mm_budget.reset_cache() # Not used anymore
|
||||
|
||||
self.input_preprocessor = InputPreprocessor(
|
||||
self.model_config,
|
||||
vllm_config.observability_config,
|
||||
model_config,
|
||||
self.observability_config,
|
||||
mm_registry,
|
||||
mm_processor_cache=self.mm_processor_cache,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user