[Frontend] Use engine argument to control MM cache size (#22441)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -358,8 +358,8 @@ class EngineArgs:
         "media_io_kwargs")
     mm_processor_kwargs: Optional[Dict[str, Any]] = \
         MultiModalConfig.mm_processor_kwargs
-    disable_mm_preprocessor_cache: bool = \
-        MultiModalConfig.disable_mm_preprocessor_cache
+    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
+    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
     # LoRA fields
     enable_lora: bool = False
     enable_lora_bias: bool = LoRAConfig.bias_enabled
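The switch from a boolean to a size makes the cache budget tunable rather than just on/off: 0 disables the cache, any positive value sets its budget in GiB. A minimal sketch of the resulting surface; the 4 GiB default is an assumption, inferred from the `!= 4` check in a later hunk:

    from dataclasses import dataclass

    # Illustrative only -- mirrors the two fields touched above.
    @dataclass
    class _MMCacheArgs:
        # Legacy on/off switch, kept so existing invocations still parse.
        disable_mm_preprocessor_cache: bool = False  # DEPRECATED
        # New size-based knob: 0 disables the cache entirely.
        mm_processor_cache_gb: int = 4  # assumed default, per the != 4 check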
@@ -720,8 +720,11 @@ class EngineArgs:
             "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
         multimodal_group.add_argument(
-            "--disable-mm-preprocessor-cache",
-            **multimodal_kwargs["disable_mm_preprocessor_cache"])
+            "--mm-processor-cache-gb",
+            **multimodal_kwargs["mm_processor_cache_gb"])
+        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
+                                      type=bool,
+                                      deprecated=True)
         multimodal_group.add_argument(
             "--interleave-mm-strings",
             **multimodal_kwargs["interleave_mm_strings"])
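With the registration above, the deprecated flag is still parsed, so existing command lines keep working, but only `--mm-processor-cache-gb` pulls its kwargs from the config. A hedged usage sketch; the model name is a placeholder and the import path is assumed from where EngineArgs lives in this codebase:

    from vllm.engine.arg_utils import EngineArgs

    # Equivalent of `--mm-processor-cache-gb 0`, the replacement for the
    # deprecated `--disable-mm-preprocessor-cache`:
    args = EngineArgs(model="org/some-multimodal-model",  # placeholder
                      mm_processor_cache_gb=0)

    # Or give the processor cache an 8 GiB budget:
    args = EngineArgs(model="org/some-multimodal-model",
                      mm_processor_cache_gb=8)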
@@ -886,6 +889,23 @@ class EngineArgs:
             self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
             self.load_format = "runai_streamer"

+        if self.disable_mm_preprocessor_cache:
+            logger.warning(
+                "`--disable-mm-preprocessor-cache` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb 0` instead.")
+
+            self.mm_processor_cache_gb = 0
+        elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
+            logger.warning(
+                "`VLLM_MM_INPUT_CACHE_GIB` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb %d` instead.",
+                envs.VLLM_MM_INPUT_CACHE_GIB,
+            )
+
+            self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
+
         return ModelConfig(
             model=self.model,
             hf_config_path=self.hf_config_path,
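The fallback order in this hunk: the deprecated boolean wins, then a non-default VLLM_MM_INPUT_CACHE_GIB, and otherwise the new argument's own value passes through. A standalone restatement of that precedence; the helper name is illustrative, not from the codebase:

    def _resolve_cache_gb(disable_flag: bool, env_gib: int, arg_gb: int) -> int:
        # Mirrors the if/elif above; 4 is the env var's default value.
        if disable_flag:   # deprecated boolean forces the cache off
            return 0
        if env_gib != 4:   # explicitly-set env var honored until v0.13
            return env_gib
        return arg_gb      # otherwise --mm-processor-cache-gb applies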
@@ -922,7 +942,7 @@ class EngineArgs:
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
-            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+            mm_processor_cache_gb=self.mm_processor_cache_gb,
             override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
@@ -1234,13 +1254,13 @@ class EngineArgs:
         dp_supports_mm_processor_cache = (self.data_parallel_size == 1
                                           or data_parallel_external_lb)
         if (not dp_supports_mm_processor_cache
-                and not model_config.disable_mm_preprocessor_cache):
+                and model_config.mm_processor_cache_gb > 0):
             logger.warning(
                 "Multi-modal processor cache is disabled because "
                 "it is not compatible with data parallelism when "
                 "there does not exist a one-to-one correspondence "
                 "between API and engine core processes.")
-            model_config.set_disable_mm_preprocessor_cache(True)
+            model_config.set_mm_processor_cache_gb(0)

         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
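The guard above reduces to: keep the processor cache only when each API process corresponds to exactly one engine core, i.e. no data parallelism, or data parallelism behind an external load balancer. A condensed restatement, with names mirroring the hunk:

    def _mm_cache_allowed(data_parallel_size: int,
                          data_parallel_external_lb: bool) -> bool:
        # A one-to-one API-to-engine-core mapping holds either without
        # data parallelism or with an external load balancer.
        return data_parallel_size == 1 or data_parallel_external_lb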