[Core] Remove legacy input mapper/processor from V0 (#15686)

Author: Cyrus Leung
Committed by: GitHub
Date: 2025-04-28 15:38:48 +08:00
Commit: aec9674dbe (parent: 7fcc4223dc)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

21 changed files with 142 additions and 1337 deletions
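
In short: the legacy multi-modal input pipeline guard is gone. `InputPreprocessor._can_process_multimodal()` is deleted, every multimodal model is now assumed to have a processor registered in the multi-modal registry, and the legacy `multi_modal_data`/`mm_processor_kwargs` pass-throughs to `token_inputs()` are removed. A minimal sketch of the resulting control flow, condensed from the hunks below (an illustration, not the verbatim vLLM source):

```python
from vllm.inputs import token_inputs  # public helper used throughout the diff

# Condensed sketch of the post-commit gating; `content` stands for the
# parsed "tokens" content dict seen in the hunks below.
def prompt_to_llm_inputs_sketch(self, content: dict):
    multi_modal_data = content.get("multi_modal_data")
    mm_processor_kwargs = content.get("mm_processor_kwargs")

    if multi_modal_data is not None:
        # No _can_process_multimodal() check any more: a registered
        # multi-modal processor is assumed to exist for the model.
        return self._process_multimodal(
            content["prompt_token_ids"],
            multi_modal_data,
            mm_processor_kwargs,
        )

    # Text-only path: multi-modal kwargs are no longer threaded through.
    return token_inputs(prompt_token_ids=content["prompt_token_ids"])
```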


@@ -223,28 +223,6 @@ class InputPreprocessor:
             lora_request=lora_request,
             add_special_tokens=add_special_tokens)
 
-    def _can_process_multimodal(self) -> bool:
-        model_config = self.model_config
-
-        if not model_config.is_multimodal_model:
-            raise ValueError("Your model does not support multi-modal inputs")
-
-        # Interim measure so we can handle models that have yet to be
-        # updated to use the new multi-modal processor
-        can_process_multimodal = self.mm_registry.has_processor(model_config)
-        if not can_process_multimodal:
-            from vllm.model_executor.models.registry import _VLLM_MODELS
-            if not any(arch in _VLLM_MODELS
-                       for arch in model_config.architectures):
-                logger.warning_once(
-                    "Your model uses the legacy input pipeline, which will be "
-                    "removed in an upcoming release. "
-                    "Please upgrade to the new multi-modal processing pipeline "
-                    "(https://docs.vllm.ai/en/latest/design/mm_processing.html)"
-                )
-
-        return can_process_multimodal
-
     def _process_multimodal(
         self,
         prompt: Union[str, list[int]],
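
The deleted guard reduced to a single registry probe plus a deprecation warning. For reference, an equivalent standalone probe, assuming the public `MULTIMODAL_REGISTRY` singleton from `vllm.multimodal` (the `has_processor()` call is the one visible in the removed lines above):

```python
from vllm.multimodal import MULTIMODAL_REGISTRY

def has_mm_processor(model_config) -> bool:
    # Same registry check the removed _can_process_multimodal() wrapped;
    # after this commit it is assumed to hold for all multimodal models.
    return MULTIMODAL_REGISTRY.has_processor(model_config)
```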
@@ -258,8 +236,7 @@ class InputPreprocessor:
         returning the corresponding token IDs and metadata.
         """
         # At the moment on model (PrithviGeoSpatialMAE) requires to be
-        # initialized without a tokenizer while using also multi-modal
-        # input.
+        # initialized without a tokenizer while using also multi-modal input
         if not self.tokenizer:
             tokenizer = object()  # Dummy
         else:
@@ -285,8 +262,7 @@ class InputPreprocessor:
     ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         # At the moment on model (PrithviGeoSpatialMAE) requires to be
-        # initialized without a tokenizer while using also multi-modal
-        # input.
+        # initialized without a tokenizer while using also multi-modal input
         if not self.tokenizer:
             tokenizer = object()  # Dummy
         else:
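
The two comment tweaks above sit next to the dummy-tokenizer workaround for PrithviGeoSpatialMAE. A standalone illustration of that pattern (hypothetical helper; only the `object()` placeholder is from the diff):

```python
def resolve_tokenizer(maybe_tokenizer):
    # PrithviGeoSpatialMAE is initialized without a tokenizer even though
    # it consumes multi-modal input, so a bare object() stands in where
    # the multi-modal processor expects a tokenizer argument.
    if not maybe_tokenizer:
        return object()  # Dummy
    return maybe_tokenizer
```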
@@ -343,7 +319,7 @@ class InputPreprocessor:
             multi_modal_data = tokens_content.get("multi_modal_data")
             mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
 
-            if multi_modal_data is not None and self._can_process_multimodal():
+            if multi_modal_data is not None:
                 return self._process_multimodal(
                     prompt_token_ids,
                     multi_modal_data,
@@ -355,8 +331,6 @@ class InputPreprocessor:
             return token_inputs(
                 prompt_token_ids=prompt_token_ids,
                 token_type_ids=token_type_ids,
-                multi_modal_data=multi_modal_data,
-                mm_processor_kwargs=mm_processor_kwargs,
             )
 
         if parsed["type"] == "text":
@@ -366,7 +340,7 @@ class InputPreprocessor:
             multi_modal_data = text_content.get("multi_modal_data")
             mm_processor_kwargs = text_content.get("mm_processor_kwargs")
 
-            if multi_modal_data is not None and self._can_process_multimodal():
+            if multi_modal_data is not None:
                 return self._process_multimodal(
                     prompt_text,
                     multi_modal_data,
@@ -383,8 +357,6 @@ class InputPreprocessor:
             return token_inputs(
                 prompt=prompt_text,
                 prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
-                mm_processor_kwargs=mm_processor_kwargs,
             )
 
         assert_never(parsed)
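
With the legacy pipeline gone, the text-only branch above shrinks to just the prompt fields. A condensed sketch of the call after this commit (`token_inputs` is the public helper from `vllm.inputs`; the surrounding variables are assumed in scope):

```python
from vllm.inputs import token_inputs

def build_text_inputs(prompt_text: str, prompt_token_ids: list[int]):
    # Before this commit the call also forwarded multi_modal_data= and
    # mm_processor_kwargs=; multi-modal data now goes exclusively through
    # _process_multimodal(), so only the prompt fields remain here.
    return token_inputs(
        prompt=prompt_text,
        prompt_token_ids=prompt_token_ids,
    )
```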
@@ -417,7 +389,7 @@ class InputPreprocessor:
             multi_modal_data = tokens_content.get("multi_modal_data")
             mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
 
-            if multi_modal_data is not None and self._can_process_multimodal():
+            if multi_modal_data is not None:
                 return await self._process_multimodal_async(
                     prompt_token_ids,
                     multi_modal_data,
@@ -426,11 +398,7 @@ class InputPreprocessor:
                     return_mm_hashes=return_mm_hashes,
                 )
 
-            return token_inputs(
-                prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
-                mm_processor_kwargs=mm_processor_kwargs,
-            )
+            return token_inputs(prompt_token_ids=prompt_token_ids)
 
         if parsed["type"] == "text":
             text_content = parsed["content"]
@@ -439,7 +407,7 @@ class InputPreprocessor:
             multi_modal_data = text_content.get("multi_modal_data")
             mm_processor_kwargs = text_content.get("mm_processor_kwargs")
 
-            if multi_modal_data is not None and self._can_process_multimodal():
+            if multi_modal_data is not None:
                 return await self._process_multimodal_async(
                     prompt_text,
                     multi_modal_data,
@@ -456,8 +424,6 @@ class InputPreprocessor:
             return token_inputs(
                 prompt=prompt_text,
                 prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
-                mm_processor_kwargs=mm_processor_kwargs,
             )
 
         assert_never(parsed)
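
The async variants above mirror the sync branches line for line. A hypothetical driver showing how the async entry point is awaited (the setup around it is assumed; only the awaited method name comes from the diff):

```python
import asyncio

async def preprocess_one(preprocessor, prompt):
    # `preprocessor` is an InputPreprocessor; _prompt_to_llm_inputs_async
    # now routes any multi_modal_data straight to the multi-modal
    # processor, with no legacy fallback to consider.
    return await preprocessor._prompt_to_llm_inputs_async(prompt)

# Usage (hypothetical): asyncio.run(preprocess_one(preprocessor, prompt))
```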
@@ -594,15 +560,13 @@ class InputPreprocessor:
                 decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
                 # For multimodal model, override decoder prompt from processor
                 # with explicit decoder prompt.
-                if self.model_config.is_multimodal_model and (
-                        self._can_process_multimodal()):
+                if self.model_config.is_multimodal_model:
                     encoder_inputs, decoder_inputs = (
                         self._separate_enc_dec_inputs_from_mm_processor_outputs(
                             encoder_inputs, decoder_inputs))
         else:
             inputs = self._prompt_to_llm_inputs(prompt)
-            if self.model_config.is_multimodal_model and (
-                    self._can_process_multimodal()):
+            if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
                 encoder_inputs, decoder_inputs = (
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
@@ -637,15 +601,13 @@ class InputPreprocessor:
                 # For multimodal model, override decoder prompt from processor
                 # with explicit decoder prompt.
-                if self.model_config.is_multimodal_model and (
-                        self._can_process_multimodal()):
+                if self.model_config.is_multimodal_model:
                     encoder_inputs, decoder_inputs = (
                         self._separate_enc_dec_inputs_from_mm_processor_outputs(
                             encoder_inputs, decoder_inputs))
         else:
             inputs = await self._prompt_to_llm_inputs_async(prompt)
-            if self.model_config.is_multimodal_model and (
-                    self._can_process_multimodal()):
+            if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
                 encoder_inputs, decoder_inputs = (
                     self._separate_enc_dec_inputs_from_mm_processor_outputs(
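
Finally, the encoder-decoder paths (sync and async) drop the same guard. A condensed helper-style sketch of the shared shape after this commit (hypothetical wrapper; the attribute and method names are the ones in the hunks above):

```python
def split_enc_dec_if_multimodal(preprocessor, encoder_inputs, decoder_inputs):
    # For any multimodal model the processor output is now always split
    # into encoder and decoder inputs; there is no
    # _can_process_multimodal() escape hatch left.
    if preprocessor.model_config.is_multimodal_model:
        encoder_inputs, decoder_inputs = (
            preprocessor._separate_enc_dec_inputs_from_mm_processor_outputs(
                encoder_inputs, decoder_inputs))
    return encoder_inputs, decoder_inputs
```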