[Refactor] Move MM data parsing outside processor (#33408)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-02-01 00:46:14 +08:00
committed by GitHub
parent 92924b2ddd
commit 88c3e114d8
43 changed files with 228 additions and 139 deletions

View File

@@ -176,7 +176,7 @@ def get_text_token_prompts(
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)
parsed_data = processor.data_parser.parse_mm_data(mm_data)
parsed_data = processor.info.parse_mm_data(mm_data)
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
text_prompt: str | None
@@ -336,17 +336,18 @@ def _test_processing_correctness_one(
model_type = model_config.hf_config.model_type
text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
mm_items = baseline_processor.info.parse_mm_data(mm_data)
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
mm_data=mm_data,
mm_items=mm_items,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
mm_data=mm_data,
mm_items=mm_items,
hf_processor_mm_kwargs={},
)
@@ -360,12 +361,12 @@ def _test_processing_correctness_one(
if text_prompt is not None:
baseline_text_result = baseline_processor.apply(
text_prompt,
mm_data=mm_data,
mm_items=mm_items,
hf_processor_mm_kwargs={},
)
cached_text_result = cached_processor.apply(
text_prompt,
mm_data=mm_data,
mm_items=mm_items,
hf_processor_mm_kwargs={},
)

View File

@@ -175,7 +175,11 @@ def test_get_image_size_with_most_features(
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
num_patches_tensor = mm_kwargs_data["num_patches"]
tokens = int(num_patches_tensor.item()) * image_seq_length

View File

@@ -52,7 +52,11 @@ def test_processor_override(
metadata["fps"] = fps
mm_data = {"video": [(video, metadata)]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -100,8 +104,16 @@ def test_video_loader_consistency(
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
static_outputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(static_mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
dynamic_outputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(dynamic_mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert batched_tensors_equal(

View File

@@ -106,7 +106,11 @@ def _run_check(
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=mm_processor_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")

View File

@@ -55,7 +55,11 @@ def test_processor_override(
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)

View File

@@ -66,7 +66,11 @@ def _run_check(
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=mm_processor_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")

View File

@@ -49,7 +49,11 @@ def test_processor_override(
if tokenized_prompt:
prompt = tokenizer.encode(prompt)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=mm_processor_kwargs,
)
mm_data = processed_inputs["mm_kwargs"].get_data()
# place holder replacements

View File

@@ -87,7 +87,11 @@ def _validate_image_prompt_replacements_one(
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs

View File

@@ -87,7 +87,11 @@ def _validate_image_prompt_replacements_one(
try:
# The processor will throw an error if there is a mismatch
# in the prompt replacements
processed_inputs = processor.apply(prompt, mm_data, {})
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs

View File

@@ -29,7 +29,11 @@ def test_processor_override(
image = Image.new("RGB", size=(364, 364))
mm_data = {"image": [image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, {})
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs
@@ -46,7 +50,11 @@ def _validate_image_prompt_replacements_one(
mm_data = {"image": [image] * num_imgs}
try:
processed_inputs = processor.apply(prompt, mm_data, {})
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)
image_placeholders = processed_inputs["mm_placeholders"]["image"]
assert len(image_placeholders) == num_imgs

View File

@@ -68,7 +68,11 @@ def _run_check(
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=mm_processor_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")

View File

@@ -47,7 +47,11 @@ def test_processor_override(
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)

View File

@@ -51,7 +51,11 @@ def test_processor_override(
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(

View File

@@ -42,7 +42,11 @@ def test_processor_override(
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -83,7 +87,11 @@ def test_get_image_size_with_most_features(
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
for asset in image_assets:
mm_data = {"image": [asset.pil_image]}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
t, h, w = grid_thw[0]
tokens = (t * h * w) // (merge_size**2)

View File

@@ -51,7 +51,11 @@ def test_processor_with_audio_sample_rate(
hf_processor_mm_kwargs: dict[str, Any] = {
"audio_sample_rate": audio_sample_rate,
}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Verify audio tokens are generated
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -90,7 +94,11 @@ def test_longer_audio_generates_more_tokens(model_id: str) -> None:
hf_processor_mm_kwargs: dict[str, Any] = {
"audio_sample_rate": audio_sample_rate,
}
processed = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
hf_proc = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
audio_token_id = tokenizer.convert_tokens_to_ids(hf_proc.audio_token)
return processed["prompt_token_ids"].count(audio_token_id)

View File

@@ -55,7 +55,11 @@ def test_processor_override(
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
processed_inputs = processor.apply(
prompt,
mm_items=processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)

View File

@@ -24,10 +24,7 @@ from vllm.distributed import (
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.model_executor.models.interfaces import supports_multimodal
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
@@ -86,7 +83,6 @@ def resize_mm_data(
def create_batched_mm_kwargs(
model_cls: type[SupportsMultiModal],
model_config: ModelConfig,
processor: BaseMultiModalProcessor,
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
@@ -102,10 +98,10 @@ def create_batched_mm_kwargs(
seq_len=model_config.max_model_len,
mm_counts=mm_counts,
)
mm_data = processor_inputs.mm_data
mm_items = processor_inputs.mm_items
resized_mm_data = {
modality: resize_mm_data(data, size_factors)
for modality, data in mm_data.items()
modality: resize_mm_data(items.data, size_factors)
for modality, items in mm_items.items()
}
# video metadata will be added back to the resized video data here.
@@ -113,7 +109,7 @@ def create_batched_mm_kwargs(
mm_kwargs = processor.apply(
prompt=token_prompt if text_prompt is None else text_prompt,
mm_data=resized_mm_data,
mm_items=processor.info.parse_mm_data(resized_mm_data),
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
@@ -246,9 +242,7 @@ def test_model_tensor_schema(model_id: str):
processor = factories.build_processor(ctx, cache=None)
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor
):
for modality, _, mm_kwargs in create_batched_mm_kwargs(model_config, processor):
for method_name in inputs_parse_methods:
print(
f"Testing `{method_name}` with modality={modality} "

View File

@@ -21,7 +21,7 @@ def test_multimodal_processor(model_id):
str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
str_processed_inputs = mm_processor.apply(
prompt=str_prompt,
mm_data=mm_data,
mm_items=mm_processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)
@@ -46,7 +46,7 @@ def test_multimodal_processor(model_id):
]
ids_processed_inputs = mm_processor.apply(
prompt=ids_prompt,
mm_data=mm_data,
mm_items=mm_processor.info.parse_mm_data(mm_data),
hf_processor_mm_kwargs={},
)