[Refactor] Move MM data parsing outside processor (#33408)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-02-01 00:46:14 +08:00
committed by GitHub
parent 92924b2ddd
commit 88c3e114d8
43 changed files with 228 additions and 139 deletions

View File

@@ -176,7 +176,7 @@ def get_text_token_prompts(
if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data)
-parsed_data = processor.data_parser.parse_mm_data(mm_data)
+parsed_data = processor.info.parse_mm_data(mm_data)
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
text_prompt: str | None
@@ -336,17 +336,18 @@ def _test_processing_correctness_one(
model_type = model_config.hf_config.model_type
text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
+mm_items = baseline_processor.info.parse_mm_data(mm_data)
ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
baseline_tokenized_result = baseline_processor.apply(
token_prompt,
-mm_data=mm_data,
+mm_items=mm_items,
hf_processor_mm_kwargs={},
)
cached_tokenized_result = cached_processor.apply(
token_prompt,
-mm_data=mm_data,
+mm_items=mm_items,
hf_processor_mm_kwargs={},
)
@@ -360,12 +361,12 @@ def _test_processing_correctness_one(
if text_prompt is not None:
baseline_text_result = baseline_processor.apply(
text_prompt,
-mm_data=mm_data,
+mm_items=mm_items,
hf_processor_mm_kwargs={},
)
cached_text_result = cached_processor.apply(
text_prompt,
-mm_data=mm_data,
+mm_items=mm_items,
hf_processor_mm_kwargs={},
)