[Refactor] Dynamic target and content for prompt updates (#23411)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-25 14:39:58 +08:00
committed by GitHub
parent 49ab23b3cc
commit 712d0f88d8
9 changed files with 463 additions and 456 deletions

View File

@@ -824,9 +824,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
processor=hf_processor,
)
image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
return image_tokens
return [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
def get_audio_replacement_phi4mm(item_idx: int):
audios = mm_items.get_items("audio", AudioProcessorItems)
@@ -837,28 +835,20 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
audio_embed_size = self.info._compute_audio_embed_size(
audio_frames)
audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
return [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
return audio_tokens
num_images = mm_items.get_count("image", strict=False)
num_audios = mm_items.get_count("audio", strict=False)
image_repl = [
return [
PromptReplacement(
modality="image",
target=image_token,
target=image_tokens.__getitem__,
replacement=get_image_replacement_phi4mm,
) for image_token in image_tokens[:num_images]
]
audio_repl = [
),
PromptReplacement(
modality="audio",
target=audio_token,
target=audio_tokens.__getitem__,
replacement=get_audio_replacement_phi4mm,
) for audio_token in audio_tokens[:num_audios]
),
]
return image_repl + audio_repl
@MULTIMODAL_REGISTRY.register_processor(