From 40a8756224861cd4a1b1c7969ae1b2a86dc33f3d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 27 Dec 2025 21:42:02 +0800 Subject: [PATCH] [Chore]: Remove HF format Phi4-MM examples (#31405) Signed-off-by: Isotr0py --- examples/offline_inference/audio_language.py | 32 ----------------- examples/offline_inference/vision_language.py | 36 ------------------- .../vision_language_multi_image.py | 35 ------------------ 3 files changed, 103 deletions(-) diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index a6d0c5d12..4bc2112c0 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -213,37 +213,6 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: ) -def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process audio inputs. - """ - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights.
- speech_lora_path = os.path.join(model_path, "speech-lora") - placeholders = "<|audio|>" * audio_count - - prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - - engine_args = EngineArgs( - model=model_path, - max_model_len=12800, - max_num_seqs=2, - enable_lora=True, - max_lora_rank=320, - limit_mm_per_prompt={"audio": audio_count}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompts, - lora_requests=[LoRARequest("speech", 1, speech_lora_path)], - ) - - # Qwen2-Audio def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" @@ -416,7 +385,6 @@ model_example_map = { "midashenglm": run_midashenglm, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, - "phi4_multimodal": run_phi4_multimodal, "qwen2_audio": run_qwen2_audio, "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index dd5b22ae9..dfca7d5c9 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1424,41 +1424,6 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: ) -# HF format Phi-4-multimodal-instruct -def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process image inputs. - """ - assert modality == "image" - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights.
- vision_lora_path = os.path.join(model_path, "vision-lora") - prompts = [ - f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions - ] - engine_args = EngineArgs( - model=model_path, - max_model_len=5120, - max_num_seqs=2, - max_num_batched_tokens=12800, - enable_lora=True, - max_lora_rank=320, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"dynamic_hd": 16}, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - lora_requests=[LoRARequest("vision", 1, vision_lora_path)], - ) - - # Pixtral HF-format def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1904,7 +1869,6 @@ model_example_map = { "paligemma2": run_paligemma2, "phi3_v": run_phi3v, "phi4_mm": run_phi4mm, - "phi4_multimodal": run_phi4_multimodal, "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 3c01806ba..2d7aece52 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -932,40 +932,6 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData: - """ - Phi-4-multimodal-instruct supports both image and audio inputs. Here, we - show how to process multi images inputs. - """ - - model_path = snapshot_download( - "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70" - ) - # Since the vision-lora and speech-lora co-exist with the base model, - # we have to manually specify the path of the lora weights.
- vision_lora_path = os.path.join(model_path, "vision-lora") - engine_args = EngineArgs( - model=model_path, - max_model_len=4096, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - enable_lora=True, - max_lora_rank=320, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"dynamic_hd": 4}, - ) - - placeholders = "<|image|>" * len(image_urls) - prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - lora_requests=[LoRARequest("vision", 1, vision_lora_path)], - ) - - def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" engine_args = EngineArgs( @@ -1363,7 +1329,6 @@ model_example_map = { "paddleocr_vl": load_paddleocr_vl, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, - "phi4_multimodal": load_phi4_multimodal, "pixtral_hf": load_pixtral_hf, "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl,