[Doc] Consolidate whisper and florence2 examples (#14050)
examples/offline_inference/encoder_decoder_multimodal.py (new file, 158 lines)
@@ -0,0 +1,158 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser


def run_florence2():
    # Create a Florence-2 encoder/decoder model instance
    llm = LLM(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image
            },
        },
        {  # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {
                    "image": ImageAsset("cherry_blossom").pil_image
                },
            },
            "decoder_prompt": "",
        },
    ]
    return llm, prompts


def run_mllama():
    # Create a Mllama encoder/decoder model instance
    llm = LLM(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {  # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]
    return llm, prompts


def run_whisper():
    # Create a Whisper encoder/decoder model instance
    llm = LLM(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]
    return llm, prompts


model_example_map = {
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    llm, prompts = model_example_map[model]()

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'encoder/decoder multimodal models for text generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="mllama",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')

    args = parser.parse_args()
    main(args)
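
Usage note (not part of the committed file): the consolidated script selects a runner via `--model-type`/`-m`, whose choices come from `model_example_map` (florence2, mllama, whisper) and which defaults to `mllama`. Assuming vLLM is installed and the listed checkpoints are accessible, invocations along these lines should exercise each path:

python examples/offline_inference/encoder_decoder_multimodal.py --model-type whisper
python examples/offline_inference/encoder_decoder_multimodal.py -m florence2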