Add multimodal input method in the documentation (#31601)

Signed-off-by: xiaoming <1259730330@qq.com>
This commit is contained in:
labAxiaoming
2026-01-02 20:43:30 +08:00
committed by GitHub
parent cc410e8644
commit a01f2faedf
2 changed files with 96 additions and 0 deletions

View File

@@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows:
??? code
```python
import os
from openai import OpenAI
openai_api_key = "EMPTY"
@@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows:
)
# Single-image input inference
# Public image URL for testing remote image processing
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Create chat completion with remote image
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
messages=[
@@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows:
)
print("Chat completion output:", chat_response.choices[0].message.content)
# Local image file path (update this to point to your actual image file).
image_file = "/path/to/image.jpg"

# Create a chat completion from a local image file.
# NOTE: the API server/engine must be launched with the
# --allowed-local-media-path argument, otherwise it will refuse to read
# local files referenced through file:// URLs.
if os.path.exists(image_file):
    chat_completion_from_local_image_url = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's in this image?",
                    },
                    {
                        "type": "image_url",
                        # file:// URL — resolved server-side, hence the
                        # --allowed-local-media-path requirement above.
                        "image_url": {"url": f"file://{image_file}"},
                    },
                ],
            }
        ],
    )
    result = chat_completion_from_local_image_url.choices[0].message.content
    print("Chat completion output from local image file:\n", result)
else:
    # Best-effort example: skip gracefully when the placeholder path
    # has not been replaced with a real image.
    print(f"Local image file not found at {image_file}, skipping local file test.")
# Multi-image input inference: two publicly hosted test images
# (a lion and a duck) served from the vLLM public assets bucket.
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"