Add multimodal input method in the documentation (#31601)

Signed-off-by: xiaoming <1259730330@qq.com>
This commit is contained in:
labAxiaoming
2026-01-02 20:43:30 +08:00
committed by GitHub
parent cc410e8644
commit a01f2faedf
2 changed files with 96 additions and 0 deletions

View File

@@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows:
??? code
```python
import os
from openai import OpenAI
openai_api_key = "EMPTY"
@@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows:
)
# Single-image input inference
# Public image URL for testing remote image processing
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Create chat completion with remote image
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
messages=[
@@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows:
)
print("Chat completion output:", chat_response.choices[0].message.content)
# Local image file path (update this to point to your actual image file).
image_file = "/path/to/image.jpg"

# Create a chat completion from a local image file.
# NOTE: the API server/engine must be launched with the
# --allowed-local-media-path argument, otherwise it will refuse to read
# local files referenced through file:// URLs.
if os.path.exists(image_file):
    chat_completion_from_local_image_url = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's in this image?",
                    },
                    {
                        "type": "image_url",
                        # file:// URL — resolved server-side, hence the
                        # --allowed-local-media-path requirement above.
                        "image_url": {"url": f"file://{image_file}"},
                    },
                ],
            }
        ],
    )
    result = chat_completion_from_local_image_url.choices[0].message.content
    print("Chat completion output from local image file:\n", result)
else:
    # Best-effort example: skip gracefully when the placeholder path
    # has not been replaced with a real image.
    print(f"Local image file not found at {image_file}, skipping local file test.")
# Multi-image input inference: two publicly hosted test images
# (a lion and a duck) served from the vLLM public assets bucket.
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"