diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index c3fd726e9..5dc1ebbf5 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows: ??? code ```python + import os from openai import OpenAI openai_api_key = "EMPTY" @@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows: ) # Single-image input inference + + # Public image URL for testing remote image processing image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + # Create chat completion with remote image chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", messages=[ @@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows: ) print("Chat completion output:", chat_response.choices[0].message.content) + # Local image file path (update this to point to your actual image file) + image_file = "/path/to/image.jpg" + + # Create chat completion with local image file + # Launch the API server/engine with the --allowed-local-media-path argument. 
def encode_base64_content_from_file(file_path: str) -> str:
    """Return the bytes of a local file as a base64 string.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    # Binary mode: the payload is an image, so no text decoding is applied.
    with open(file_path, "rb") as handle:
        encoded_bytes = base64.b64encode(handle.read())

    return encoded_bytes.decode("utf-8")
url in the payload image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_file = "/path/to/image.jpg" # local file chat_completion_from_url = client.chat.completions.create( messages=[ { @@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:\n", result) + ## Use local image url in the payload + # Launch the API server/engine with the --allowed-local-media-path argument. + if os.path.exists(image_file): + chat_completion_from_local_image_url = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"file://{image_file}"}, + }, + ], + } + ], + model=model, + max_completion_tokens=max_completion_tokens, + ) + result = chat_completion_from_local_image_url.choices[0].message.content + print("Chat completion output from local image file:\n", result) + else: + print(f"Local image file not found at {image_file}, skipping local file test.") + ## Use base64 encoded image in the payload image_base64 = encode_base64_content_from_url(image_url) chat_completion_from_base64 = client.chat.completions.create( @@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from base64 encoded image:", result) + ## Use base64 encoded local image in the payload + if os.path.exists(image_file): + local_image_base64 = encode_base64_content_from_file(image_file) + chat_completion_from_local_image_base64 = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": { + "url": 
f"data:image/jpeg;base64,{local_image_base64}" + }, + }, + ], + } + ], + model=model, + max_completion_tokens=max_completion_tokens, + ) + + result = chat_completion_from_local_image_base64.choices[0].message.content + print("Chat completion output from base64 encoded local image:", result) + else: + print(f"Local image file not found at {image_file}, skipping local file test.") + # Multi-image input inference def run_multi_image(model: str, max_completion_tokens: int) -> None: