[VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
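In short: the static image configuration (image_input_type, image_token_id, image_input_shape, image_feature_size, disable_image_processor) is no longer passed to the LLM constructor, and images are supplied through a plain "multi_modal_data" dict instead of the ImagePixelData/ImageFeatureData wrappers. A minimal sketch of the new calling convention, assembled from the example diff below (model name, prompt format, and image path are taken from the example itself; behavior on other vLLM versions may differ):

from PIL import Image

from vllm import LLM

# Vision settings are now inferred from the model config; no image_* kwargs.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# The example still repeats the <image> placeholder once per visual token.
prompt = "<image>" * 576 + (
    "\nUSER: What is the content of this image?\nASSISTANT:")

# Images go in a plain dict keyed by modality, replacing ImagePixelData.
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": Image.open("images/stop_sign.jpg")
    },
})

for o in outputs:
    print(o.outputs[0].text)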
@@ -1,38 +1,32 @@
 import argparse
 import os
 import subprocess

 import torch
 from PIL import Image

 from vllm import LLM
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData

 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
 # You can use `.buildkite/download-images.sh` to download them


-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava():
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-        disable_image_processor=disable_image_processor,
     )

     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")

-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    image = Image.open("images/stop_sign.jpg")

     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {
+            "image": image
+        },
     })

     for o in outputs:
@@ -40,45 +34,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
         print(generated_text)


-def run_llava_image_features():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="image_features",
-        image_token_id=32000,
-        image_input_shape="1,576,1024",
-        image_feature_size=576,
-    )
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-
-def main(args):
-    if args.type == "pixel_values":
-        run_llava_pixel_values()
-    else:
-        run_llava_image_features()
+def main():
+    run_llava()


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo on Llava")
-    parser.add_argument("--type",
-                        type=str,
-                        choices=["pixel_values", "image_features"],
-                        default="pixel_values",
-                        help="image input type")
-    args = parser.parse_args()
     # Download from s3
     s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
     local_directory = "images"
@@ -95,4 +55,4 @@ if __name__ == "__main__":
         local_directory,
         "--no-sign-request",
     ])
-    main(args)
+    main()
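Usage note: the old --type flag (pixel_values vs. image_features) is removed along with the image-features path, so the script now takes no arguments. Assuming the file keeps its usual location of examples/llava_example.py (the path is not shown in this diff) and the AWS CLI is installed for the s3 sync step, running it is just:

python examples/llava_example.py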