[Core] Dynamic image size support for VLMs (#5276)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-07-03 11:34:00 +08:00
parent 482045ee77
commit 9831aec49f
38 changed files with 1453 additions and 664 deletions
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -5,22 +5,17 @@ from PIL import Image

 from vllm import LLM, SamplingParams

-# Dynamic image input is currently not supported and therefore
-# a fixed image input shape and its corresponding feature size is required.
-# See https://github.com/vllm-project/vllm/pull/4199 for the complete
-# configuration matrix.
-

 def run_llava_next():
    llm = LLM(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
-        image_feature_size=1176,
+        # Use the maximum possible value for memory profiling
+        image_feature_size=2928,
    )

-    prompt = "[INST] " + "<image>" * 1176 + (
-        "\nWhat is shown in this image? [/INST]")
+    prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
    image = Image.open(BytesIO(requests.get(url).content))
    sampling_params = SamplingParams(temperature=0.8,