[Core] Dynamic image size support for VLMs (#5276)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: ywang96 <ywang@roblox.com>
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
This commit is contained in:
@@ -5,22 +5,17 @@ from PIL import Image
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Dynamic image input is currently not supported and therefore
|
||||
# a fixed image input shape and its corresponding feature size is required.
|
||||
# See https://github.com/vllm-project/vllm/pull/4199 for the complete
|
||||
# configuration matrix.
|
||||
|
||||
|
||||
def run_llava_next():
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
image_token_id=32000,
|
||||
image_input_shape="1,3,336,336",
|
||||
image_feature_size=1176,
|
||||
# Use the maximum possible value for memory profiling
|
||||
image_feature_size=2928,
|
||||
)
|
||||
|
||||
prompt = "[INST] " + "<image>" * 1176 + (
|
||||
"\nWhat is shown in this image? [/INST]")
|
||||
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
|
||||
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
|
||||
image = Image.open(BytesIO(requests.get(url).content))
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
|
||||
Reference in New Issue
Block a user