[VLM] Remove image_input_type from VLM config (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
xwjiang2010
2024-07-02 00:57:09 -07:00
committed by GitHub
parent 2c37540aa6
commit 98d6682cd1
35 changed files with 329 additions and 751 deletions

View File

@@ -1,7 +1,6 @@
import argparse
import dataclasses
import json
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
@@ -80,13 +79,9 @@ class EngineArgs:
preemption_mode: Optional[str] = None
# Related to Vision-language models such as llava
image_input_type: Optional[str] = None
image_token_id: Optional[int] = None
image_input_shape: Optional[str] = None
image_feature_size: Optional[int] = None
image_processor: Optional[str] = None
image_processor_revision: Optional[str] = None
disable_image_processor: bool = False
scheduler_delay_factor: float = 0.0
enable_chunked_prefill: bool = False
@@ -114,14 +109,6 @@ class EngineArgs:
@staticmethod
def add_cli_args_for_vlm(
parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument('--image-input-type',
type=nullable_str,
default=None,
choices=[
t.name.lower()
for t in VisionLanguageConfig.ImageInputType
],
help=('The image input type passed into vLLM.'))
parser.add_argument('--image-token-id',
type=int,
default=None,
@@ -137,24 +124,6 @@ class EngineArgs:
type=int,
default=None,
help=('The image feature size along the context dimension.'))
parser.add_argument(
'--image-processor',
type=str,
default=EngineArgs.image_processor,
help='Name or path of the huggingface image processor to use. '
'If unspecified, model name or path will be used.')
parser.add_argument(
'--image-processor-revision',
type=str,
default=None,
help='Revision of the huggingface image processor version to use. '
'It can be a branch name, a tag name, or a commit id. '
'If unspecified, will use the default version.')
parser.add_argument(
'--disable-image-processor',
action='store_true',
help='Disables the use of image processor, even if one is defined '
'for the model on huggingface.')
return parser
@@ -679,33 +648,16 @@ class EngineArgs:
raise ValueError(
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")
if self.image_input_type:
if (not self.image_token_id or not self.image_input_shape
or not self.image_feature_size):
if self.image_token_id is not None:
if (not self.image_input_shape or not self.image_feature_size):
raise ValueError(
'Specify `image_token_id`, `image_input_shape` and '
'`image_feature_size` together with `image_input_type`.')
if self.image_processor is None:
self.image_processor = self.model
if self.disable_image_processor:
if self.image_processor != self.model:
warnings.warn(
"You've specified an image processor "
f"({self.image_processor}) but also disabled "
"it via `--disable-image-processor`.",
stacklevel=2)
self.image_processor = None
'Specify `image_input_shape` and '
'`image_feature_size` together with `image_token_id`.')
vision_language_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig.
get_image_input_enum_type(self.image_input_type),
image_token_id=self.image_token_id,
image_input_shape=str_to_int_tuple(self.image_input_shape),
image_feature_size=self.image_feature_size,
image_processor=self.image_processor,
image_processor_revision=self.image_processor_revision,
)
else:
vision_language_config = None