[vlm] Remove vision language config. (#6089)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
xwjiang2010
2024-07-03 15:14:16 -07:00
committed by GitHub
parent 3c6325f0fc
commit d9e98f42e4
43 changed files with 371 additions and 465 deletions

View File

@@ -6,11 +6,11 @@ from typing import List, Optional, Tuple, Union
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
ObservabilityConfig, ParallelConfig, SchedulerConfig,
SpeculativeConfig, TokenizerPoolConfig,
VisionLanguageConfig)
MultiModalConfig, ObservabilityConfig, ParallelConfig,
SchedulerConfig, SpeculativeConfig,
TokenizerPoolConfig)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser, str_to_int_tuple
from vllm.utils import FlexibleArgumentParser
def nullable_str(val: str):
@@ -78,11 +78,6 @@ class EngineArgs:
model_loader_extra_config: Optional[dict] = None
preemption_mode: Optional[str] = None
# Related to Vision-language models such as llava
image_token_id: Optional[int] = None
image_input_shape: Optional[str] = None
image_feature_size: Optional[int] = None
scheduler_delay_factor: float = 0.0
enable_chunked_prefill: bool = False
@@ -106,27 +101,6 @@ class EngineArgs:
if self.tokenizer is None:
self.tokenizer = self.model
@staticmethod
def add_cli_args_for_vlm(
parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument('--image-token-id',
type=int,
default=None,
help=('Input id for image token.'))
parser.add_argument(
'--image-input-shape',
type=nullable_str,
default=None,
help=('The biggest image input shape (worst for memory footprint) '
'given an input type. Only used for vLLM\'s profile_run.'))
parser.add_argument(
'--image-feature-size',
type=int,
default=None,
help=('The image feature size along the context dimension.'))
return parser
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Shared CLI arguments for vLLM engine."""
@@ -484,9 +458,6 @@ class EngineArgs:
],
help='Device type for vLLM execution.')
# Related to Vision-language models such as llava
parser = EngineArgs.add_cli_args_for_vlm(parser)
parser.add_argument(
'--scheduler-delay-factor',
type=float,
@@ -648,19 +619,7 @@ class EngineArgs:
raise ValueError(
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")
if self.image_token_id is not None:
if (not self.image_input_shape or not self.image_feature_size):
raise ValueError(
'Specify `image_input_shape` and '
'`image_feature_size` together with `image_token_id`.')
vision_language_config = VisionLanguageConfig(
image_token_id=self.image_token_id,
image_input_shape=str_to_int_tuple(self.image_input_shape),
image_feature_size=self.image_feature_size,
)
else:
vision_language_config = None
multimodal_config = MultiModalConfig()
device_config = DeviceConfig(device=self.device)
model_config = ModelConfig(
@@ -685,7 +644,7 @@ class EngineArgs:
disable_sliding_window=self.disable_sliding_window,
skip_tokenizer_init=self.skip_tokenizer_init,
served_model_name=self.served_model_name,
multimodal_config=vision_language_config)
multimodal_config=multimodal_config)
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
@@ -787,7 +746,7 @@ class EngineArgs:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
@@ -831,7 +790,3 @@ def _engine_args_parser():
def _async_engine_args_parser():
return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
async_args_only=True)
def _vlm_engine_args_parser():
return EngineArgs.add_cli_args_for_vlm(FlexibleArgumentParser())