[Model] Expose dynamic_image_size as mm_processor_kwargs for InternVL2 models (#10518)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
@@ -123,8 +123,15 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int,
     return blocks, target_width, target_height
 
 
-def calculate_num_blocks_wrapper(hf_config: PretrainedConfig,
-                                 max_dynamic_patch: Optional[int] = None):
+def calculate_num_blocks_wrapper(
+    hf_config: PretrainedConfig,
+    max_dynamic_patch: Optional[int] = None,
+    dynamic_image_size: Optional[bool] = None,
+):
+    if dynamic_image_size is None:
+        dynamic_image_size = hf_config.dynamic_image_size
+
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
     if max_dynamic_patch is None:
         max_dynamic_patch = hf_config.max_dynamic_patch
     min_num = hf_config.min_dynamic_patch
@@ -183,10 +190,17 @@ def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int,
     return pixel_values
 
 
-def image_to_pixel_values_wrapper(hf_config: PretrainedConfig,
-                                  max_dynamic_patch: Optional[int] = None):
+def image_to_pixel_values_wrapper(
+    hf_config: PretrainedConfig,
+    max_dynamic_patch: Optional[int] = None,
+    dynamic_image_size: Optional[bool] = None,
+):
     image_size = hf_config.vision_config.image_size
     min_num = hf_config.min_dynamic_patch
+    if dynamic_image_size is None:
+        dynamic_image_size = hf_config.dynamic_image_size
+
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
     if max_dynamic_patch is None:
         max_dynamic_patch = hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
@@ -207,11 +221,17 @@ def get_internvl_num_patches(hf_config: PretrainedConfig):
         (downsample_ratio**2))
 
 
-def get_max_internvl_image_tokens(ctx: InputContext,
-                                  *,
-                                  max_dynamic_patch: Optional[int] = None):
+def get_max_internvl_image_tokens(
+    ctx: InputContext,
+    *,
+    max_dynamic_patch: Optional[int] = None,
+    dynamic_image_size: Optional[bool] = None,
+):
     hf_config = ctx.get_hf_config()
+    if dynamic_image_size is None:
+        dynamic_image_size = hf_config.dynamic_image_size
 
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
     if max_dynamic_patch is None:
         max_dynamic_patch = hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
@@ -222,12 +242,18 @@ def get_max_internvl_image_tokens(ctx: InputContext,
     return num_patches * max_dynamic_patch
 
 
-def get_max_internvl_image_size(ctx: InputContext,
-                                *,
-                                max_dynamic_patch: Optional[int] = None):
+def get_max_internvl_image_size(
+    ctx: InputContext,
+    *,
+    max_dynamic_patch: Optional[int] = None,
+    dynamic_image_size: Optional[bool] = None,
+):
     hf_config = ctx.get_hf_config()
     image_size = hf_config.vision_config.image_size
+    if dynamic_image_size is None:
+        dynamic_image_size = hf_config.dynamic_image_size
 
+    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
     if max_dynamic_patch is None:
         max_dynamic_patch = hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
@@ -281,6 +307,7 @@ class InternVLInputPipeline:
         inputs: DecoderOnlyInputs,
         *,
         max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
     ) -> DecoderOnlyInputs:
         multi_modal_data = inputs.get("multi_modal_data")
         if multi_modal_data is None or "image" not in multi_modal_data:
@@ -292,7 +319,7 @@ class InternVLInputPipeline:
         image_data = multi_modal_data["image"]
         num_patches = get_internvl_num_patches(hf_config)
         num_blocks_calculator = calculate_num_blocks_wrapper(
-            hf_config, max_dynamic_patch)
+            hf_config, max_dynamic_patch, dynamic_image_size)
         if isinstance(image_data, Image.Image):
             width, height = image_data.size
             num_blocks, _, _ = num_blocks_calculator(width, height)
@@ -332,11 +359,12 @@ class InternVLInputPipeline:
         data: object,
         *,
         max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
     ):
         hf_config = ctx.get_hf_config()
 
         image_pixel_values_mapper = image_to_pixel_values_wrapper(
-            hf_config, max_dynamic_patch)
+            hf_config, max_dynamic_patch, dynamic_image_size)
         if isinstance(data, Image.Image):
             data = image_pixel_values_mapper(data)
             # Add an N dimension for number of images per prompt (currently 1).
@@ -366,13 +394,17 @@ class InternVLInputPipeline:
         mm_counts: Mapping[str, int],
         *,
         max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
     ):
         num_images = mm_counts["image"]
 
         hf_config = ctx.get_hf_config()
 
         image_feature_size = get_max_internvl_image_tokens(
-            ctx, max_dynamic_patch=max_dynamic_patch)
+            ctx,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
         model_config = ctx.model_config
         tokenizer = cached_get_tokenizer(
             model_config.tokenizer,
@@ -388,7 +420,10 @@ class InternVLInputPipeline:
         )
 
         max_image_width, max_image_height = get_max_internvl_image_size(
-            ctx, max_dynamic_patch=max_dynamic_patch)
+            ctx,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
 
         mm_data = dummy_image_for_clip(
             hf_config.vision_config,
Reference in New Issue
Block a user