[VLM] Improve consistency between feature size calculation and dummy data for profiling (#6146)

This commit is contained in:
Roger Wang
2024-07-04 18:29:47 -07:00
committed by GitHub
parent ae96ef8fbd
commit a41357e941
2 changed files with 18 additions and 26 deletions

View File

@@ -53,6 +53,10 @@ _KEYS_TO_MODIFY_MAPPING = {
# Cannot find the following 2 numbers from hf config.
_IMAGE_TOKEN_ID = 32044
# Result in the max possible feature size (h:w = 16:1)
MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
MAX_IMAGE_FEATURE_SIZE_WIDTH = 50
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
hidden_act="quick_gelu",
hidden_size=1024,
@@ -322,24 +326,17 @@ def get_phi3v_image_feature_size(
def get_max_phi3v_image_tokens(ctx: InputContext):
# Result in the max possible feature size (h:w = 16:1)
dummy_height, dummy_width = 8000, 50
return get_phi3v_image_feature_size(
ctx.get_hf_config(PretrainedConfig),
input_height=dummy_height,
input_width=dummy_width,
input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
)
def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
# Result in the max possible feature size (h:w = 16:1)
dummy_height, dummy_width = 8000, 50
image_feature_size = get_phi3v_image_feature_size(
ctx.get_hf_config(PretrainedConfig),
input_height=dummy_height,
input_width=dummy_width,
)
image_feature_size = get_max_phi3v_image_tokens(ctx)
seq_data = dummy_seq_data_for_clip(
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
@@ -349,8 +346,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
)
mm_data = dummy_image_for_clip(
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
image_width_override=dummy_width,
image_height_override=dummy_height,
image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
)
return seq_data, mm_data