[VLM] Calculate maximum number of multi-modal tokens by model (#6121)

This commit is contained in:
Cyrus Leung
2024-07-05 07:37:23 +08:00
committed by GitHub
parent 69ec3ca14c
commit ae96ef8fbd
12 changed files with 265 additions and 95 deletions

View File

@@ -35,6 +35,10 @@ def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
patch_size=hf_config.patch_size)
def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int:
    """Return the maximum number of image tokens for a CLIP vision config.

    The value is whatever ``get_clip_image_feature_size`` reports for the
    same ``hf_config``; no further adjustment is applied here.
    """
    max_image_tokens = get_clip_image_feature_size(hf_config)
    return max_image_tokens
def dummy_seq_data_for_clip(
hf_config: CLIPVisionConfig,
seq_len: int,