[Model] Update multi-modal processor to support Mantis(LLaVA) model (#10711)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -34,7 +34,7 @@ COMMON_BROADCAST_SETTINGS = {
     "dtype": "half",
     "max_tokens": 5,
     "tensor_parallel_size": 2,
-    "model_kwargs": {"device_map": "auto"},
+    "hf_model_kwargs": {"device_map": "auto"},
     "image_size_factors": [(.25, 0.5, 1.0)],
     "distributed_executor_backend": (
         "ray",
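The rename from model_kwargs to hf_model_kwargs makes explicit that these kwargs configure the HuggingFace reference runner rather than vLLM. As a rough sketch of how such a setting would plausibly be consumed (the load_hf_reference_model helper below is hypothetical, not taken from this diff), the dict is forwarded verbatim to from_pretrained():

# Sketch only: "hf_model_kwargs" such as {"device_map": "auto"} are assumed
# to be passed straight through to the HF model constructor. The helper name
# is hypothetical; the real test-harness wiring may differ.
from transformers import AutoModelForVision2Seq

def load_hf_reference_model(model_name: str, hf_model_kwargs: dict | None = None):
    return AutoModelForVision2Seq.from_pretrained(
        model_name,
        **(hf_model_kwargs or {}),
    )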
@@ -108,7 +108,7 @@ VLM_TEST_SETTINGS = {
             "cherry_blossom": "What is in the picture?",
         }),
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
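cast_dtype_post_processor replaces get_key_type_post_processor throughout this commit; the new name describes what the helper actually does: it returns a post-processor that casts one key of the processed HF inputs to the test's dtype. A rough sketch of such a helper, assuming dict-like inputs holding tensors (the real model_utils implementation may differ):

import torch

def cast_dtype_post_processor(hf_inp_key: str):
    # Sketch only: returns a post-processor that casts hf_inputs[hf_inp_key]
    # (e.g. "pixel_values") to the requested dtype, leaving other keys as-is.
    def process(hf_inputs: dict, dtype: str):
        hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(
            getattr(torch, dtype)  # e.g. "half" -> torch.half
        )
        return hf_inputs
    return process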
@@ -151,7 +151,7 @@ VLM_TEST_SETTINGS = {
             "cherry_blossom": "<vlm_image>Please infer the season with reason.",
         }),
         multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
-        postprocess_inputs=model_utils.get_key_type_post_processor("pixel_values"),
+        postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"),
         stop_str=["<|im_end|>"],
         image_size_factors=[(0.10, 0.15)],
         max_tokens=64,
@@ -177,7 +177,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         # For chameleon, we only compare the sequences
@@ -281,7 +281,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
         num_video_frames=16,
         max_model_len=16384,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values_videos"
         ),
         auto_cls=AutoModelForVision2Seq,
@@ -306,6 +306,20 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
         image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
     ),
+    "mantis": VLMTestInfo(
+        models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+        max_model_len=4096,
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
+            "pixel_values"
+        ),
+        vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501
+        get_stop_token_ids=lambda tok: [128009],
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
+        patch_hf_runner=model_utils.mantis_patch_hf_runner,
+    ),
     "minicpmv_25": VLMTestInfo(
         models=["openbmb/MiniCPM-Llama3-V-2_5"],
         test_type=VLMTestType.IMAGE,
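The new "mantis" entry is the substance of this commit. Mantis-8B-siglip-llama3 ships a LLaVA-style HF config, so the test pins the architecture to vLLM's MantisForConditionalGeneration via hf_overrides, and get_stop_token_ids stops generation at token 128009, Llama 3's <|eot_id|>. A sketch of the equivalent override when constructing an engine directly, assuming vLLM's LLM entrypoint accepts hf_overrides as the vllm_runner_kwargs above suggests:

# Sketch only: mirrors what vllm_runner_kwargs={"hf_overrides": ...} asks the
# test harness to do when it spins up the vLLM runner.
from vllm import LLM

llm = LLM(
    model="TIGER-Lab/Mantis-8B-siglip-llama3",
    max_model_len=4096,
    # Route the checkpoint through vLLM's Mantis implementation instead of
    # the plain LLaVA class its HF config would otherwise select.
    hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
)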
@@ -342,7 +356,7 @@ VLM_TEST_SETTINGS = {
    #     max_num_seqs=2,
    #     task="generate",
    #     # use eager mode for hf runner since phi3v didn't work with flash_attn
-   #     model_kwargs={"_attn_implementation": "eager"},
+   #     hf_model_kwargs={"_attn_implementation": "eager"},
    #     use_tokenizer_eos=True,
    #     vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
    #     num_logprobs=10,
@@ -373,7 +387,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
@@ -438,7 +452,7 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        postprocess_inputs=model_utils.get_key_type_post_processor(
+        postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
         auto_cls=AutoModelForVision2Seq,