[Model] Refactor Qwen2-VL to use merged multimodal processor (#11258)

Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Isotr0py, 2024-12-20 00:28:00 +08:00 (committed by GitHub)
parent 7379b3d4b2, commit e24113a8fe
5 changed files with 272 additions and 522 deletions

@@ -447,7 +447,6 @@ def run_qwen_vl(question: str, modality: str):
 # Qwen2-VL
 def run_qwen2_vl(question: str, modality: str):
-    assert modality == "image"
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
@@ -463,8 +462,13 @@ def run_qwen2_vl(question: str, modality: str):
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
     prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
               f"{question}<|im_end|>\n"
               "<|im_start|>assistant\n")
     stop_token_ids = None
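
For context, here is a minimal sketch of how the updated prompt construction can be exercised end to end with vLLM's offline API. The build_qwen2_vl_prompt helper, the sampling settings, and the example.jpg path are illustrative assumptions, not part of this commit; only the placeholder/prompt logic mirrors the diff above.

from PIL import Image
from vllm import LLM, SamplingParams

def build_qwen2_vl_prompt(question: str, modality: str) -> str:
    # Mirror the diff: pick the placeholder token that matches the
    # input modality instead of hardcoding <|image_pad|>.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(f"unsupported modality: {modality}")
    return ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct")
prompt = build_qwen2_vl_prompt("Describe the scene.", "image")
# example.jpg is a hypothetical local file standing in for any input image.
image = Image.open("example.jpg")
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)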