[Model] Update multi-modal processor to support Mantis (LLaVA) model (#10711)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Commit: 39e227c7ae (parent: 1c768fe537)
Author: Cyrus Leung (committed by GitHub)
Date: 2024-12-08 01:10:05 +08:00

14 changed files with 175 additions and 78 deletions


@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
    return llm, prompt, stop_token_ids


# Mantis
def run_mantis(question: str, modality: str):
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    prompt = llama3_template.format(f"{question}\n<image>")

    llm = LLM(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
    )
    stop_token_ids = [128009]
    return llm, prompt, stop_token_ids
model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,

@@ -441,6 +457,7 @@ model_example_map = {
    "glm4v": run_glm4v,
    "idefics3": run_idefics3,
    "aria": run_aria,
    "mantis": run_mantis,
}
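
For context, a minimal sketch of how the values returned through the new "mantis" entry are typically consumed for offline inference. The question text, image path, and sampling settings below are illustrative assumptions rather than part of this diff; the generate call uses vLLM's standard multi-modal input format.

from PIL import Image
from vllm import SamplingParams

# Build the Mantis engine, formatted prompt, and stop tokens via the map.
llm, prompt, stop_token_ids = model_example_map["mantis"](
    "What is in this image?", "image")  # hypothetical question

# stop_token_ids = [128009] corresponds to Llama-3's <|eot_id|> token,
# so generation halts at the end of the assistant turn.
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)

image = Image.open("example.jpg")  # hypothetical local image file
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)

Note the hf_overrides argument in run_mantis: overriding "architectures" is what routes this checkpoint to the Mantis implementation added by this commit, presumably because the checkpoint's own config reports the stock LLaVA architecture.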