[Model] Support Llama4 in vLLM (#16104)

This commit is contained in:
Lu Fang
2025-04-05 21:01:00 -07:00
committed by GitHub
parent 63375f0cdb
commit c575232395
35 changed files with 2369 additions and 142 deletions

View File

@@ -582,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine args and chat-formatted prompts for Llama-4 Scout.

    Args:
        questions: One text question per requested prompt; each is paired
            with a single image placeholder in the chat template.
        modality: Must be ``"image"`` — Llama-4 in this example only
            supports image inputs.

    Returns:
        ModelRequestData with the engine configuration and the rendered
        prompt strings (no custom stop tokens).
    """
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        # Scout 17B-16E needs 8-way tensor parallelism to fit.
        tensor_parallel_size=8,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        gpu_memory_utilization=0.4,
    )

    # Render each question as a one-turn chat with an image placeholder,
    # letting the model's own chat template produce the final prompt text.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [[{
        "role": "user",
        "content": [{
            "type": "image"
        }, {
            "type": "text",
            "text": question
        }]
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -907,6 +943,7 @@ model_example_map = {
"minicpmv": run_minicpmv,
"mistral3": run_mistral3,
"mllama": run_mllama,
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"paligemma": run_paligemma,