[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)

2024-08-15 01:55:42 +08:00
parent 70b746efcf
commit 3f674a49b5
38 changed files with 572 additions and 216 deletions
--- a/tests/models/test_minicpmv.py
+++ b/tests/models/test_minicpmv.py
@@ -61,7 +61,7 @@ def run_test(
    All the image fixtures for the test are under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -176,7 +176,7 @@ def run_multi_image_test(
    All the image fixtures for the test are under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -197,6 +197,7 @@ def run_multi_image_test(
    with vllm_runner(model,
                     max_model_len=4096,
                     max_num_seqs=1,
+                     limit_mm_per_prompt={"image": len(images)},
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,