[Misc] support arbitrary MM datasets in spec dec bench (#33486)

Signed-off-by: kkt-cohere <komal@cohere.com> Signed-off-by: Komal Kumar Teru <162363718+kkt-cohere@users.noreply.github.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-02 14:19:48 +05:30
parent ab374786c7
commit ba871fb788
3 changed files with 156 additions and 18 deletions
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -32,6 +32,7 @@ th {
 | HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
 | Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
 | Custom | ✅ | ✅ | Local file: `data.jsonl` |
+| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` |

 Legend:

@@ -133,6 +134,33 @@ vllm bench serve --port 9001 --save-result --save-detailed \

 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

+#### Custom multimodal dataset
+
+If the multimodal dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" field per entry, e.g., `mm_data.jsonl`:
+
+```json
+{"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}
+{"prompt": "What colour is the bird shown in the image?", "image_files": ["/path/to/image/folder/flycatcher.jpeg"]}
+```
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+```bash
+# run benchmarking script
+vllm bench serve--save-result --save-detailed \
+  --backend openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name custom_mm \
+  --dataset-path <path-to-your-mm-data-jsonl> \
+  --allowed-local-media-path /path/to/image/folder
+```
+
+Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.
+
 #### VisionArena Benchmark for Vision Language Models

 ```bash
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -5,7 +5,6 @@ from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
 from vllm.benchmarks.datasets import add_dataset_parser, get_samples
-from vllm.inputs import TokensPrompt
 from vllm.v1.metrics.reader import Counter, Vector

 try:
@@ -56,6 +55,7 @@ def parse_args():
        default="eagle",
        choices=["ngram", "eagle", "eagle3", "mtp", "draft_model"],
    )
+    parser.add_argument("--backend", type=str, default="openai")
    parser.add_argument("--num-spec-tokens", type=int, default=2)
    parser.add_argument("--prompt-lookup-max", type=int, default=5)
    parser.add_argument("--prompt-lookup-min", type=int, default=2)
@@ -75,12 +75,11 @@ def parse_args():
    parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)
    parser.add_argument("--disable-padded-drafter-batch", action="store_true")
    parser.add_argument("--max-num-seqs", type=int, default=None)
+    parser.add_argument("--allowed-local-media-path", type=str, default="")
    return parser.parse_args()


 def main(args):
-    args.endpoint_type = "openai-chat"
-
    model_dir = args.model_dir
    if args.model_dir is None:
        if args.custom_mm_prompts:
@@ -91,19 +90,25 @@ def main(args):
            )
        model_dir = "meta-llama/Llama-3.1-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
-    args.custom_skip_chat_template = True

-    if not args.custom_mm_prompts:
-        prompts = get_samples(args, tokenizer)
-        # add_special_tokens is False to avoid adding bos twice
-        # when using chat templates
-        prompt_ids = [
-            tokenizer.encode(prompt.prompt, add_special_tokens=False)
-            for prompt in prompts
-        ]
+    if args.custom_mm_prompts:
+        prompts = llm_prompts = get_custom_mm_prompts(args.num_prompts)
    else:
-        prompts = get_custom_mm_prompts(args.num_prompts)
-
+        prompts = get_samples(args, tokenizer)
+        if args.enable_multimodal_chat:
+            llm_prompts = [p.prompt for p in prompts]
+        else:
+            # add_special_tokens is False to avoid adding bos twice
+            # when using chat templates
+            llm_prompts = [
+                {
+                    "prompt_token_ids": tokenizer.encode(
+                        prompt.prompt, add_special_tokens=False
+                    ),
+                    "multi_modal_data": prompt.multi_modal_data,
+                }
+                for prompt in prompts
+            ]
    if args.method == "eagle" or args.method == "eagle3":
        eagle_dir = args.eagle_dir
        if args.method == "eagle" and eagle_dir is None:
@@ -154,16 +159,17 @@ def main(args):
        limit_mm_per_prompt={"image": 5},
        disable_chunked_mm_input=True,
        max_num_seqs=args.max_num_seqs,
+        allowed_local_media_path=args.allowed_local_media_path,
    )

    sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
-    if not args.custom_mm_prompts:
+    if args.backend == "openai-chat":
+        outputs = llm.chat(llm_prompts, sampling_params=sampling_params)
+    else:
        outputs = llm.generate(
-            [TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
+            llm_prompts,
            sampling_params=sampling_params,
        )
-    else:
-        outputs = llm.chat(prompts, sampling_params=sampling_params)

    # print the generated text
    if args.print_output:
@@ -219,6 +225,8 @@ def main(args):

 if __name__ == "__main__":
    args = parse_args()
+    args.enable_multimodal_chat = args.backend == "openai-chat"
+
    acceptance_length = main(args)

    if args.test:
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1335,6 +1335,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
            "random-rerank",
            "hf",
            "custom",
+            "custom_mm",
            "prefix_repetition",
            "spec_bench",
        ],
@@ -1363,6 +1364,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
        action="store_true",
        help="Skip applying chat template to prompt for datasets that support it.",
    )
+    parser.add_argument(
+        "--enable-multimodal-chat",
+        action="store_true",
+        help="Enable multimodal chat transformation for datasets that support it.",
+    )
    parser.add_argument(
        "--disable-shuffle",
        action="store_true",
@@ -1685,6 +1691,19 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
            no_oversample=args.no_oversample,
        )

+    elif args.dataset_name == "custom_mm":
+        dataset = CustomMMDataset(
+            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
+        )
+        input_requests = dataset.sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.custom_output_len,
+            enable_multimodal_chat=args.enable_multimodal_chat,
+            request_id_prefix=args.request_id_prefix,
+            no_oversample=args.no_oversample,
+        )
+
    elif args.dataset_name == "sonnet":
        dataset = SonnetDataset(
            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
@@ -1832,6 +1851,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
+            enable_multimodal_chat=args.enable_multimodal_chat,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
            skip_chat_template=args.skip_chat_template,
@@ -1849,6 +1869,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
                num_requests=args.num_prompts,
                tokenizer=tokenizer,
                output_len=args.spec_bench_output_len,
+                enable_multimodal_chat=args.enable_multimodal_chat,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
@@ -1860,6 +1881,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                output_len=args.sharegpt_output_len,
+                enable_multimodal_chat=args.enable_multimodal_chat,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
@@ -1903,6 +1925,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
                limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
                num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
                bucket_config=args.random_mm_bucket_config,
+                enable_multimodal_chat=args.enable_multimodal_chat,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
@@ -2075,6 +2098,85 @@ class CustomDataset(BenchmarkDataset):
        return sampled_requests


+class CustomMMDataset(CustomDataset):
+    """
+    Implements the Custom MultiModal dataset. Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {
+        "prompt": "How many red blocks in the given images?",
+        "image_files": ["path/to/image1.png", "path/to/image2.png"],
+    }
+    {
+        "prompt": "Which country has the most pokemons based on the given graphs?",
+        "image_files": ["path/to/image.png"],
+    }
+    ```
+
+    NOTE: Only the first image file in "image_files" is used for each sample request.
+
+    This is used to benchmark multimodal LLMs on arbitrary datasets.
+    """
+
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: TokenizerLike,
+        num_requests: int,
+        output_len: int | None = None,
+        enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        # load all data if needed
+        self.num_available_samples = len(self.data)
+        if num_requests <= 0:
+            num_requests = self.num_available_samples
+            logger.info(
+                "num_requests is set to 0 or negative, "
+                "so using all available samples: %d",
+                num_requests,
+            )
+
+        sampled_requests = []
+        for i, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            images = item["image_files"]
+            if len(images) > 1:
+                logger.warning(
+                    "Multiple image files found for sample %d. "
+                    "Only the first image will be used.",
+                    i,
+                )
+            mm_content = process_image(images[0])
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(i),
+                )
+            )
+        self.maybe_oversample_requests(
+            sampled_requests, num_requests, request_id_prefix, no_oversample
+        )
+
+        return sampled_requests
+
+
 # -----------------------------------------------------------------------------
 # Spec Bench Dataset Implementation
 # -----------------------------------------------------------------------------