diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md index dd5a12e40..43b6052de 100644 --- a/docs/benchmarking/cli.md +++ b/docs/benchmarking/cli.md @@ -32,6 +32,7 @@ th { | HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` | | Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` | | Custom | ✅ | ✅ | Local file: `data.jsonl` | +| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` | Legend: @@ -133,6 +134,33 @@ vllm bench serve --port 9001 --save-result --save-detailed \ You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. +#### Custom multimodal dataset + +If the multimodal dataset you want to benchmark is not yet supported in vLLM, you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" fields per entry (only the first path in "image_files" is used for each request), e.g., `mm_data.jsonl`: + +```json +{"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]} +{"prompt": "What colour is the bird shown in the image?", "image_files": ["/path/to/image/folder/flycatcher.jpeg"]} +``` + +```bash +# need a model with vision capability here +vllm serve Qwen/Qwen2-VL-7B-Instruct +``` + +```bash +# run benchmarking script +vllm bench serve --save-result --save-detailed \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name custom_mm \ + --dataset-path /path/to/mm_data.jsonl \ + --allowed-local-media-path /path/to/image/folder +``` + +Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs. 
+ #### VisionArena Benchmark for Vision Language Models ```bash diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 1e3e310e7..45593b530 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -5,7 +5,6 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams from vllm.benchmarks.datasets import add_dataset_parser, get_samples -from vllm.inputs import TokensPrompt from vllm.v1.metrics.reader import Counter, Vector try: @@ -56,6 +55,7 @@ def parse_args(): default="eagle", choices=["ngram", "eagle", "eagle3", "mtp", "draft_model"], ) + parser.add_argument("--backend", type=str, default="openai") parser.add_argument("--num-spec-tokens", type=int, default=2) parser.add_argument("--prompt-lookup-max", type=int, default=5) parser.add_argument("--prompt-lookup-min", type=int, default=2) @@ -75,12 +75,11 @@ def parse_args(): parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) parser.add_argument("--disable-padded-drafter-batch", action="store_true") parser.add_argument("--max-num-seqs", type=int, default=None) + parser.add_argument("--allowed-local-media-path", type=str, default="") return parser.parse_args() def main(args): - args.endpoint_type = "openai-chat" - model_dir = args.model_dir if args.model_dir is None: if args.custom_mm_prompts: @@ -91,19 +90,25 @@ def main(args): ) model_dir = "meta-llama/Llama-3.1-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_dir) - args.custom_skip_chat_template = True - if not args.custom_mm_prompts: - prompts = get_samples(args, tokenizer) - # add_special_tokens is False to avoid adding bos twice - # when using chat templates - prompt_ids = [ - tokenizer.encode(prompt.prompt, add_special_tokens=False) - for prompt in prompts - ] + if args.custom_mm_prompts: + prompts = llm_prompts = get_custom_mm_prompts(args.num_prompts) else: - prompts = get_custom_mm_prompts(args.num_prompts) 
- + prompts = get_samples(args, tokenizer) + if args.enable_multimodal_chat: + llm_prompts = [p.prompt for p in prompts] + else: + # add_special_tokens is False to avoid adding bos twice + # when using chat templates + llm_prompts = [ + { + "prompt_token_ids": tokenizer.encode( + prompt.prompt, add_special_tokens=False + ), + "multi_modal_data": prompt.multi_modal_data, + } + for prompt in prompts + ] if args.method == "eagle" or args.method == "eagle3": eagle_dir = args.eagle_dir if args.method == "eagle" and eagle_dir is None: @@ -154,16 +159,17 @@ def main(args): limit_mm_per_prompt={"image": 5}, disable_chunked_mm_input=True, max_num_seqs=args.max_num_seqs, + allowed_local_media_path=args.allowed_local_media_path, ) sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) - if not args.custom_mm_prompts: + if args.backend == "openai-chat": + outputs = llm.chat(llm_prompts, sampling_params=sampling_params) + else: outputs = llm.generate( - [TokensPrompt(prompt_token_ids=x) for x in prompt_ids], + llm_prompts, sampling_params=sampling_params, ) - else: - outputs = llm.chat(prompts, sampling_params=sampling_params) # print the generated text if args.print_output: @@ -219,6 +225,8 @@ def main(args): if __name__ == "__main__": args = parse_args() + args.enable_multimodal_chat = args.backend == "openai-chat" + acceptance_length = main(args) if args.test: diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index f0a1baf3d..d437e26ad 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1335,6 +1335,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "random-rerank", "hf", "custom", + "custom_mm", "prefix_repetition", "spec_bench", ], @@ -1363,6 +1364,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser): action="store_true", help="Skip applying chat template to prompt for datasets that support it.", ) + parser.add_argument( + "--enable-multimodal-chat", + action="store_true", + 
help="Enable multimodal chat transformation for datasets that support it.", + ) parser.add_argument( "--disable-shuffle", action="store_true", @@ -1685,6 +1691,19 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: no_oversample=args.no_oversample, ) + elif args.dataset_name == "custom_mm": + dataset = CustomMMDataset( + dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle + ) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + enable_multimodal_chat=args.enable_multimodal_chat, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + elif args.dataset_name == "sonnet": dataset = SonnetDataset( dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle @@ -1832,6 +1851,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, tokenizer=tokenizer, output_len=args.hf_output_len, + enable_multimodal_chat=args.enable_multimodal_chat, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, skip_chat_template=args.skip_chat_template, @@ -1849,6 +1869,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: num_requests=args.num_prompts, tokenizer=tokenizer, output_len=args.spec_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), @@ -1860,6 +1881,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: tokenizer=tokenizer, num_requests=args.num_prompts, output_len=args.sharegpt_output_len, + enable_multimodal_chat=args.enable_multimodal_chat, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), @@ -1903,6 +1925,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt, 
num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio, bucket_config=args.random_mm_bucket_config, + enable_multimodal_chat=args.enable_multimodal_chat, request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), @@ -2075,6 +2098,85 @@ class CustomDataset(BenchmarkDataset): return sampled_requests +class CustomMMDataset(CustomDataset): + """ + Implements the Custom MultiModal dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + { + "prompt": "How many red blocks in the given images?", + "image_files": ["path/to/image1.png", "path/to/image2.png"], + } + { + "prompt": "Which country has the most pokemons based on the given graphs?", + "image_files": ["path/to/image.png"], + } + ``` + + NOTE: Only the first image file in "image_files" is used for each sample request. + + This is used to benchmark multimodal LLMs on arbitrary datasets. + """ + + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # load all data if needed + self.num_available_samples = len(self.data) + if num_requests <= 0: + num_requests = self.num_available_samples + logger.info( + "num_requests is set to 0 or negative, " + "so using all available samples: %d", + num_requests, + ) + + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + prompt_len = len(tokenizer(prompt).input_ids) + images = item["image_files"] + if len(images) > 1: + logger.warning( + "Multiple image files found for sample %d. 
" + "Only the first image will be used.", + i, + ) + mm_content = process_image(images[0]) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Spec Bench Dataset Implementation # -----------------------------------------------------------------------------