[Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-07-09 21:35:16 +08:00
committed by GitHub
parent 70ca5484f5
commit 9ff2af6d2b
4 changed files with 24 additions and 2 deletions

View File

@@ -356,6 +356,7 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
common_kwargs["no_stream"] = args.no_stream
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs["dataset_subset"] = None
@@ -610,6 +611,11 @@ def create_argument_parser():
help="Name of the dataset to benchmark on.",
default="sharegpt",
)
parser.add_argument(
"--no-stream",
action="store_true",
help="Do not load the dataset in streaming mode.",
)
parser.add_argument(
"--dataset",
type=str,