[Benchmark] Add a `--no-stream` option to control streaming loading of multimodal datasets (#20528)

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-07-09 21:35:16 +08:00
committed by GitHub
parent 70ca5484f5
commit 9ff2af6d2b
4 changed files with 24 additions and 2 deletions

View File

@@ -825,6 +825,7 @@ def main(args: argparse.Namespace):
dataset_subset=args.hf_subset,
dataset_split=args.hf_split,
random_seed=args.seed,
no_stream=args.no_stream,
).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
@@ -1033,6 +1034,11 @@ def create_argument_parser():
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.",
)
parser.add_argument(
"--no-stream",
action="store_true",
help="Do not load the dataset in streaming mode.",
)
parser.add_argument(
"--max-concurrency",
type=int,