[Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-07-09 21:35:16 +08:00
committed by GitHub
parent 70ca5484f5
commit 9ff2af6d2b
4 changed files with 24 additions and 2 deletions

View File

@@ -701,6 +701,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self,
dataset_path: str,
dataset_split: str,
no_stream: bool = False,
dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
@@ -708,6 +709,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_stream = not no_stream
self.load_data()
def load_data(self) -> None:
@@ -716,7 +718,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
streaming=self.load_stream,
)
self.data = self.data.shuffle(seed=self.random_seed)