openwebtext-to-jsonl.py 428 Bytes
Newer Older
hepj987's avatar
hepj987 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!/usr/bin/env python

# Generate a JSONL version of a small slice of a dataset that can be fed to the Megatron-LM preprocessor.

import sys
from datasets import load_dataset

# Hub identifier of the source dataset and how many leading rows to export.
dataset_name = "stas/openwebtext-10k"
n_samples = 1000

# The output file name embeds the sample count.
path = f"openwebtext-{n_samples}.jsonl"

# Load the train split, keep rows at indices 0..n_samples-1, and write them
# out with one JSON record per line.
(
    load_dataset(dataset_name, split='train')
    .select(range(n_samples))
    .to_json(path, orient="records", lines=True)
)