convert_data_to_ds.sh 235 Bytes
Newer Older
chenzk's avatar
v1.0.8  
chenzk committed
1
2
3
4
5
6
# Runs tools/preprocess_data.py over datasets/fineweb-edu-dedup using the
# HuggingFaceTB/cosmo2-tokenizer tokenizer, with datasets/fineweb-edu-dedup-ds
# passed as the output folder and 16 tasks requested.
#
# All paths are relative, so invoke this script from the directory that
# contains both tools/ and datasets/.
#
# NOTE(review): `hf` is a positional subcommand of preprocess_data.py and
# --dataset is the option that follows it; presumably it selects a Hugging
# Face-format input -- confirm against the tool's --help.
#
# The final argument line deliberately has no trailing backslash: a dangling
# continuation would glue whatever line is added next onto this command.
python3 tools/preprocess_data.py \
  --tokenizer-name-or-path HuggingFaceTB/cosmo2-tokenizer \
  --output-folder datasets/fineweb-edu-dedup-ds \
  --n-tasks 16 \
  hf \
  --dataset datasets/fineweb-edu-dedup