# Run tools/preprocess_data.py on the dataset at datasets/fineweb-edu-dedup
# through the script's `hf` sub-command, using the
# HuggingFaceTB/cosmo2-tokenizer tokenizer and 16 tasks; results are written
# to datasets/fineweb-edu-dedup-ds.
# NOTE(review): presumably this tokenizes the dataset for training and must be
# launched from the repository root (paths are relative) -- confirm.
#
# Each backslash must be the LAST character on its line: a backslash followed
# by a space is an escaped space, which would glue a leading blank onto the
# next argument instead of continuing the command. The final argument carries
# no trailing backslash so the command ends here.
python3 tools/preprocess_data.py \
  --tokenizer-name-or-path HuggingFaceTB/cosmo2-tokenizer \
  --output-folder datasets/fineweb-edu-dedup-ds \
  --n-tasks 16 \
  hf \
  --dataset datasets/fineweb-edu-dedup