#!/usr/bin/env bash
#
# Runs tools/preprocess_data.py with the GPT2BPETokenizer tokenizer type.
#
# Usage: <this script> INPUT OUTPUT_PREFIX
#   $1  value forwarded to --input
#   $2  value forwarded to --output-prefix
#
# The Python tool, the vocab file, and the merge file are all referenced by
# relative path, so they are resolved against the current working directory.

# Assigned as separate statements (not as prefixes of the python3 command) so
# that the values are visible when "$VOCAB_FILE" / "$MERGE_FILE" are expanded
# on the command line below.
# NOTE(review): "pt2-vocab.json" looks like it may be a typo for
# "gpt2-vocab.json" (compare the merge file name); left unchanged -- confirm
# against the actual file on disk.
VOCAB_FILE=pt2-vocab.json
MERGE_FILE=gpt2-merges.txt

# Fail fast with a usage message instead of letting a missing argument shift
# the following option into the place of a value.
input=${1:?usage: $0 INPUT OUTPUT_PREFIX}
output_prefix=${2:?usage: $0 INPUT OUTPUT_PREFIX}

# Every expansion is quoted so paths containing spaces or glob characters are
# passed through as single, literal arguments.
python3 tools/preprocess_data.py \
  --input "$input" \
  --output-prefix "$output_prefix" \
  --vocab-file "$VOCAB_FILE" \
  --merge-file "$MERGE_FILE" \
  --tokenizer-type GPT2BPETokenizer \
  --append-eod \
  --workers 20 \
  --chunk-size 25