#!/usr/bin/env bash
#
# Runs tools/preprocess_data.py on the OSCAR 1GB-head JSONL file using
# --tokenizer-type QwenTokenizer with the Qwen1.5-14B vocab/merges files.
#
# Usage: run from the directory that contains tools/ (the script path below
# is relative to the current working directory).
#
# Optional environment overrides; when unset or empty, the defaults reproduce
# the previously hard-coded values exactly:
#   OSCAR_DATA_DIR      directory containing oscar-1GB_head.jsonl; the
#                       --output-prefix is placed in the same directory
#   QWEN_TOKENIZER_DIR  directory containing vocab.json and merges.txt
#   PREPROCESS_WORKERS  value passed to --workers
#
# The script's exit status is that of the python command (last command run).

# Base directories: each was repeated in two arguments before, so they are
# defined once here to keep the input/output and vocab/merges paths in sync.
dataset_dir="${OSCAR_DATA_DIR:-/public/home/wangxj/Downloads/datasets/oscar-1GB-head}"
tokenizer_dir="${QWEN_TOKENIZER_DIR:-/public/home/wangxj/Downloads/model_weights/qwen1.5_14b}"
worker_count="${PREPROCESS_WORKERS:-8}"

# Derived paths handed to the preprocessing tool.
input_jsonl="${dataset_dir}/oscar-1GB_head.jsonl"
output_prefix="${dataset_dir}/oscar-1GB_head-qwen"
vocab_file="${tokenizer_dir}/vocab.json"
merge_file="${tokenizer_dir}/merges.txt"

# Expansions are quoted so overridden directories containing spaces are
# passed as single arguments.
python tools/preprocess_data.py \
    --input "${input_jsonl}" \
    --output-prefix "${output_prefix}" \
    --vocab-file "${vocab_file}" \
    --tokenizer-type QwenTokenizer \
    --merge-file "${merge_file}" \
    --append-eod \
    --workers "${worker_count}"