#!/bin/bash
#
# test_preprocess.sh
#
# Runs tools/preprocess_data.py on a sample JSON file using BertTokenizer
# and a bert-base-chinese vocabulary file.
#
# Usage:
#   Run from the directory that contains tools/ (the script path below is
#   relative). Replace the path/to/... placeholders with real locations
#   before running.

# Abort on command failure, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Value passed to --dataset-impl; also used as the output-prefix suffix.
IMPL=lazy

# JSON key(s) passed to --json-keys. Kept as an array so every key is
# handed over as its own argument without relying on word splitting.
# NOTE(review): presumably --json-keys accepts several keys -- confirm
# against tools/preprocess_data.py before adding more entries.
KEYS=(text)

python tools/preprocess_data.py \
        --input path/to/test_sample_cn.json \
        --json-keys "${KEYS[@]}" \
        --vocab-file path/to/bert-base-chinese-vocab.txt \
        --dataset-impl "${IMPL}" \
        --tokenizer-name BertTokenizer \
        --do-lower-case \
        --do-chinese-wwm \
        --split-sentences \
        --output-prefix "cn_samples_${IMPL}" \
        --workers 1 \
        --log-interval 2