Added the --presplit-sentences flag to the BERT pretraining scripts

3573423f · Raul Puri · d0878333 · 3573423f · 3573423f · 3573423f
Commit 3573423f authored May 13, 2019 by Raul Puri
3 changed files
--- a/scripts/pretrain_bert.sh
+++ b/scripts/pretrain_bert.sh
@@ -10,6 +10,7 @@ python pretrain_bert.py \
    --tokenizer-model-type bert-large-uncased \
    --vocab-size 30522 \
    --train-data wikipedia \
+    --presplit-sentences \
    --loose-json \
    --text-key text \
    --split 1000,1,1 \

--- a/scripts/pretrain_bert_distributed.sh
+++ b/scripts/pretrain_bert_distributed.sh
@@ -17,6 +17,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
    --tokenizer-model-type bert-large-uncased \
    --vocab-size 30522 \
    --train-data wikipedia \
+    --presplit-sentences \
    --loose-json \
    --text-key text \
    --split 1000,1,1 \

--- a/scripts/pretrain_bert_sentencepiece.sh
+++ b/scripts/pretrain_bert_sentencepiece.sh
@@ -10,6 +10,7 @@ python pretrain_bert.py \
    --tokenizer-path tokenizer.model \
    --vocab-size 30522 \
    --train-data wikipedia \
+    --presplit-sentences \
    --loose-json \
    --text-key text \
    --split 1000,1,1 \