Commit 3573423f authored by Raul Puri
Browse files

added presplit-sentences to scripts

parent d0878333
...@@ -10,6 +10,7 @@ python pretrain_bert.py \ ...@@ -10,6 +10,7 @@ python pretrain_bert.py \
--tokenizer-model-type bert-large-uncased \ --tokenizer-model-type bert-large-uncased \
--vocab-size 30522 \ --vocab-size 30522 \
--train-data wikipedia \ --train-data wikipedia \
--presplit-sentences \
--loose-json \ --loose-json \
--text-key text \ --text-key text \
--split 1000,1,1 \ --split 1000,1,1 \
......
...@@ -17,6 +17,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ ...@@ -17,6 +17,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--tokenizer-model-type bert-large-uncased \ --tokenizer-model-type bert-large-uncased \
--vocab-size 30522 \ --vocab-size 30522 \
--train-data wikipedia \ --train-data wikipedia \
--presplit-sentences \
--loose-json \ --loose-json \
--text-key text \ --text-key text \
--split 1000,1,1 \ --split 1000,1,1 \
......
...@@ -10,6 +10,7 @@ python pretrain_bert.py \ ...@@ -10,6 +10,7 @@ python pretrain_bert.py \
--tokenizer-path tokenizer.model \ --tokenizer-path tokenizer.model \
--vocab-size 30522 \ --vocab-size 30522 \
--train-data wikipedia \ --train-data wikipedia \
--presplit-sentences \
--loose-json \ --loose-json \
--text-key text \ --text-key text \
--split 1000,1,1 \ --split 1000,1,1 \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment