Commit ca19a8b5 authored by Mohammad

updated scripts

parent fe40745e
@@ -32,7 +32,7 @@ For BERT training, we swapped the position of the layer normalization and the residual connection
<a id="setup"></a>
# Setup
- We officially support only python3.6 and above.
+ We officially support only Python 3.6, PyTorch 1.5, CUDA 10, and NCCL 2.6 and above.
To use this repo, please install the latest supported version of PyTorch with GPU support. We strongly recommend using one of [NGC's recent PyTorch containers](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) (the latest compatible version at the time of publication can be pulled with `docker pull nvcr.io/nvidia/pytorch:20.03-py3`). Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though NLTK is not needed for training, evaluation, or downstream tasks.
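A minimal environment setup, assuming the NGC container tag mentioned above and a standard pip install of NLTK (the mount path and Docker flags below are illustrative, not part of this repo), might look like:

```bash
# Pull the recommended NGC PyTorch container (tag from the note above).
docker pull nvcr.io/nvidia/pytorch:20.03-py3

# Start it with GPU access and this repo mounted in; --gpus all needs Docker 19.03+
# with the NVIDIA container toolkit. The mount path is only an example.
docker run --gpus all -it --rm -v "$PWD":/workspace/megatron nvcr.io/nvidia/pytorch:20.03-py3

# Inside the container, NLTK is only needed if you run the data preprocessing step.
pip install nltk
```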
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
#!/bin/bash
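# Pretrains a 12-layer, 768-hidden, 12-head ALBERT model (the albert_117m checkpoint)
# on a single GPU, reading the preprocessed mmap dataset given by --data-path.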
RANK=0
WORLD_SIZE=1
python pretrain_albert.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--batch-size 4 \
--seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \
--train-iters 10000 \
--save checkpoints/albert_117m \
--load checkpoints/albert_117m \
--resume-dataloader \
--data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
--vocab data/megatron/vocab.txt \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
--fp32-layernorm \
--fp32-embedding \
--skip-mmap-warmup \
--num-workers 0
#!/bin/bash
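# Pretrains the same 12-layer ALBERT configuration on 2 GPUs of a single node,
# launched via torch.distributed.launch with the NCCL backend.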
GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_albert.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--batch-size 4 \
--seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \
--train-iters 10000 \
--save checkpoints/albert_117m \
--load checkpoints/albert_117m \
--resume-dataloader \
--data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
--vocab data/megatron/vocab.txt \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
--fp32-layernorm \
--fp32-embedding \
--skip-mmap-warmup \
--num-workers 0
#!/bin/bash
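# Pretrains the 345M-parameter BERT model (24 layers, hidden size 1024, 16 heads)
# on 8 GPUs of a single node with 2-way model parallelism (--model-parallel-size 2).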
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save checkpoints/bert_345m_mp2 \
--load checkpoints/bert_345m_mp2 \
--resume-dataloader \
--train-data wikipedia \
--lazy-loader \
--tokenizer-type BertWordPieceTokenizer \
--tokenizer-model-type bert-large-uncased \
--presplit-sentences \
--cache-dir cache \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
--fp32-layernorm \
--fp32-embedding
#!/bin/bash
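# Pretrains the 345M-parameter BERT model on a single GPU, using a SentencePiece
# BPE tokenizer loaded from tokenizer.model instead of the BERT WordPiece vocab.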
RANK=0
WORLD_SIZE=1
python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save checkpoints/bert_345m \
--load checkpoints/bert_345m \
--resume-dataloader \
--train-data wikipedia \
--lazy-loader \
--tokenizer-type SentencePieceTokenizer \
--tokenizer-model-type bpe \
--tokenizer-path tokenizer.model \
--presplit-sentences \
--cache-dir cache \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
--fp32-layernorm \
--fp32-embedding
#!/bin/bash
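# Pretrains the 345M-parameter BERT model on 8 GPUs, reading train/valid/test data
# from TFRecord files (--use-tfrecords); replace the placeholder paths below.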
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save checkpoints/bert_345m \
--load checkpoints/bert_345m \
--resume-dataloader \
--use-tfrecords \
--train-data <TF Record 1> <TFRecord 2> \
--valid-data <TF Record 3> \
--test-data <TF Record 4> \
--tokenizer-type BertWordPieceTokenizer \
--tokenizer-model-type bert-large-uncased \
--presplit-sentences \
--cache-dir cache \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--fp16 \
--fp32-layernorm \
--fp32-embedding
#! /bin/bash
# Runs the "345M" parameter model
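# Uses 8 GPUs on a single node with 2-way model parallelism, launched via torch.distributed.launch.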
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 320000 \
--save checkpoints/gpt2_345m_mp2 \
--load checkpoints/gpt2_345m_mp2 \
--resume-dataloader \
--train-data wikipedia \
--lazy-loader \
--tokenizer-type GPT2BPETokenizer \
--cache-dir cache \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--fp16
set +x