OpenDAS / Megatron-LM

Commit 78cb1781, authored Apr 16, 2020 by Mohammad
Commit message: moved steves branch
Parent: c6b5c137
Showing 10 changed files with 665 additions and 255 deletions (+665, -255)
README.md                                +438  -187
examples/evaluate_zeroshot_gpt2.sh        +38    -0
examples/finetine_race_distributed.sh     +47    -0
examples/finetune_mnli_distributed.sh     +44    -0
examples/generate_text.sh                 +17   -24
examples/merge_mp_bert.sh                 +18    -0
examples/pretrain_bert.sh                 +15   -14
examples/pretrain_bert_distributed.sh     +14   -14
examples/pretrain_gpt2.sh                 +17    -8
examples/pretrain_gpt2_distributed.sh     +17    -8

README.md  (diff collapsed, not shown)

examples/evaluate_zeroshot_gpt2.sh  (new file, 0 → 100644)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TASK="LAMBADA"

VALID_DATA=<lambada path>
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT=checkpoints/gpt2_345m

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task $TASK \
       --valid-data $VALID_DATA \
       --tokenizer-type GPT2BPETokenizer \
       --strict-lambada \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --checkpoint-activations \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --log-interval 10 \
       --fp16 \
       --no-load-optim \
       --no-load-rng
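
The new script hard-codes the LAMBADA cloze task. As a rough sketch, a WikiText-103 perplexity run would change only the task and data lines and drop the LAMBADA-specific --strict-lambada flag; the WIKITEXT103 task name and the data path below are assumptions not shown in this diff, so verify them against tasks/main.py.

TASK="WIKITEXT103"                     # assumed task name, not part of this diff
VALID_DATA=/path/to/wiki.test.tokens   # hypothetical location of the WikiText-103 test set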

examples/finetine_race_distributed.sh  (new file, 0 → 100644)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
            data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRIANED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task RACE \
       --seed 1234 \
       --train-data $TRAIN_DATA \
       --valid-data $VALID_DATA \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --epochs 3 \
       --pretrained-checkpoint $PRETRIANED_CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --checkpoint-activations \
       --lr 1.0e-5 \
       --lr-decay-style linear \
       --warmup 0.06 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --save-interval 500000 \
       --save $CHECKPOINT_PATH \
       --log-interval 10 \
       --eval-interval 100 \
       --eval-iters 50 \
       --weight-decay 1.0e-1 \
       --clip-grad 1.0 \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16
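
The script fine-tunes on the RACE "middle" training split only. A minimal sketch for training on both splits, assuming a data/RACE/train/high directory exists alongside data/RACE/train/middle (mirroring the dev paths above) and that --train-data accepts several directories the way --valid-data does here:

TRAIN_DATA="data/RACE/train/middle \
            data/RACE/train/high"
# hypothetical second split; verify the path and the multi-path support before relying on it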

examples/finetune_mnli_distributed.sh  (new file, 0 → 100644)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
            data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task MNLI \
       --seed 1234 \
       --train-data $TRAIN_DATA \
       --valid-data $VALID_DATA \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --epochs 5 \
       --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --checkpoint-activations \
       --lr 5.0e-5 \
       --lr-decay-style linear \
       --warmup 0.065 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --save-interval 500000 \
       --save $CHECKPOINT_PATH \
       --log-interval 10 \
       --eval-interval 100 \
       --eval-iters 50 \
       --weight-decay 1.0e-1 \
       --fp16
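
One simple way to keep the metrics such a run prints is to tee stdout and stderr into a log file. A minimal sketch using plain bash only, with an illustrative log path:

mkdir -p logs    # hypothetical log directory
bash examples/finetune_mnli_distributed.sh 2>&1 | tee logs/mnli_345m.log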

examples/generate_text.sh

#!/bin/bash

CHECKPOINT_PATH=checkpoints/gpt2_345m/
MPSIZE=1
NLAYERS=12
NHIDDEN=768
NATT=12
MAXSEQLEN=1024
CHECKPOINT_PATH=checkpoints/gpt2_345m
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt

#SAMPLING ARGS
TEMP=0.9
#If TOPK/TOPP are 0 it defaults to greedy sampling, top-k will also override top-p
TOPK=0
TOPP=0

python generate_samples.py \
       --model-parallel-size $MPSIZE \
       --num-layers $NLAYERS \
       --hidden-size $NHIDDEN \
python tools/generate_samples_gpt2.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load $CHECKPOINT_PATH \
       --num-attention-heads $NATT \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --cache-dir cache \
       --out-seq-length $MAXSEQLEN \
       --temperature $TEMP \
       --top_k $TOPK \
       --genfile dbg_unconditional.json \
       --num-samples 10 \
       --top_p $TOPP \
       --batch-size 2 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --genfile unconditional_samples.json \
       --num-samples 2 \
       --top_p 0.9 \
       --recompute
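
Per the comment carried over from the old script, a non-zero --top_k takes precedence over --top_p, and zero for both means greedy decoding. A small sketch of the sampling knobs one might swap into the tools/generate_samples_gpt2.py command above for top-k sampling; the particular values are illustrative only:

TEMP=0.9   # illustrative temperature
TOPK=40    # non-zero, so top-k sampling is used
TOPP=0     # disabled
# then pass: --temperature $TEMP --top_k $TOPK --top_p $TOPP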

examples/merge_mp_bert.sh  (new file, 0 → 100644)

#!/bin/bash

MODEL_PARALLEL_SIZE=2

VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m

WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
       --model-type BERT \
       --model-parallel-size $MODEL_PARALLEL_SIZE \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --load $CHECKPOINT_PATH
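
WORLD_SIZE is set to $MODEL_PARALLEL_SIZE as an environment prefix on the launch line, so the merge tool presumably sees one rank per checkpoint partition. A sketch of the one line that should need to change for a hypothetical 4-way model-parallel checkpoint; the value is assumed to have to match how the checkpoint under $CHECKPOINT_PATH was trained:

MODEL_PARALLEL_SIZE=4   # assumption: must equal the number of partitions saved in $CHECKPOINT_PATH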

examples/pretrain_bert.sh

@@ -2,6 +2,8 @@
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

python pretrain_bert.py \
       --num-layers 24 \
@@ -9,26 +11,25 @@ python pretrain_bert.py \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save checkpoints/bert_345m \
       --load checkpoints/bert_345m \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type BertWordPieceTokenizer \
       --tokenizer-model-type bert-large-uncased \
       --presplit-sentences \
       --cache-dir cache \
       --train-iters 2000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
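
The $DATA_PATH form of the arguments goes with the DATA_PATH variable added at the top of the script, which is expected to be the prefix of a preprocessed dataset (hence the _text_sentence suffix). A rough sketch of producing that prefix follows; tools/preprocess_data.py and its exact flags are not part of this diff, so treat the names as assumptions to verify against the updated README.md in this commit.

# Hypothetical preprocessing step; my-corpus.json is a placeholder input file.
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-bert \
       --vocab bert-vocab.txt \
       --dataset-impl mmap \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences
# assumed output: my-bert_text_sentence.bin/.idx, i.e. DATA_PATH=my-bert_text_sentence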

examples/pretrain_bert_distributed.sh

@@ -8,27 +8,26 @@ NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save checkpoints/bert_345m \
       --load checkpoints/bert_345m \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type BertWordPieceTokenizer \
       --tokenizer-model-type bert-large-uncased \
       --presplit-sentences \
       --cache-dir cache \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
@@ -37,7 +36,8 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
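
The distributed variant derives WORLD_SIZE from GPUS_PER_NODE and NNODES and feeds the launcher settings through DISTRIBUTED_ARGS. As a sketch, a two-node run would edit the variables at the top of the script on each node roughly as follows; the master address is a placeholder:

# node 0
NNODES=2; NODE_RANK=0; MASTER_ADDR=10.1.1.1; MASTER_PORT=6000
# node 1
NNODES=2; NODE_RANK=1; MASTER_ADDR=10.1.1.1; MASTER_PORT=6000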

examples/pretrain_gpt2.sh

@@ -5,6 +5,10 @@
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

python pretrain_gpt2.py \
       --num-layers 24 \
       --hidden-size 1024 \
@@ -12,22 +16,27 @@ python pretrain_gpt2.py \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 320000 \
       --save checkpoints/gpt2_345m \
       --load checkpoints/gpt2_345m \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --cache-dir cache \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --min-lr 1.0e-5 \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
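
As with the BERT script, DATA_PATH points at a preprocessed dataset prefix, here with a _text_document suffix. A hedged sketch of the GPT-2 flavour of that preprocessing step; the tool and flag names are assumptions not shown in this diff, so check them against the updated README.md.

# Hypothetical preprocessing step; my-corpus.json is a placeholder input file.
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-gpt2 \
       --vocab gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --dataset-impl mmap \
       --tokenizer-type GPT2BPETokenizer \
       --append-eod
# assumed output: my-gpt2_text_document.bin/.idx, i.e. DATA_PATH=my-gpt2_text_document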

examples/pretrain_gpt2_distributed.sh

@@ -10,24 +10,28 @@ NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt2.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 320000 \
       --save checkpoints/gpt2_345m \
       --load checkpoints/gpt2_345m \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --cache-dir cache \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
@@ -36,7 +40,12 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16

set +x
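
For debugging on fewer GPUs, the launch can be restricted with the standard CUDA_VISIBLE_DEVICES variable; GPUS_PER_NODE inside the script would have to be edited to match. A minimal sketch:

# run on GPUs 0-3 only; remember to set GPUS_PER_NODE=4 in the script
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash examples/pretrain_gpt2_distributed.sh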