OpenDAS / Megatron-LM · Commits

Commit 5e56e563
Authored Apr 28, 2020 by Neel Kant
Merge master into realm-mlm
Parents: 6c0a5bd8, 569b3dab

Changes: 107. Showing 20 changed files with 752 additions and 624 deletions (+752, -624) on this page.
Changed files on this page:

LICENSE                                           (+35,  -2)
README.md                                         (+445, -187)
examples/evaluate_zeroshot_gpt2.sh                (+38,  -0)
examples/finetune_mnli_distributed.sh             (+44,  -0)
examples/finetune_race_distributed.sh             (+47,  -0)
examples/generate_text.sh                         (+17,  -24)
examples/merge_mp_bert.sh                         (+18,  -0)
examples/pretrain_albert.sh                       (+0,   -32)
examples/pretrain_albert_distributed.sh           (+0,   -40)
examples/pretrain_bert.sh                         (+15,  -14)
examples/pretrain_bert_distributed.sh             (+15,  -14)
examples/pretrain_bert_model_parallel.sh          (+0,   -44)
examples/pretrain_bert_sentencepiece.sh           (+0,   -35)
examples/pretrain_bert_tfrecords_distributed.sh   (+0,   -44)
examples/pretrain_gpt2.sh                         (+17,  -8)
examples/pretrain_gpt2_distributed.sh             (+18,  -8)
examples/pretrain_gpt2_model_parallel.sh          (+0,   -43)
examples/run_gpt2_eval.py                         (+0,   -81)
megatron/__init__.py                              (+1,   -1)
megatron/arguments.py                             (+42,  -47)
LICENSE

-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+The following applies to all files unless otherwise noted:
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
...
@@ -24,8 +26,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 --

+This repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+
-------------- LICENSE FOR huggingface (transformer) repository --------------
+------------- LICENSE FOR huggingface and Google Research code --------------

 Apache License
...
@@ -229,3 +238,27 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
+
+------------- LICENSE FOR Facebook Fairseq code --------------
+
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md

This diff is collapsed.
examples/evaluate_zeroshot_gpt2.sh (new file, mode 100755)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TASK="LAMBADA"

VALID_DATA=<lambada path>
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT=checkpoints/gpt2_345m

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task $TASK \
       --valid-data $VALID_DATA \
       --tokenizer-type GPT2BPETokenizer \
       --strict-lambada \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --checkpoint-activations \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --log-interval 10 \
       --fp16 \
       --no-load-optim \
       --no-load-rng
examples/finetune_mnli_distributed.sh (new file, mode 100755)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
            data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task MNLI \
       --seed 1234 \
       --train-data $TRAIN_DATA \
       --valid-data $VALID_DATA \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --epochs 5 \
       --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --checkpoint-activations \
       --lr 5.0e-5 \
       --lr-decay-style linear \
       --warmup 0.065 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --save-interval 500000 \
       --save $CHECKPOINT_PATH \
       --log-interval 10 \
       --eval-interval 100 \
       --eval-iters 50 \
       --weight-decay 1.0e-1 \
       --fp16
examples/finetune_race_distributed.sh (new file, mode 100755)

#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
            data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task RACE \
       --seed 1234 \
       --train-data $TRAIN_DATA \
       --valid-data $VALID_DATA \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --epochs 3 \
       --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --checkpoint-activations \
       --lr 1.0e-5 \
       --lr-decay-style linear \
       --warmup 0.06 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --save-interval 100000 \
       --save $CHECKPOINT_PATH \
       --log-interval 10 \
       --eval-interval 100 \
       --eval-iters 50 \
       --weight-decay 1.0e-1 \
       --clip-grad 1.0 \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16
examples/generate_text.sh

 #!/bin/bash

-CHECKPOINT_PATH=checkpoints/gpt2_345m/
-MPSIZE=1
-NLAYERS=12
-NHIDDEN=768
-NATT=12
-MAXSEQLEN=1024
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt

-#SAMPLING ARGS
-TEMP=0.9
-#If TOPK/TOPP are 0 it defaults to greedy sampling, top-k will also override top-p
-TOPK=0
-TOPP=0
-
-python generate_samples.py \
-       --model-parallel-size $MPSIZE \
-       --num-layers $NLAYERS \
-       --hidden-size $NHIDDEN \
+python tools/generate_samples_gpt2.py \
+       --model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
        --load $CHECKPOINT_PATH \
-       --num-attention-heads $NATT \
+       --num-attention-heads 16 \
        --max-position-embeddings 1024 \
        --tokenizer-type GPT2BPETokenizer \
        --fp16 \
-       --cache-dir cache \
-       --out-seq-length $MAXSEQLEN \
-       --temperature $TEMP \
-       --top_k $TOPK \
-       --genfile dbg_unconditional.json \
-       --num-samples 10 \
-       --top_p $TOPP \
+       --batch-size 2 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --genfile unconditional_samples.json \
+       --num-samples 2 \
+       --top_p 0.9 \
+       --recompute
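
The sampling knobs removed above (TEMP, TOPK, TOPP, where 0 means greedy decoding and top-k takes precedence over top-p) describe a standard logits-filtering scheme. For reference only, here is a minimal PyTorch sketch of that filtering; the function name and shapes are illustrative and this is not Megatron's generate_samples implementation:

import torch

def filter_logits(logits, top_k=0, top_p=0.0):
    """Apply top-k or nucleus (top-p) filtering; 0 disables each, top-k wins."""
    logits = logits.clone()
    if top_k > 0:
        # Drop everything below the k-th largest logit.
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth] = float('-inf')
    elif top_p > 0.0:
        # Nucleus sampling: keep the smallest prefix of sorted tokens whose
        # cumulative probability exceeds top_p.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = False
        logits[sorted_idx[remove]] = float('-inf')
    return logits

# With both knobs at 0 nothing is masked and a caller would take the argmax
# (greedy); otherwise sample from the filtered distribution.
logits = torch.randn(50257)
probs = torch.softmax(filter_logits(logits, top_k=0, top_p=0.9), dim=-1)
next_token = torch.multinomial(probs, num_samples=1)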
examples/merge_mp_bert.sh (new file, mode 100755)

#!/bin/bash

MODEL_PARALLEL_SIZE=2

VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m

WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
       --model-type BERT \
       --model-parallel-size $MODEL_PARALLEL_SIZE \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --load $CHECKPOINT_PATH
examples/pretrain_albert.sh (deleted, was mode 100755)

#!/bin/bash

RANK=0
WORLD_SIZE=1

python pretrain_albert.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 10000 \
       --save checkpoints/albert_117m \
       --load checkpoints/albert_117m \
       --resume-dataloader \
       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
       --vocab data/megatron/vocab.txt \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding \
       --skip-mmap-warmup \
       --num-workers 0
examples/pretrain_albert_distributed.sh (deleted, was mode 100755)

#!/bin/bash

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_albert.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 10000 \
       --save checkpoints/albert_117m \
       --load checkpoints/albert_117m \
       --resume-dataloader \
       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
       --vocab data/megatron/vocab.txt \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding \
       --skip-mmap-warmup \
       --num-workers 0
examples/pretrain_bert.sh

@@ -2,6 +2,8 @@
 RANK=0
 WORLD_SIZE=1
+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>

 python pretrain_bert.py \
        --num-layers 24 \
...
@@ -9,26 +11,25 @@ python pretrain_bert.py \
        --num-attention-heads 16 \
        --batch-size 4 \
        --seq-length 512 \
        --max-preds-per-seq 80 \
        --max-position-embeddings 512 \
-       --train-iters 1000000 \
-       --save checkpoints/bert_345m \
-       --load checkpoints/bert_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type BertWordPieceTokenizer \
-       --tokenizer-model-type bert-large-uncased \
-       --presplit-sentences \
-       --cache-dir cache \
+       --train-iters 2000000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
        --lr 0.0001 \
+       --min-lr 0.00001 \
        --lr-decay-style linear \
        --lr-decay-iters 990000 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --warmup .01 \
-       --fp16 \
-       --fp32-layernorm \
-       --fp32-embedding
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16
examples/pretrain_bert_distributed.sh

@@ -8,36 +8,37 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>

 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        pretrain_bert.py \
+       --model-parallel-size 1 \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --batch-size 4 \
        --seq-length 512 \
        --max-preds-per-seq 80 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
-       --save checkpoints/bert_345m \
-       --load checkpoints/bert_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type BertWordPieceTokenizer \
-       --tokenizer-model-type bert-large-uncased \
-       --presplit-sentences \
-       --cache-dir cache \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
        --lr 0.0001 \
        --lr-decay-style linear \
+       --min-lr 1.0e-5 \
        --lr-decay-iters 990000 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --warmup .01 \
-       --fp16 \
-       --fp32-layernorm \
-       --fp32-embedding
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16
examples/pretrain_bert_model_parallel.sh (deleted, was mode 100755)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --model-parallel-size 2 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save checkpoints/bert_345m_mp2 \
       --load checkpoints/bert_345m_mp2 \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type BertWordPieceTokenizer \
       --tokenizer-model-type bert-large-uncased \
       --presplit-sentences \
       --cache-dir cache \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding
examples/pretrain_bert_sentencepiece.sh (deleted, was mode 100755)

#!/bin/bash

RANK=0
WORLD_SIZE=1

python pretrain_bert.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save checkpoints/bert_345m \
       --load checkpoints/bert_345m \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type SentencePieceTokenizer \
       --tokenizer-model-type bpe \
       --tokenizer-path tokenizer.model \
       --presplit-sentences \
       --cache-dir cache \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding
examples/pretrain_bert_tfrecords_distributed.sh (deleted, was mode 100755)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save checkpoints/bert_345m \
       --load checkpoints/bert_345m \
       --resume-dataloader \
       --use-tfrecords \
       --train-data <TF Record 1> <TFRecord 2> \
       --valid-data <TF Record 3> \
       --test-data <TF Record 4> \
       --tokenizer-type BertWordPieceTokenizer \
       --tokenizer-model-type bert-large-uncased \
       --presplit-sentences \
       --cache-dir cache \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding
examples/pretrain_gpt2.sh

@@ -5,6 +5,10 @@
 RANK=0
 WORLD_SIZE=1
+DATA_PATH=<Specify path and file prefix>_text_document
+CHECKPOINT_PATH=<Specify path>

 python pretrain_gpt2.py \
        --num-layers 24 \
        --hidden-size 1024 \
...
@@ -12,22 +16,27 @@ python pretrain_gpt2.py \
        --batch-size 8 \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
-       --train-iters 320000 \
-       --save checkpoints/gpt2_345m \
-       --load checkpoints/gpt2_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type GPT2BPETokenizer \
-       --cache-dir cache \
+       --train-iters 500000 \
+       --lr-decay-iters 320000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file gpt2-vocab.json \
+       --merge-file gpt2-merges.txt \
+       --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
        --lr 0.00015 \
+       --min-lr 1.0e-5 \
        --lr-decay-style cosine \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --warmup .01 \
        --checkpoint-activations \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
        --fp16
...
examples/pretrain_gpt2_distributed.sh

@@ -10,33 +10,43 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+DATA_PATH=<Specify path and file prefix>_text_document
+CHECKPOINT_PATH=<Specify path>

 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        pretrain_gpt2.py \
+       --model-parallel-size 1 \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --batch-size 8 \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
-       --train-iters 320000 \
-       --save checkpoints/gpt2_345m \
-       --load checkpoints/gpt2_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type GPT2BPETokenizer \
-       --cache-dir cache \
+       --train-iters 500000 \
+       --lr-decay-iters 320000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file gpt2-vocab.json \
+       --merge-file gpt2-merges.txt \
+       --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
        --lr 0.00015 \
        --lr-decay-style cosine \
+       --min-lr 1.0e-5 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --warmup .01 \
        --checkpoint-activations \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
        --fp16

 set +x
examples/pretrain_gpt2_model_parallel.sh (deleted, was mode 100755)

#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt2.py \
       --model-parallel-size 2 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 320000 \
       --save checkpoints/gpt2_345m_mp2 \
       --load checkpoints/gpt2_345m_mp2 \
       --resume-dataloader \
       --train-data wikipedia \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --cache-dir cache \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --fp16

set +x
examples/run_gpt2_eval.py (deleted, was mode 100644)

"""
example usage:
python scripts/run_gpt2_eval.py \
       --model-parallel-size 1 \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --model-path <gpt2_117_path> \
       --data-path <wikitext_tokens_test_path> \
       --batch-size 16 \
       --cache-dir <cache dir path>
"""
import argparse
import subprocess

parser = argparse.ArgumentParser('run zero shot GPT2 eval')
parser.add_argument('--model-path', type=str, required=True,
                    help='Saved model path for evaluation')
parser.add_argument('--batch-size', type=int, default=4,
                    help='batch size to use for evaluation')
parser.add_argument('--num-attention-heads', type=int, default=12,
                    help='num of transformer attention heads')
parser.add_argument('--hidden-size', type=int, default=768,
                    help='tansformer hidden size')
parser.add_argument('--num-layers', type=int, default=12,
                    help='num decoder layers')
parser.add_argument('--data-path', type=str, required=True,
                    help='Data path for evaluation data')
parser.add_argument('--cloze-eval', action='store_true',
                    help='Run lambada cloze eval instead of perplexity eval.')
parser.add_argument('--easy-lambada', action='store_true',
                    help='use easier formulation of lambada')
parser.add_argument('--model-parallel-size', type=int, default=1,
                    help='model parallel size to use')
args = parser.parse_args()

multinode_args = ''
if args.model_parallel_size > 1:
    multinode_args += ' -m torch.distributed.launch --nproc_per_node {} '.format(
        args.model_parallel_size)

CMD = ' --model-parallel-size {model_par} \
       --num-layers {nlayers} \
       --hidden-size {hidden} \
       --log-interval 100 \
       --load {model} \
       --batch-size {batch} \
       --num-attention-heads {natt} \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --distributed-backend nccl \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16 \
       --lr 1 --no-load-optim --no-load-rng --epochs 0 \
       --overlapping-eval 32 \
       --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
       --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json'.format(
    model_par=args.model_parallel_size,
    nlayers=args.num_layers,
    hidden=args.hidden_size,
    model=args.model_path,
    batch=args.batch_size,
    natt=args.num_attention_heads,)

if args.cloze_eval:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task LAMBADA '
    if not args.easy_lambada:
        CMD += ' --strict-lambada '
    CMD = 'main.py' + CMD
    print('Running Lambada Eval Command:', flush=True)
else:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task WIKITEXT103 '
    CMD = 'main.py' + CMD
    print('Running PPL Eval Command:', flush=True)

CMD = 'python3 ' + multinode_args + CMD
print(CMD, flush=True)
subprocess.call(CMD.split())
megatron/__init__.py

 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
megatron/arguments.py

 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -19,9 +19,11 @@ import argparse
 import os


-def parse_args(extra_args_provider=None, defaults={}):
+def parse_args(extra_args_provider=None, defaults={},
+               ignore_unknown_args=False):
     """Parse all arguments."""
-    parser = argparse.ArgumentParser(description='Megatron-LM Arguments')
+    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
+                                     allow_abbrev=False)

     # Standard arguments.
     parser = _add_network_size_args(parser)
...
@@ -35,24 +37,16 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
-    # TODO: Refactor
-    parser = _add_gpt2_args(parser)

     # Custom arguments.
     if extra_args_provider is not None:
         parser = extra_args_provider(parser)

     # Parse.
-    args = parser.parse_args()
-
-    # Set input defaults.
-    for key in defaults:
-        # For default to be valid, it should not be provided in the
-        # arguments that are passed to the program. We check this by
-        # ensuring the arg is set to None.
-        assert getattr(args, key) is None, \
-            'defaults can only be overwritten for args with None values.'
-        setattr(args, key, defaults[key])
+    if ignore_unknown_args:
+        args, _ = parser.parse_known_args()
+    else:
+        args = parser.parse_args()

     # Distributed args.
     args.rank = int(os.getenv('RANK', '0'))
...
@@ -67,6 +61,26 @@
     if args.loss_scale is None:
         args.dynamic_loss_scale = True

+    # Set input defaults.
+    for key in defaults:
+        # For default to be valid, it should not be provided in the
+        # arguments that are passed to the program. We check this by
+        # ensuring the arg is set to None.
+        if getattr(args, key) is not None:
+            if args.rank == 0:
+                print('WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(key=key, v=defaults[key],
+                                               v2=getattr(args, key)),
+                      flush=True)
+        else:
+            setattr(args, key, defaults[key])
+
+    # Check required arguments.
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
+                     'max_position_embeddings']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args, req_arg)
+
     # Checks.
     assert args.hidden_size % args.num_attention_heads == 0
     if args.seq_length is not None:
...
@@ -93,16 +107,20 @@ def _print_args(args):
     print('---------------- end of arguments ----------------', flush=True)


+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+
 def _add_network_size_args(parser):
     group = parser.add_argument_group(title='network size')

-    group.add_argument('--num-layers', type=int, required=True,
+    group.add_argument('--num-layers', type=int, default=None,
                        help='Number of transformer layers.')
-    group.add_argument('--hidden-size', type=int, required=True,
+    group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
-    group.add_argument('--num-attention-heads', type=int, required=True,
+    group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
-    group.add_argument('--max-position-embeddings', type=int, required=True,
+    group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                        'This is the size of position embedding.')
     group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
...
@@ -114,6 +132,10 @@ def _add_network_size_args(parser):
                        action='store_true',
                        help='If set, use original BERT residula connection '
                        'ordering.')
+    group.add_argument('--openai-gelu', action='store_true',
+                       help='Use OpenAIs GeLU implementation. This option'
+                       'should not be used unless for backward compatibility'
+                       'reasons.')

     return parser
...
@@ -322,6 +344,7 @@ def _add_data_args(parser):
     group.add_argument('--tokenizer-type', type=str,
                        default=None,
                        choices=['BertWordPieceLowerCase',
+                                'BertWordPieceCase',
                                 'GPT2BPETokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--data-impl', type=str, default='infer',
...
@@ -348,31 +371,3 @@ def _add_autoresume_args(parser):
                        'termination signal')

     return parser
-
-
-########################################################################
-
-
-def _add_gpt2_args(parser):
-    group = parser.add_argument_group(title='gpt2')
-
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='The filename containing all the shards '
-                       'sizes for numpy data loader')
-
-    return parser
-
-
-def add_data_args_(parser):
-    """Train/valid/test data arguments."""
-
-    group = parser.add_argument_group('data', 'data configurations')
-
-    group.add_argument('--data-loader', type=str, default=None,
-                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
-                       help='Which data loader to use. Default varies by model.')
-
-    return parser
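
Two behavioral changes in parse_args stand out in the diff above: unknown arguments can now be tolerated via ignore_unknown_args / parse_known_args, and a default that collides with a user-supplied flag no longer trips an assertion; the user's value wins, rank 0 prints a warning, and only arguments still set to None are back-filled before the required-argument check. A small self-contained sketch of that defaults pattern, using made-up argument names rather than Megatron's full parser:

import argparse

def parse_with_defaults(argv, defaults={}, ignore_unknown_args=False):
    parser = argparse.ArgumentParser(description='sketch', allow_abbrev=False)
    parser.add_argument('--num-layers', type=int, default=None)
    parser.add_argument('--hidden-size', type=int, default=None)
    if ignore_unknown_args:
        args, _ = parser.parse_known_args(argv)
    else:
        args = parser.parse_args(argv)
    for key, value in defaults.items():
        if getattr(args, key) is not None:
            # Command-line value wins; just warn about the collision.
            print('WARNING: overriding default arguments for {}:{} with {}:{}'
                  .format(key, value, key, getattr(args, key)), flush=True)
        else:
            setattr(args, key, value)
    return args

# --num-layers on the "command line" beats the default of 24; hidden_size was
# not given, so the default back-fills it; the unknown flag is ignored.
args = parse_with_defaults(['--num-layers', '12', '--unknown-flag', '1'],
                           defaults={'num_layers': 24, 'hidden_size': 1024},
                           ignore_unknown_args=True)
assert args.num_layers == 12 and args.hidden_size == 1024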