OpenDAS / Megatron-LM · Commit 94dbfd1c

Fix distributed pretraining examples

Authored Feb 17, 2023 by Maanu Grover; committed by Jared Casper, Feb 17, 2023.
Parent: 285068c8
Changes: 6 files, 333 additions and 212 deletions (+333 / -212).

examples/pretrain_bert_distributed.sh            +52 / -32
examples/pretrain_bert_distributed_with_mp.sh    +54 / -35
examples/pretrain_gpt_distributed.sh             +55 / -35
examples/pretrain_gpt_distributed_with_mp.sh     +59 / -38
examples/pretrain_t5_distributed.sh              +56 / -36
examples/pretrain_t5_distributed_with_mp.sh      +57 / -36
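All six scripts are rewritten along the same lines: the legacy python -m torch.distributed.launch invocation is replaced by torchrun, export CUDA_DEVICE_MAX_CONNECTIONS=1 is added at the top, vocabulary and merge files move into VOCAB_FILE / MERGE_FILE variables, and the flat flag list is regrouped into model, data, and output argument variables. A minimal sketch of just the launch step, condensed from the BERT diff (only the first model flag is shown; the rest of the flat list follows in the same way):

# Before: legacy launcher, every flag passed inline
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --num-layers 24        # ...followed by the remaining flags

# After: torchrun, flags grouped into shell variables
torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
    $BERT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH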
examples/pretrain_bert_distributed.sh  (view file @ 94dbfd1c, +52 / -32)

New version as displayed in the diff; unchanged context is elided at the @@ hunk marker. The previous version launched training with python -m torch.distributed.launch, kept DISTRIBUTED_ARGS on a single line, passed every flag inline, and hard-coded --vocab-file bert-vocab.txt.

#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -8,37 +10,55 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

BERT_ARGS="
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --micro-batch-size 4 \
    --global-batch-size 32 \
    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 990000 \
    --lr-decay-style linear \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
    $BERT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
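As a usage note, not part of the commit: the three <Specify ...> placeholders have to be edited before the script will run, since they are plain assignments inside the file rather than environment overrides. A hedged example of what the edited lines might look like, with made-up paths:

# Hypothetical values substituted for the placeholders inside the script
CHECKPOINT_PATH=/workspace/checkpoints/bert_345m
VOCAB_FILE=/workspace/data/bert-vocab.txt
DATA_PATH=/workspace/data/my_corpus_text_sentence   # prefix of the preprocessed dataset

# then, from the repository root:
bash examples/pretrain_bert_distributed.sh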
examples/pretrain_bert_distributed_with_mp.sh  (view file @ 94dbfd1c, +54 / -35)

New version as displayed in the diff; unchanged context is elided at the @@ hunk marker. The previous version used python -m torch.distributed.launch with all flags inline and pointed VOCAB_FILE at a generic <Specify path to vocab.txt> placeholder.

#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -8,40 +10,57 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

BERT_ARGS="
    --tensor-model-parallel-size 2 \
    --pipeline-model-parallel-size 2 \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --micro-batch-size 2 \
    --global-batch-size 16 \
    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 990000 \
    --lr-decay-style linear \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
    $BERT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
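A quick sanity check of the parallel layout in this model-parallel variant, assuming Megatron's usual decomposition where world size = tensor-parallel size x pipeline-parallel size x data-parallel size, and global batch = micro-batch x data-parallel size x gradient-accumulation steps; the arithmetic below is an illustration, not output from the commit:

# 8 GPUs per node, 1 node -> 8 ranks
# tensor-parallel 2 * pipeline-parallel 2 = 4 ranks per model replica
# data-parallel size = 8 / 4 = 2
# gradient-accumulation steps = 16 / (2 * 2) = 4
echo $(( 16 / (2 * (8 / (2 * 2))) ))   # prints 4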
examples/pretrain_gpt_distributed.sh  (view file @ 94dbfd1c, +55 / -35)

New version as displayed in the diff; unchanged context is elided at the @@ hunk marker. The previous version used python -m torch.distributed.launch with all flags inline, hard-coded gpt2-vocab.json and gpt2-merges.txt, and passed --activations-checkpoint-method uniform, which the new version drops.

#!/bin/bash

# Runs the "345M" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -10,39 +12,57 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --micro-batch-size 8 \
    --global-batch-size 64 \
    --lr 0.00015 \
    --train-iters 500000 \
    --lr-decay-iters 320000 \
    --lr-decay-style cosine \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
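The "# Change for multinode config" comment marks the variables to edit for multi-node runs. A hedged two-node sketch follows; the hostname and port are invented, and every node runs the same script with its own NODE_RANK:

# On node 0 (the rendezvous master):
MASTER_ADDR=gpu-node-0        # hypothetical hostname reachable from all nodes
MASTER_PORT=6000              # any free TCP port
NNODES=2
NODE_RANK=0

# On node 1: identical values except
NODE_RANK=1

# WORLD_SIZE then evaluates to GPUS_PER_NODE * NNODES = 16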
examples/pretrain_gpt_distributed_with_mp.sh  (view file @ 94dbfd1c, +59 / -38)

New version as displayed in the diff; unchanged context is elided at the @@ hunk marker. The previous version used python -m torch.distributed.launch with all flags inline, hard-coded gpt2-vocab.json and gpt2-merges.txt, and passed --activations-checkpoint-method uniform, which the new version drops.

#!/bin/bash

# Runs the "345M" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -10,42 +12,61 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --tensor-model-parallel-size 2 \
    --pipeline-model-parallel-size 2 \
    --sequence-parallel \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --micro-batch-size 4 \
    --global-batch-size 16 \
    --lr 0.00015 \
    --train-iters 500000 \
    --lr-decay-iters 320000 \
    --lr-decay-style cosine \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
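Every rewritten script now starts with export CUDA_DEVICE_MAX_CONNECTIONS=1, and this variant is the one that also enables --sequence-parallel on top of tensor and pipeline parallelism. A hedged convenience guard one might add to a site-specific wrapper, not something the commit contains:

# Hypothetical wrapper-side guard before invoking the example script
if [ "${CUDA_DEVICE_MAX_CONNECTIONS:-}" != "1" ]; then
    echo "note: exporting CUDA_DEVICE_MAX_CONNECTIONS=1, as the Megatron examples do" >&2
    export CUDA_DEVICE_MAX_CONNECTIONS=1
fi
bash examples/pretrain_gpt_distributed_with_mp.sh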
examples/pretrain_t5_distributed.sh  (view file @ 94dbfd1c, +56 / -36)

New version as displayed in the diff; unchanged context is elided at the @@ hunk markers. The previous version used python -m torch.distributed.launch with all flags inline; its DATA_PATH placeholder lacked the _text_sentence suffix and its VOCAB_FILE was a generic <Specify path to vocab.txt>.

#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -8,14 +10,19 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

T5_ARGS="
    --num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \

@@ -23,26 +30,39 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \

    --ffn-hidden-size 3072 \
    --encoder-seq-length 512 \
    --decoder-seq-length 128 \
    --max-position-embeddings 512 \
    --micro-batch-size 16 \
    --global-batch-size 128 \
    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 1000000 \
    --lr-decay-style linear \
    --min-lr 0.00001 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16 \
    --vocab-extra-ids 100
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
    $T5_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
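One shell detail behind the new structure, a general bash mechanism rather than anything specific to this commit: the *_ARGS variables are assigned as multi-line double-quoted strings, so each backslash-newline pair is removed at assignment time, and expanding the variable unquoted on the torchrun line lets word splitting turn the remaining whitespace into a flat argument list. A tiny stand-alone illustration:

# Assignment: backslash-newline inside double quotes is removed
ARGS="
    --log-interval 100 \
    --eval-iters 10
"

# Unquoted expansion word-splits into four arguments
printf '%s\n' $ARGS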
examples/pretrain_t5_distributed_with_mp.sh  (view file @ 94dbfd1c, +57 / -36)

New version as displayed in the diff; unchanged context is elided at the @@ hunk markers. The previous version used python -m torch.distributed.launch with all flags inline, hard-coded --vocab-file t5-vocab.txt, and its DATA_PATH placeholder lacked the _text_sentence suffix.

#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost

@@ -8,13 +10,19 @@ NNODES=1

NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

T5_ARGS="
    --tensor-model-parallel-size 2 \
    --num-layers 12 \
    --hidden-size 768 \

@@ -23,26 +31,39 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \

    --ffn-hidden-size 3072 \
    --encoder-seq-length 512 \
    --decoder-seq-length 128 \
    --max-position-embeddings 512 \
    --micro-batch-size 16 \
    --global-batch-size 128 \
    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 1000000 \
    --lr-decay-style linear \
    --min-lr 0.00001 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16 \
    --vocab-extra-ids 100
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
    $T5_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
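Since the point of the commit is to fix these example scripts, a cheap way to verify their shell syntax without launching anything is bash's parse-only mode. Note that the <Specify ...> placeholders are not valid shell, so this only passes once they have been filled in; the loop is a general technique, not part of the commit:

# Parse-only check of all six rewritten examples (after editing the placeholders)
for f in examples/pretrain_*_distributed*.sh; do
    bash -n "$f" && echo "syntax OK: $f"
done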