Project: dcuai / dlexamples
Commit 316d3f90, authored Jul 14, 2022 by Pan,Huiwen
Commit message: Add DeepSpeed framework test models
Parent: aebde649
Showing 7 changed files, with 264 additions and 0 deletions:
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/finetune_race_distributed.sh    +47 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/generate_text.sh                +25 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/merge_mp_bert.sh                +18 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_bert.sh                +35 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_bert_distributed.sh    +44 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_gpt2.sh                +43 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_gpt2_distributed.sh    +52 -0
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/finetune_race_distributed.sh (new file, mode 100644)
#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
            data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
       --task RACE \
       --seed 1234 \
       --train-data $TRAIN_DATA \
       --valid-data $VALID_DATA \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --epochs 3 \
       --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --checkpoint-activations \
       --lr 1.0e-5 \
       --lr-decay-style linear \
       --warmup 0.06 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --save-interval 100000 \
       --save $CHECKPOINT_PATH \
       --log-interval 10 \
       --eval-interval 100 \
       --eval-iters 50 \
       --weight-decay 1.0e-1 \
       --clip-grad 1.0 \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16
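
This script fine-tunes the 345M BERT checkpoint on the RACE reading-comprehension task with 8 worker processes on a single node. Adapting it to more than one node only requires changing the launcher arguments; a minimal sketch for two 8-GPU nodes follows (the address is a placeholder, not a value from this commit):

# Hypothetical two-node variant of DISTRIBUTED_ARGS; run the script once per
# node, changing --node_rank to 1 on the second node.
DISTRIBUTED_ARGS="--nproc_per_node 8 \
                  --nnodes 2 \
                  --node_rank 0 \
                  --master_addr 10.0.0.1 \
                  --master_port 6000"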
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/generate_text.sh (new file, mode 100644)
#!/bin/bash

CHECKPOINT_PATH=checkpoints/gpt2_345m
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt

python tools/generate_samples_gpt2.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load $CHECKPOINT_PATH \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --batch-size 2 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --genfile unconditional_samples.json \
       --num-samples 2 \
       --top_p 0.9 \
       --recompute
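
This script samples unconditionally from a 345M GPT-2 checkpoint, using nucleus sampling (--top_p 0.9) at temperature 1.0, and writes the generations to unconditional_samples.json. To eyeball the result after a run, without assuming a particular JSON layout (the genfile schema can vary across Megatron versions):

# Peek at the start of the generated-samples file:
head -c 2000 unconditional_samples.json; echo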
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/merge_mp_bert.sh (new file, mode 100644)
#!/bin/bash

MODEL_PARALLEL_SIZE=2

VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m

WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
       --model-type BERT \
       --model-parallel-size $MODEL_PARALLEL_SIZE \
       --tokenizer-type BertWordPieceLowerCase \
       --vocab-file $VOCAB_FILE \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --load $CHECKPOINT_PATH
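
Note the inline WORLD_SIZE=$MODEL_PARALLEL_SIZE prefix on the python command: it sets the variable in the environment of merge_mp_partitions.py only, so the merge tool sees a world size equal to the number of model-parallel partitions being combined into one checkpoint. The scoping is plain shell behavior:

# An inline assignment is visible only to the command it prefixes:
FOO=bar python3 -c 'import os; print(os.environ["FOO"])'   # prints: bar
echo "${FOO:-unset}"                                       # prints: unset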
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_bert.sh (new file, mode 100644)
#!/bin/bash

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

python pretrain_bert.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --train-iters 2000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
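
The <Specify ...> placeholders are left for the user to fill in: DATA_PATH is the prefix of the .bin/.idx pair emitted by Megatron's data preprocessing, not a single file. A hypothetical fill-in (the corpus name is illustrative only, not part of this commit):

# Assumes tools/preprocess_data.py was run with --split-sentences and
# --output-prefix my-corpus, yielding my-corpus_text_sentence.bin/.idx:
DATA_PATH=my-corpus_text_sentence
CHECKPOINT_PATH=checkpoints/bert_345m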
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_bert_distributed.sh (new file, mode 100644)
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --min-lr 1.0e-5 \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
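
For reference, torch.distributed.launch starts GPUS_PER_NODE worker processes per node, handing each a --local_rank argument plus the MASTER_ADDR/MASTER_PORT/WORLD_SIZE/RANK environment variables that Megatron's env:// initialization reads. Unrolled by hand for a hypothetical 2-GPU node (illustration only; training flags elided, and the real script leaves all of this to the launcher):

# Two workers, global ranks 0 and 1, each pinned to its local GPU:
MASTER_ADDR=localhost MASTER_PORT=6000 WORLD_SIZE=2 RANK=0 \
    python pretrain_bert.py --local_rank 0 ... &
MASTER_ADDR=localhost MASTER_PORT=6000 WORLD_SIZE=2 RANK=1 \
    python pretrain_bert.py --local_rank 1 ... &
wait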
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_gpt2.sh (new file, mode 100644)
#! /bin/bash

# Runs the "345M" parameter model

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

python pretrain_gpt2.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --min-lr 1.0e-5 \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16

set +x
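
The trailing set +x disables shell command tracing; as written it is a no-op, since tracing is never switched on earlier in the script. It only has an effect if the launch is bracketed, which can be handy for debugging the expanded argument list:

set -x        # print each command with variables expanded
# ... python pretrain_gpt2.py <flags as above> ...
set +x        # restore quiet output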
Deepspeed/Megatron-LM-v1.1.5-ZeRO3/examples/pretrain_gpt2_distributed.sh (new file, mode 100644)
#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt2.py \
       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --lr-decay-style cosine \
       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16

set +x
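
A forward-compatibility note: torch.distributed.launch is deprecated in newer PyTorch releases in favor of torchrun. A hypothetical equivalent launch is sketched below; torchrun supplies the local rank through the LOCAL_RANK environment variable rather than the --local_rank argument this Megatron version parses, so the worker-side argument handling may need adjusting:

# Sketch only; launcher flags carried over from the script above:
torchrun --nproc_per_node $GPUS_PER_NODE --nnodes $NNODES \
         --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
         pretrain_gpt2.py  # ...same Megatron flags as in the script above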