Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
GPT2_pytorch
Commits
17814e61
Commit
17814e61
authored
Jul 14, 2023
by
hepj987
Browse files
调整格式
parent
c338d32c
Pipeline
#436
failed with stage
Changes
2
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
162 deletions
+0
-162
megatron/create-data.sh
megatron/create-data.sh
+0
-9
megatron/run-one-node.sh
megatron/run-one-node.sh
+0
-153
No files found.
megatron/create-data.sh
deleted
100644 → 0
View file @
c338d32c
#!/bin/bash
# Preprocess the OSCAR JSONL corpus into Megatron's mmap binary/index format
# (./data/my-gpt2_*.bin/.idx) using the GPT-2 BPE tokenizer.
# Expects in the working directory:
#   oscar-1GB.jsonl, gpt2-vocab.json, gpt2-merges.txt
# Fail fast: a partially written dataset is worse than no dataset.
set -euo pipefail

python tools/preprocess_data.py \
    --input oscar-1GB.jsonl \
    --output-prefix ./data/my-gpt2 \
    --vocab gpt2-vocab.json \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --merge-file gpt2-merges.txt \
    --append-eod \
    --workers 8
\ No newline at end of file
megatron/run-one-node.sh
deleted
100644 → 0
View file @
c338d32c
#!/bin/bash
# Per-rank launcher for single-node Megatron GPT-2 training under mpirun.

# ROCm runtime tuning: fine-grained PCIe memory, fast MIOpen kernel search.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3

# Rank topology handed down by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK   # rank within this node; drives NUMA/HCA binding below
RANK=$OMPI_COMM_WORLD_RANK          # global rank
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE    # total number of ranks
# ----- experiment identity and output paths ---------------------------------
MODEL_NAME=gpt2-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkpoint/$MODEL_NAME
DATA_PATH=./data/my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

# ----- parallelism layout ---------------------------------------------------
N_GPUS=4
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=1 #128 #96 # NLAYERS must be a multiple of PP_SIZE here

# ----- batch sizes and model geometry ---------------------------------------
MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32 #256 #1536
NLAYERS=24
NHIDDEN=1024 #12480
NHEADS=16
SEQ_LEN=1024
SAVE_INTERVAL=1000
#rampup-batch-size 16 16 5859375
# Adam with cosine learning-rate decay; consumed unquoted inside GPT_ARGS,
# so internal whitespace is collapsed by word-splitting at the call site.
OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "
# Core model/training flags for pretrain_gpt.py.
# NOTE(review): --clip-grad 1.0 is also present in OPTIMIZER_ARGS, which is
# appended at the end — reproduced as-is; verify the duplicate is harmless.
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-iters 50 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --clip-grad 1.0 \
    --fp16 \
    --checkpoint-activations \
    --seed 42 $OPTIMIZER_ARGS \
    "
# Logging, checkpoint cadence, evaluation cadence, and TensorBoard sinks.
OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 10 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "
# Checkpoint save/load location and the preprocessed dataset prefix.
DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "
# Generate the per-model DeepSpeed runtime config (ZeRO stage 1, fp16 with
# dynamic loss scaling). The heredoc expands the shell variables at write time.
ZERO_STAGE=1

config_json="./${MODEL_NAME}_ds_config.json"

# Quote the redirection target (SC2086): the unquoted form breaks if
# MODEL_NAME ever contains whitespace or glob characters.
cat <<EOT > "$config_json"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT
# Flags that switch pretrain_gpt.py onto the DeepSpeed engine using the
# config file generated above.
DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "
# Full training command line; executed unquoted by the case-dispatch below so
# the embedded *_ARGS strings word-split into individual flags.
# NOTE(review): the double colon in '--dist_url env://127.0.0.1::34566' is
# reproduced exactly as found — confirm it is intentional.
APP="python pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url env://127.0.0.1::34566 \
    "
# Bind each local rank to its matching NUMA node and InfiniBand HCA, then run
# the training command. The four original arms were identical except that the
# mlx5 device index, --cpunodebind, and --membind all equaled the local rank,
# so a single arm parameterized by ${lrank} produces the exact same commands.
# Every rank exports the same HIP_VISIBLE_DEVICES=0,1,2,3, as before.
case ${lrank} in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment