Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
GPT2_pytorch
Commits
2f999d42
Commit
2f999d42
authored
Sep 09, 2023
by
hepj987
Browse files
dtk23.04带lightop,附fp16启动脚本
parent
ba71120e
Pipeline
#556
failed with stage
Changes
6
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
203 additions
and
9 deletions
+203
-9
README.md
README.md
+1
-1
megatron/model/fused_layer_norm.py
megatron/model/fused_layer_norm.py
+3
-2
megatron/model/fused_softmax.py
megatron/model/fused_softmax.py
+3
-1
megatron/model/transformer.py
megatron/model/transformer.py
+6
-5
run-16B-fp16.sh
run-16B-fp16.sh
+31
-0
single-16B-fp16.sh
single-16B-fp16.sh
+159
-0
No files found.
README.md
View file @
2f999d42
...
@@ -138,7 +138,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --t
...
@@ -138,7 +138,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --t
```
```
#多节点运行
#多节点运行
sbatch run-16B.sh(主要参数在single-16B.sh)
sbatch run-16B.sh(主要参数在single-16B.sh
, 默认以fp32精度训练,如需采用fp16精度可执行sbatch run-16B-fp16.sh
)
```
```
```
```
...
...
megatron/model/fused_layer_norm.py
View file @
2f999d42
...
@@ -32,7 +32,7 @@ import torch.nn.functional as F
...
@@ -32,7 +32,7 @@ import torch.nn.functional as F
global
fused_mix_prec_layer_norm_cuda
global
fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda
=
None
fused_mix_prec_layer_norm_cuda
=
None
from
lightop
import
op
class
FusedLayerNormAffineFunction
(
torch
.
autograd
.
Function
):
class
FusedLayerNormAffineFunction
(
torch
.
autograd
.
Function
):
...
@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
...
@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
return
FusedLayerNormAffineFunction
.
apply
(
return
FusedLayerNormAffineFunction
.
apply
(
input
,
self
.
weight
,
self
.
bias
,
self
.
normalized_shape
,
self
.
eps
)
input
,
self
.
weight
,
self
.
bias
,
self
.
normalized_shape
,
self
.
eps
)
else
:
else
:
return
F
.
layer_norm
(
input
,
self
.
normalized_shape
,
self
.
weight
,
self
.
bias
)
#return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
return
op
.
layernorm_forward_autograd
(
input
,
self
.
weight
,
self
.
bias
,
self
.
eps
)
megatron/model/fused_softmax.py
View file @
2f999d42
...
@@ -17,6 +17,7 @@ from functools import lru_cache
...
@@ -17,6 +17,7 @@ from functools import lru_cache
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
megatron.enums
import
AttnMaskType
from
megatron.enums
import
AttnMaskType
from
lightop.fusesoftmax
import
FuseSoftmax
class
ScaledUpperTriangMaskedSoftmax
(
torch
.
autograd
.
Function
):
class
ScaledUpperTriangMaskedSoftmax
(
torch
.
autograd
.
Function
):
"""
"""
...
@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
...
@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
mask_output
=
self
.
mask_func
(
input
,
mask
)
if
mask
is
not
None
else
input
mask_output
=
self
.
mask_func
(
input
,
mask
)
if
mask
is
not
None
else
input
probs
=
torch
.
nn
.
Softmax
(
dim
=-
1
)(
mask_output
)
#probs = torch.nn.Softmax(dim=-1)(mask_output)
probs
=
FuseSoftmax
(
dim
=-
1
)(
mask_output
)
if
self
.
input_in_float16
and
self
.
softmax_in_fp32
:
if
self
.
input_in_float16
and
self
.
softmax_in_fp32
:
if
self
.
input_in_fp16
:
if
self
.
input_in_fp16
:
...
...
megatron/model/transformer.py
View file @
2f999d42
...
@@ -32,7 +32,7 @@ import deepspeed
...
@@ -32,7 +32,7 @@ import deepspeed
from
.glu_activations
import
GLU_ACTIVATIONS
from
.glu_activations
import
GLU_ACTIVATIONS
from
.positional_embeddings
import
RotaryEmbedding
,
apply_rotary_pos_emb_torch
,
apply_rotary_pos_emb
from
.positional_embeddings
import
RotaryEmbedding
,
apply_rotary_pos_emb_torch
,
apply_rotary_pos_emb
from
lightop
import
op
# flags required to enable jit fusion kernels
# flags required to enable jit fusion kernels
torch
.
_C
.
_jit_set_profiling_mode
(
False
)
torch
.
_C
.
_jit_set_profiling_mode
(
False
)
torch
.
_C
.
_jit_set_profiling_executor
(
False
)
torch
.
_C
.
_jit_set_profiling_executor
(
False
)
...
@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
...
@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
def
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
training
):
def
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
training
):
# type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
# type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
out
=
torch
.
nn
.
functional
.
dropout
(
x
+
bias
,
p
=
prob
,
training
=
training
)
#out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
out
=
residual
+
out
#out = residual + out
out
=
op
.
add_dropout_forward_autograd
(
x
+
bias
,
residual
,
prob
,
training
)
return
out
return
out
...
@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
...
@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
return
_bias_dropout_add
return
_bias_dropout_add
@
torch
.
jit
.
script
#
@torch.jit.script
def
bias_dropout_add_fused_train
(
x
,
bias
,
residual
,
prob
):
def
bias_dropout_add_fused_train
(
x
,
bias
,
residual
,
prob
):
# type: (Tensor, Tensor, Tensor, float) -> Tensor
# type: (Tensor, Tensor, Tensor, float) -> Tensor
return
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
True
)
return
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
True
)
@
torch
.
jit
.
script
#
@torch.jit.script
def
bias_dropout_add_fused_inference
(
x
,
bias
,
residual
,
prob
):
def
bias_dropout_add_fused_inference
(
x
,
bias
,
residual
,
prob
):
# type: (Tensor, Tensor, Tensor, float) -> Tensor
# type: (Tensor, Tensor, Tensor, float) -> Tensor
return
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
False
)
return
bias_dropout_add
(
x
,
bias
,
residual
,
prob
,
False
)
...
...
run-16B-fp16.sh
0 → 100644
View file @
2f999d42
#!/bin/bash
# Slurm batch launcher for 16-node fp16 GPT-2 16B training.
# Allocates 16 nodes x 4 DCUs, builds an OpenMPI hostfile from the Slurm
# allocation, then mpirun's single-16B-fp16.sh on every rank with the
# first node's address as the rendezvous host.
#SBATCH -p tydexclu01
#SBATCH -N 16
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J gpt2
#SBATCH -o logs/gpt2-16B-%j.out
#SBATCH -e logs/gpt2-16B-%j.out
# Raise the per-user process limit; MPI + data-loader workers spawn many procs.
ulimit -u 200000
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
echo "START TIME: $(date)"
# Clear stale hostfiles and core dumps from previous runs.
rm -f ./hostfile/*
rm -f core.*
hostfile=./hostfile/$SLURM_JOB_ID
# One hostname per allocated node.
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
# Build the OpenMPI hostfile: 4 slots (DCUs) per node.
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
# np = (number of unique nodes) * 4 DCUs per node.
np=$(cat $hostfile |sort|uniq |wc -l)
np=$(($np*4))
# First node in the list acts as the distributed rendezvous address.
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
# --bind-to none: CPU pinning is done per-rank via numactl in single-16B-fp16.sh.
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single-16B-fp16.sh $dist_url
single-16B-fp16.sh
0 → 100644
View file @
2f999d42
#!/bin/bash
# Per-rank launcher for fp16 GPT-2 16B training, invoked once per MPI rank
# by run-16B-fp16.sh (argument $1 is the rendezvous node's address).
# Selects NIC/NUMA binding from the node-local rank, writes the DeepSpeed
# fp16 config, and execs pretrain_gpt.py.
export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
# NOTE(review): presumably disables rocBLAS fp16r compute-type fast path — confirm.
export ROCBLAS_COMPUTETYPE_FP16R=0
# Map OpenMPI rank variables to the names pretrain_gpt.py expects.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
MODEL_NAME=gpt2-oscar_16B-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=./output-module/$MODEL_NAME
DATA_PATH="my-gpt2_text_document"
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
TP_SIZE=4    # always fixed to the size of a single node
PP_SIZE=4    # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
# 16B-parameter model geometry.
NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048
SAVE_INTERVAL=1000
OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train_iters 8000 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --clip-grad 1.0 \
    --checkpoint-activations \
    --seed 42 $OPTIMIZER_ARGS \
    "
OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 1000 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "
DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "
ZERO_STAGE=1
# Write the DeepSpeed config; "fp16.enabled": true with dynamic loss scaling
# (loss_scale 0) is what makes this the fp16 variant of the run.
config_json="./${MODEL_NAME}_ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "
export CMD=" \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    "
# $1 is the rendezvous host passed by run-16B-fp16.sh; port 34566 is fixed.
APP="python3 -u `pwd`/pretrain_gpt.py \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${1}:34566 \
    --num-workers 2 \
    ${CMD} \
    "
# Bind each node-local rank to its own IB HCA and NUMA node; all four DCUs
# stay visible to every rank.
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment