Commit 2f999d42 authored by hepj987's avatar hepj987
Browse files

dtk23.04带lightop,附fp16启动脚本

parent ba71120e
Pipeline #556 failed with stage
...@@ -138,7 +138,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --t ...@@ -138,7 +138,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --t
``` ```
#多节点运行 #多节点运行
sbatch run-16B.sh(主要参数在single-16B.sh) sbatch run-16B.sh(主要参数在single-16B.sh, 默认以fp32精度训练,如需采用fp16精度可执行sbatch run-16B-fp16.sh)
``` ```
``` ```
......
...@@ -32,7 +32,7 @@ import torch.nn.functional as F ...@@ -32,7 +32,7 @@ import torch.nn.functional as F
global fused_mix_prec_layer_norm_cuda global fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = None fused_mix_prec_layer_norm_cuda = None
from lightop import op
class FusedLayerNormAffineFunction(torch.autograd.Function): class FusedLayerNormAffineFunction(torch.autograd.Function):
...@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module): ...@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
return FusedLayerNormAffineFunction.apply( return FusedLayerNormAffineFunction.apply(
input, self.weight, self.bias, self.normalized_shape, self.eps) input, self.weight, self.bias, self.normalized_shape, self.eps)
else: else:
return F.layer_norm(input, self.normalized_shape, self.weight, self.bias) #return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
return op.layernorm_forward_autograd(input, self.weight,self.bias,self.eps)
...@@ -17,6 +17,7 @@ from functools import lru_cache ...@@ -17,6 +17,7 @@ from functools import lru_cache
import torch import torch
import torch.nn as nn import torch.nn as nn
from megatron.enums import AttnMaskType from megatron.enums import AttnMaskType
from lightop.fusesoftmax import FuseSoftmax
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
""" """
...@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module): ...@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
mask_output = self.mask_func(input, mask) if mask is not None else input mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output) #probs = torch.nn.Softmax(dim=-1)(mask_output)
probs = FuseSoftmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32: if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16: if self.input_in_fp16:
......
...@@ -32,7 +32,7 @@ import deepspeed ...@@ -32,7 +32,7 @@ import deepspeed
from .glu_activations import GLU_ACTIVATIONS from .glu_activations import GLU_ACTIVATIONS
from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb
from lightop import op
# flags required to enable jit fusion kernels # flags required to enable jit fusion kernels
torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_executor(False)
...@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule): ...@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
def bias_dropout_add(x, bias, residual, prob, training):
    # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
    """Bias-add + dropout + residual-add via the fused lightop kernel.

    Replaces the eager ``torch.nn.functional.dropout(x + bias)`` followed by
    ``residual + out`` with ``op.add_dropout_forward_autograd``, which fuses
    the dropout and the residual add (the commented-out eager path from the
    previous revision has been removed as dead code).

    NOTE(review): ``x + bias`` is still materialized eagerly — only the
    dropout + residual-add appear to be fused; confirm against the lightop
    kernel documentation.
    """
    out = op.add_dropout_forward_autograd(x + bias, residual, prob, training)
    return out
...@@ -418,13 +419,13 @@ def get_bias_dropout_add(training): ...@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
return _bias_dropout_add return _bias_dropout_add
# NOTE(review): @torch.jit.script was disabled in this commit — presumably the
# lightop-based bias_dropout_add is not TorchScript-compatible; confirm before
# re-enabling.
#@torch.jit.script
def bias_dropout_add_fused_train(x, bias, residual, prob):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    """Training-mode wrapper: bias + dropout (active) + residual add."""
    return bias_dropout_add(x, bias, residual, prob, True)
# NOTE(review): @torch.jit.script was disabled in this commit — presumably the
# lightop-based bias_dropout_add is not TorchScript-compatible; confirm before
# re-enabling.
#@torch.jit.script
def bias_dropout_add_fused_inference(x, bias, residual, prob):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    """Inference-mode wrapper: bias + dropout (inactive) + residual add."""
    return bias_dropout_add(x, bias, residual, prob, False)
......
#!/bin/bash
#SBATCH -p tydexclu01
#SBATCH -N 16
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J gpt2
#SBATCH -o logs/gpt2-16B-%j.out
#SBATCH -e logs/gpt2-16B-%j.out

# Launcher: derives a per-job mpirun hostfile from the Slurm allocation and
# starts the fp16 16B training, one single-16B-fp16.sh invocation per rank.
# Fixes over the previous revision: quoted expansions, $(...) instead of
# backticks, `while read` instead of word-splitting `for i in $(cat ...)`,
# and `sort -u | wc -l` instead of `cat | sort | uniq | wc -l`.

ulimit -u 200000

# RCCL over InfiniBand; fine-grained PCIe for the DCU HSA runtime.
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1

echo "START TIME: $(date)"

# Clean leftovers from previous jobs.
rm -f ./hostfile/*
rm -f core.*

hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"

# One "host slots=4" line per node for mpirun (4 DCUs per node).
while read -r host; do
    echo "$host slots=4" >> "$(pwd)/hostfile/hostfile-dl-$SLURM_JOB_ID"
done < "$hostfile"

# Total ranks = number of unique nodes * 4 DCUs.
np=$(sort -u "$hostfile" | wc -l)
np=$((np * 4))

# First allocated node acts as the rendezvous host for torch.distributed.
dist_url=$(head -n 1 "$hostfile" | awk '{print $1}')

mpirun -np "$np" --allow-run-as-root \
    --hostfile "hostfile/hostfile-dl-$SLURM_JOB_ID" \
    --bind-to none "$(pwd)/single-16B-fp16.sh" "$dist_url"
#!/bin/bash
# Per-rank worker for fp16 GPT-2 16B (Megatron-DeepSpeed) training on a DCU
# cluster. Invoked once per rank by mpirun (see the run-16B* launcher); $1 is
# the rendezvous hostname used to build the torch.distributed init URL.

# RCCL/ROCm environment: InfiniBand NIC selection and fp16 math behavior.
export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
# NOTE(review): selects the rocBLAS fp16 compute-type mode — confirm the
# exact semantics of value 0 against the rocBLAS docs for this DTK release.
export ROCBLAS_COMPUTETYPE_FP16R=0

# Rank bookkeeping from Open MPI; lrank selects the NUMA/NIC binding below.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

# Run identity and input/output paths.
MODEL_NAME=gpt2-oscar_16B-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=./output-module/$MODEL_NAME
DATA_PATH="my-gpt2_text_document"
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

# Parallelism layout and model shape.
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=4 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048
SAVE_INTERVAL=1000

OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"

# NOTE(review): --train_iters uses an underscore while every other flag is
# hyphenated — verify the training script actually accepts this spelling.
# The missing continuation backslash after "--seed 42" is harmless here:
# the unquoted $GPT_ARGS expansion word-splits across the embedded newline.
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train_iters 8000 \
--loss-scale 12 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--clip-grad 1.0 \
--checkpoint-activations \
--seed 42
$OPTIMIZER_ARGS \
"

# Logging/eval cadence.
OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 1000 \
--eval-iters 40 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"

DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"

ZERO_STAGE=1
config_json="./${MODEL_NAME}_ds_config.json"

# Generate the DeepSpeed config. "loss_scale": 0 enables dynamic fp16 loss
# scaling starting at 2^12. NOTE(review): Megatron also receives a static
# --loss-scale 12 above — which of the two takes precedence depends on the
# Megatron-DeepSpeed integration; confirm.
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"

export CMD=" \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"

# Full command line; ${1} is the rendezvous host passed by the launcher.
APP="python3 -u `pwd`/pretrain_gpt.py \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
--num-workers 2 \
${CMD} \
"

# Pin each local rank to its own NUMA node and the matching mlx5 HCA.
# All four DCUs stay visible to every rank; presumably the Python side
# selects its device from the local rank — TODO confirm.
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment