train_mixtral_8x22B_1nodes.sh 4.85 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#!/bin/bash

for para in $*
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
    fi
done

# Runs Mixtral 8x22B model
source /opt/dtk/env.sh

# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
silencealiang's avatar
silencealiang committed
19
20
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10

# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"

# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1

# data path
CHECKPOINT_PATH="path to CKPT" 
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

MODEL_ARGS=(
    --use-mcore-models
    --disable-bias-linear
    --seq-length 4096
    --max-position-embeddings 65536
    --num-layers 4
    --hidden-size 6144
    --ffn-hidden-size 16384
    --num-attention-heads 48
    --init-method-std 0.01
    --attention-dropout 0.0
    --hidden-dropout 0.0
    --normalization RMSNorm
    --position-embedding-type rope
    --swiglu
    --untie-embeddings-and-output-weights
    --group-query-attention
    --num-query-groups 8
    --no-masked-softmax-fusion
    --no-position-embedding
    --rotary-base 1000000
    --ckpt-format torch
)

MOE_ARGS=(
    --num-experts 8
    --moe-router-topk 2
    --moe-router-load-balancing-type aux_loss
    --moe-aux-loss-coeff 1e-3
    --moe-token-dispatcher-type alltoall
    --moe-expert-capacity-factor 0.5
    --moe-pad-expert-input-to-capacity
    #--moe-grouped-gemm
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL}
    --data-path $DATA_PATH
    --split 99990,8,2
)

TRAINING_ARGS=(
    --micro-batch-size 1
    --global-batch-size 256
    --lr 1e-4
    --train-iters 10
silencealiang's avatar
silencealiang committed
99
    --lr-decay-iters 10000
100
    --lr-decay-style cosine
silencealiang's avatar
silencealiang committed
101
    --min-lr 1.0e-6
102
    --weight-decay 0.1
silencealiang's avatar
silencealiang committed
103
    --lr-warmup-iters 2000
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
    --clip-grad 1.0
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1
    --expert-model-parallel-size 8
    --expert-tensor-parallel-size 1
silencealiang's avatar
silencealiang committed
132
    --context-parallel-size 1
133
134
135
136
137
138
139
140
141
142
143
144
145
146
    --use-distributed-optimizer
    --sequence-parallel
)

LOGGING_ARGS=(
    --log-throughput \
    --log-interval 1 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters -1 \
    #--save $CHECKPOINT_PATH \
    #--load $CHECKPOINT_PATH \
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
    --no-load-optim \
silencealiang's avatar
silencealiang committed
147
148
    --no-load-rng \
    --no-save-optim
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
)

if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
    )
fi

APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${DISTRIBUTED_ARGS[@]} \
    ${MODEL_ARGS[@]} \
    ${MOE_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${LOGGING_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]}"
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
case ${LOCAL_RANK} in
[0])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
180
  numactl --cpunodebind=0 --membind=0 ${APP}
181
182
183
  ;;
[1])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
184
  numactl --cpunodebind=1 --membind=1 ${APP}
185
186
187
  ;;
[2])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
188
  numactl --cpunodebind=2 --membind=2 ${APP}
189
190
191
  ;;
[3])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
192
  numactl --cpunodebind=3 --membind=3 ${APP}
193
194
195
  ;;
[4])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
196
  numactl --cpunodebind=4 --membind=4 ${APP}
197
198
199
  ;;
[5])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
200
  numactl --cpunodebind=5 --membind=5 ${APP}
201
202
203
  ;;
[6])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
204
  numactl --cpunodebind=6 --membind=6 ${APP}
205
206
207
  ;;
[7])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
silencealiang's avatar
silencealiang committed
208
  numactl --cpunodebind=7 --membind=7 ${APP}
209
210
  ;;
esac