#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
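# Added sanity check (a sketch, not part of the original flow): fail fast if the
# topology file exported above is missing, since RCCL otherwise falls back to
# auto-detected topology. The commented rccl-tests line is optional and its build
# path is assumed from the topo-file location above.
if [ ! -f "${NCCL_TOPO_FILE}" ]; then
  echo "NCCL_TOPO_FILE not found: ${NCCL_TOPO_FILE}" >&2
  exit 1
fi
# /public/home/xingjl/dependency/rccl-tests-0204/build/all_reduce_perf -b 8M -e 256M -f 2 -g 8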
export GLOG_minloglevel=3 # only emit error-level NCCL logs
source /opt/dtk/env.sh
# Load the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH 
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH 
export LD_LIBRARY_PATH=/public/home/xingjl/dependency/hipblaslt-install-0227/lib:$LD_LIBRARY_PATH 
# Use an updated rocBLAS build
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/xingjl/dependency/rocblas-install-0224/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
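# Optional check (a sketch): confirm the patched libraries exported above are the
# ones the dynamic loader will resolve first on LD_LIBRARY_PATH.
# ls /public/home/xingjl/dependency/hipblaslt-install-0227/lib/libhipblaslt.so* \
#    /public/home/xingjl/dependency/rocblas-install-0224/lib/librocblas.so*
# echo "${LD_LIBRARY_PATH}" | tr ':' '\n' | head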
# Torch: collapse multi-stream communication onto a single (compute) stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1 
# Add synchronization during profiler capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# Capture rocBLAS GEMM sizes
# export ROCBLAS_LAYER=3
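# If the layer above is enabled, the captured GEMM calls can also be written to a
# file instead of stderr (a sketch; the log path is arbitrary):
# export ROCBLAS_LOG_BENCH_PATH=./rocblas_bench.log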
# Capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# Enlarge the compile cache
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b # 
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b  #$2 #<Specify path>
DATA_PATH="/public/home/xingjl/megatron-lm/llama2_dataset/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
    --num-layers 32
    --hidden-size 4096
    --ffn-hidden-size 11008 
    --num-attention-heads 32
    --max-position-embeddings 4096
    --normalization RMSNorm 
    --position-embedding-type rope # none # 
    --untie-embeddings-and-output-weights # keep separate embedding and output weight matrices for more flexibility
)
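# Rough parameter-count check for the dimensions above (a sketch; assumes the
# Llama-2 vocab size of 32000, which comes from the tokenizer rather than these args;
# embeddings are counted twice because of --untie-embeddings-and-output-weights).
H=4096; FFN=11008; L=32; V=32000
PER_LAYER=$(( 4*H*H + 3*H*FFN ))                 # attention (QKVO) + SwiGLU MLP weights
echo "approx params: $(( L*PER_LAYER + 2*V*H ))" # ~6.7e9, i.e. the "7B" model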
export NVTE_FLASH_ATTN=1 # CUTLASS flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # Triton flash-attention path
# --transformer-impl transformer_engine # use this pair of flags for the mcore path
    # --use-mcore-models
    # --transformer-impl local # use this pair of flags for the legacy path
    # --use-legacy-models 
TRAINING_ARGS=(
    --transformer-impl local # use this pair of flags for the legacy path
    --use-legacy-models 
    --micro-batch-size 1
    --global-batch-size 64 #32 #240 #60 #512 #64
    --train-iters 50
    --weight-decay 0.1 
    --adam-beta1 0.9 
    --adam-beta2 0.95 
    --init-method-std 0.006 
    --clip-grad 1.0 
    --bf16
    # --fp16 # enabling fp16 requires setting --loss-scale
    # --loss-scale 1024
    --use-distributed-optimizer 
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5 
    --lr-decay-style cosine 
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # average gradients directly inside the DP collective instead of summing first and dividing afterwards
    # --recompute-granularity full # activation recomputation: lower memory at the cost of extra compute
    # --recompute-num-layers 5 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce with the backward pass
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not adapted yet
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
    --use-flash-attn-cutlass
)
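# Rough throughput bookkeeping for the settings above (a sketch; seq-length 4096
# mirrors DATA_ARGS below): tokens per iteration = global-batch-size * seq-length.
echo "tokens/iter: $(( 64 * 4096 ))  total tokens over 50 iters: $(( 50 * 64 * 4096 ))"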
# Environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 2
    # --context-parallel-size 2
    # --num-layers-per-virtual-pipeline-stage 4
    # --microbatch-group-size-per-virtual-pipeline-stage 1
    # --no-overlap-p2p-communication # when enabled, pipeline p2p communication is no longer overlapped with compute
)
DATA_ARGS=(
    --data-path $DATA_PATH 
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /public/home/xingjl/megatron-lm/llama2_dataset/tokenizer.model
    # --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 1000 
    --eval-interval 1000 
    #--save $SAVE_PATH 
    #--load $SAVE_PATH 
    --eval-iters 3
    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
)
# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )
PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)
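# Added consistency check (a sketch; the constants mirror the argument arrays above).
# Megatron requires world-size % (TP * PP) == 0 and
# global-batch-size % (micro-batch-size * data-parallel-size) == 0.
TP=1; PP=2; MBS=1; GBS=64
if (( WORLD_SIZE % (TP * PP) != 0 )); then
  echo "world size ${WORLD_SIZE} is not divisible by TP*PP=$(( TP * PP ))" >&2; exit 1
fi
DP=$(( WORLD_SIZE / (TP * PP) ))
if (( GBS % (MBS * DP) != 0 )); then
  echo "global batch ${GBS} is not divisible by micro-batch*DP=$(( MBS * DP ))" >&2; exit 1
fi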
APP="python -u pretrain_gpt.py \
        ${GPT_MODEL_ARGS[@]} \
        ${TRAINING_ARGS[@]} \
        ${MODEL_PARALLEL_ARGS[@]} \
        ${DATA_ARGS[@]} \
        ${EVAL_AND_LOGGING_ARGS[@]} \
        ${DISTRIBUTED_ARGS[@]} \
"
# Enable profiling by appending the line below to APP
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 #  # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #  # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=3 --membind=3 ${APP}
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[4])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=4 --membind=4 ${APP}
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[5])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=5 --membind=5 ${APP}
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[6])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=6 --membind=6 ${APP}
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[7])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=7 --membind=7 ${APP}
  # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
esac
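# Example launch (a sketch; node/slot counts, the hostfile and the master address
# are placeholders -- the first positional argument becomes DIST_URL above):
# mpirun -np 16 --hostfile ./hostfile --bind-to none \
#        -x PATH -x LD_LIBRARY_PATH ./Llama_pretraining.sh <master-node-ip>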