wuxk1 / Megatron-LM · Commits

Commit 52610942, authored Dec 17, 2024 by silencealiang (parent a65607d4)

Commit message: Add profiling parameters (添加prof参数)

Showing 5 changed files with 81 additions and 18 deletions (+81, −18):
- Llama_pretraining.sh (+2, −2)
- README.md (+19, −5)
- megatron/training/arguments.py (+3, −0)
- megatron/training/training.py (+14, −5)
- train_mixtral_8x7B_1nodes.sh (+43, −6)
Llama_pretraining.sh

@@ -32,7 +32,7 @@ export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
 CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
-DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/public/home/wangxj3/Downloads/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
 # GPT_MODEL_ARGS=(
 #     --num-layers 32

@@ -115,7 +115,7 @@ DATA_ARGS=(
     --normalization RMSNorm
     --no-position-embedding
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj3/Downloads/model_weights/llama2_7b_hf/tokenizer.model
 )
 EVAL_AND_LOGGING_ARGS=(
README.md

@@ -19,8 +19,18 @@
 2024.12.16 Added support for the torch profiler.
 Usage: add the arguments below to the launch script to collect the corresponding profiling data.
+```bash
+# collect a torch profile
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=torch
+# collect a hip profile
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=hip
+```
 ```bash
-PROFILE_ARGS=(
+# profiling-related arguments
+TORCH_PROFIE_ARGS=(
     --profile                 # enable profiling
     --profile-step-start 4    # skip the first 3 iters, warm up on iter 4
     --profile-step-end 5      # profile iter 5

@@ -28,10 +38,14 @@ PROFILE_ARGS=(
     --profile-ranks 0 3       # profile global ranks 0 and 3
     --profile-dir ./prof_data # directory for the profiler output files
 )
-APP="... \
-    ${PROFILE_ARGS[@]} \
-    "
-${APP}
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 ```
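The README's `--profile-step-start 4` / `--profile-step-end 5` pair maps onto torch.profiler's skip/warmup/active schedule. Below is a minimal standalone sketch of that schedule outside Megatron, assuming a toy model and a `./prof_data` output directory (both illustrative, not from the repo); it skips the first 3 steps, warms up on step 4, records step 5, and writes a Chrome trace much like the trace_handler added in training.py.

```python
# Minimal sketch (illustrative, not from the repo): reproduce the README's
# profile-step-start=4 / profile-step-end=5 semantics with torch.profiler.
import torch
from pathlib import Path
from torch.profiler import ProfilerActivity, profile, schedule

Path("./prof_data").mkdir(parents=True, exist_ok=True)

model = torch.nn.Linear(1024, 1024)             # toy stand-in for the real model
opt = torch.optim.SGD(model.parameters(), lr=1e-3)

def trace_handler(p):
    # Mirrors the trace_handler added in training.py: print a summary, export a trace.
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
    p.export_chrome_trace(f"./prof_data/trace_step{p.step_num}.json")

prof = profile(
    activities=[ProfilerActivity.CPU],           # add ProfilerActivity.CUDA on GPU runs
    schedule=schedule(skip_first=3, wait=0, warmup=1, active=1, repeat=1),
    on_trace_ready=trace_handler,
)
prof.start()
for step in range(8):                            # iters 1-3 skipped, 4 warms up, 5 is recorded
    loss = model(torch.randn(8, 1024)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()
    prof.step()                                  # advance the profiler schedule each iteration
prof.stop()
```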
megatron/training/arguments.py

@@ -1263,6 +1263,9 @@ def _add_training_args(parser):
                        help='Use the built-in pytorch profiler. '
                        'Useful if you wish to view profiles in tensorboard.',
                        dest='use_pytorch_profiler')
+    group.add_argument('--use-hip-profiler', action='store_true',
+                       help='Use HIP PROFILER',
+                       dest='use_hip_profiler')
     group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                        help='Global ranks to profile.')
     group.add_argument('--profile-dir', type=str, default="./",
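For quick reference, here is a self-contained argparse sketch of the profiling flags involved in this commit. The `--use-hip-profiler`, `--profile-ranks`, and `--profile-dir` definitions are copied from the diff; the remaining options and their defaults are illustrative stand-ins for the rest of `_add_training_args`.

```python
# Standalone argparse sketch of the profiling options involved in this commit.
# Flags marked "from the diff" are reproduced verbatim; the others are
# illustrative stand-ins for the rest of Megatron's _add_training_args.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='training')
group.add_argument('--profile', action='store_true',
                   help='Enable profiling on the selected ranks.')
group.add_argument('--profile-step-start', type=int, default=10)
group.add_argument('--profile-step-end', type=int, default=12)
group.add_argument('--use-pytorch-profiler', action='store_true',
                   help='Use the built-in pytorch profiler. '
                        'Useful if you wish to view profiles in tensorboard.',
                   dest='use_pytorch_profiler')
group.add_argument('--use-hip-profiler', action='store_true',        # from the diff
                   help='Use HIP PROFILER', dest='use_hip_profiler')
group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                   help='Global ranks to profile.')                  # from the diff
group.add_argument('--profile-dir', type=str, default="./")          # from the diff

args = parser.parse_args(
    '--profile --use-hip-profiler --profile-ranks 0 1 2 3 '
    '--profile-step-start 4 --profile-step-end 5'.split())
print(args.use_hip_profiler, args.profile_ranks)   # True [0, 1, 2, 3]
```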
megatron/training/training.py

@@ -1221,7 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteratio
         if args.use_pytorch_profiler:
             assert prof is not None
             prof.stop()
+            print_rank_0(f"prof stop!")
         else:
             torch.cuda.cudart().cudaProfilerStop()

     # Manual garbage collection.

@@ -1412,7 +1413,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         def trace_handler(p):
             from pathlib import Path
             Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
-            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+            if args.rank in [0]:
+                print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
             p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(
                 path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))

@@ -1426,16 +1428,23 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                 warmup=1 if args.profile_step_start > 0 else 0,
                 active=args.profile_step_end - args.profile_step_start,
                 repeat=1),
-            # record_shapes=True,
-            on_trace_ready=trace_handler)
+            # with_stack=True,
+            on_trace_ready=trace_handler,)
         prof.start()
+    elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
+        import ctypes
+        roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")

     # Run training iterations till done.
     while iteration < args.train_iters:
         if args.profile and torch.distributed.get_rank() in args.profile_ranks:
             if args.use_pytorch_profiler:
                 prof.step()
+            elif args.use_hip_profiler:
+                if iteration == args.profile_step_start:
+                    roctracer.roctracer_start()
+                if iteration == args.profile_step_end:
+                    roctracer.roctracer_stop()
             elif iteration == args.profile_step_start:
                 torch.cuda.cudart().cudaProfilerStart()
                 torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()

         maybe_finalize_async_save(blocking=False)
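The new use_hip_profiler branch drives ROCm's roctracer directly through ctypes instead of the CUDA profiler hooks. Below is a minimal sketch of that start/stop pattern, assuming the DTK library path and the roctracer_start/roctracer_stop entry points shown in the diff; the OSError fallback and the dummy training loop are illustrative additions.

```python
# Sketch of the roctracer start/stop pattern used by the new use_hip_profiler branch.
# The library path and the two entry points come from the diff; the OSError
# fallback and the dummy training loop are illustrative additions.
import ctypes

ROCTRACER_PATH = "/opt/dtk/roctracer/lib/libroctracer64.so"

try:
    roctracer = ctypes.cdll.LoadLibrary(ROCTRACER_PATH)
except OSError:
    roctracer = None                       # not on a DTK/ROCm machine; skip profiling

profile_step_start, profile_step_end = 4, 5

for iteration in range(1, 8):              # stand-in for the training loop
    if roctracer is not None:
        if iteration == profile_step_start:
            roctracer.roctracer_start()    # begin collecting HIP API/kernel records
        if iteration == profile_step_end:
            roctracer.roctracer_stop()     # stop collection after the profiled iteration
    # ... one training step would run here ...
```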
train_mixtral_8x7B_1nodes.sh

 #!/bin/bash
+for para in $*
+do
+    if [[ $para == --profiling* ]]; then
+        profiling=${para#*=}
+        export GPU_FLUSH_ON_EXECUTION=1
+        export HIP_DIRECT_DISPATCH=0
+    fi
+done
 source /opt/dtk/env.sh
 # Runs Mixtral 8x7B model
 export HIP_DIRECT_DISPATCH=0

@@ -17,8 +26,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 #export NCCL_SOCKET_IFNAME=enp145s0f0
 export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
+export GLOG_minloglevel=3
 export LD_LIBRARY_PATH=/opt/hipblaslt-install/lib/:$LD_LIBRARY_PATH
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

@@ -26,8 +38,8 @@ DIST_URL=${1}
 DIST_PORT=25900
 CHECKPOINT_PATH=./CKPT
-TOKENIZER_MODEL=../Mixtral8x7B/mixtral_dataset/tokenizer.model
+TOKENIZER_MODEL=../../megatron-lm/mixtral_dataset/tokenizer.model
-DATA_PATH=../Mixtral8x7B/mixtral_dataset/my-mixtral_text_document
+DATA_PATH=../../megatron-lm/mixtral_dataset/my-mixtral_text_document
 DISTRIBUTED_ARGS=(
     --rank ${RANK}

@@ -41,7 +53,7 @@ MODEL_ARGS=(
     --disable-bias-linear
     --seq-length 4096
     --max-position-embeddings 32768
-    --num-layers 2
+    --num-layers 8 #16
     --hidden-size 1024
     --ffn-hidden-size 14336
     --num-attention-heads 32

@@ -65,8 +77,6 @@ MOE_ARGS=(
     --moe-router-load-balancing-type aux_loss
     --moe-aux-loss-coeff 1e-2
     --moe-token-dispatcher-type alltoall
-    --overlap-param-gather
-    --overlap-grad-reduce
     --moe-expert-capacity-factor 0.5
     --moe-pad-expert-input-to-capacity
     --moe-grouped-gemm

@@ -81,7 +91,7 @@ DATA_ARGS=(
 TRAINING_ARGS=(
     --micro-batch-size 1
-    --global-batch-size 16
+    --global-batch-size 128 #256
     --lr 1e-4
     --train-iters 20
     --lr-decay-iters 320000

@@ -91,6 +101,25 @@ TRAINING_ARGS=(
     --lr-warmup-iters 500
     --clip-grad 1.0
     --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
 )
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_data
+    --use-pytorch-profiler
+)
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 MODEL_PARALLEL_ARGS=(

@@ -132,6 +161,14 @@ APP="python3 -u pretrain_gpt.py \
     ${LOGGING_ARGS[@]} \
     "
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
 #for hygon cpu
 case ${LOCAL_RANK} in
 [0])
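With --profiling=torch, the launch script points --profile-dir at torch_prof_data, so each profiled rank writes trace_rank{rank}_step{step}.json there. Below is a rough post-processing sketch, assuming the standard Chrome trace layout that torch.profiler's export_chrome_trace produces ("traceEvents" entries with "name" and "dur" in microseconds); adjust if your files differ.

```python
# Rough post-processing sketch for the traces written to --profile-dir
# (torch_prof_data/trace_rank*_step*.json in the launch script above).
# Assumes the standard Chrome trace layout; the file pattern and the
# top-15 cutoff are illustrative choices.
import glob
import json
from collections import defaultdict

totals = defaultdict(float)
for path in glob.glob("torch_prof_data/trace_rank*_step*.json"):
    with open(path) as f:
        trace = json.load(f)
    for ev in trace.get("traceEvents", []):
        if ev.get("ph") == "X":                  # complete events carry a duration
            totals[ev.get("name", "?")] += ev.get("dur", 0.0)

# Print the 15 most expensive op names across all collected ranks/steps.
for name, dur_us in sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:15]:
    print(f"{dur_us / 1e3:10.2f} ms  {name}")
```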