wuxk1 / Megatron-LM · Commits

Commit e7da80dd, authored Dec 16, 2024 by wxj

Merge branch 'main' into 'main'

Update torchprof support. See merge request !6

Parents: 340ddce9, fef9c0d9
Showing 3 changed files with 140 additions and 82 deletions (+140 -82):

Llama_pretraining.sh             +85 -45
megatron/training/arguments.py    +4 -2
megatron/training/training.py    +51 -35
Llama_pretraining.sh
@@ -19,16 +19,34 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export GLOG_minloglevel=3   # print error-level NCCL logs only
source /opt/dtk/env.sh
# TE's GEMM calls need the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# updated rocblas
export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
# # add synchronization for profiler collection
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0

CHECKPOINT_PATH=./tmp_7b          #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b    #$2 #<Specify path>
-DATA_PATH="/datasets/oscar-1GB-llama_text_document"  #<Specify path and file prefix>_text_document
+DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document"  #<Specify path and file prefix>_text_document

# GPT_MODEL_ARGS=(
#     --num-layers 32
#     --hidden-size 5120
#     --ffn-hidden-size 13824
#     --num-attention-heads 40
#     --seq-length 4096 #4096
#     --max-position-embeddings 32768 #4096
#     --num-query-groups 40
#     --group-query-attention
# )

GPT_MODEL_ARGS=(
-    --num-layers 36
+    --num-layers 6
    --hidden-size 4096
    --ffn-hidden-size 11008
    --num-attention-heads 32
@@ -36,17 +54,18 @@ GPT_MODEL_ARGS=(
    --max-position-embeddings 4096
)

-# export NVTE_FLASH_ATTN=1         # use autlass
-# export NVTE_FLASH_ATTN_TRITON=1  # use triton_fa
+# export NVTE_FLASH_ATTN=1         # use cutlass
+export NVTE_FLASH_ATTN_TRITON=1    # use triton_fa

# --transformer-impl transformer_engine
# --use-mcore-models
# --transformer-impl local
# --use-legacy-models
TRAINING_ARGS=(
-    --transformer-impl local
-    --use-legacy-models
+    --transformer-impl transformer_engine
+    --use-mcore-models
    --micro-batch-size 1
-    --global-batch-size 60  #240 #512 #64
-    --train-iters 100
+    --global-batch-size 6   #240 #60 #512 #64
+    --train-iters 10
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
@@ -54,24 +73,32 @@ TRAINING_ARGS=(
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
-    --ckpt-format torch
    --disable-bias-linear
-    --overlap-grad-reduce
    --attention-dropout 0
    --hidden-dropout 0
-    --ddp-average-in-collective
-    --recompute-granularity full
-    --recompute-num-layers 5
-    --recompute-method block
    --no-gradient-accumulation-fusion
    --add-qkv-bias
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
+    --ckpt-format torch
+    --ddp-average-in-collective
+    --recompute-granularity full
+    --recompute-num-layers 5 #0 #
+    --recompute-method block
+    --overlap-grad-reduce
+    --use-flash-attn-triton
)
# --use-flash-attn-ck
# --add-qkv-bias # qwen
# --ckpt-format torch
# --ddp-average-in-collective
# --recompute-granularity full
# --recompute-num-layers 5
# --recompute-method block
# --overlap-grad-reduce
# --use-flash-attn-cutlass
# --use-flash-attn-triton

MODEL_PARALLEL_ARGS=(
@@ -88,7 +115,7 @@ DATA_ARGS=(
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /path/to/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
@@ -102,6 +129,15 @@ EVAL_AND_LOGGING_ARGS=(
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

+PROFILE_ARGS=(
+    --profile
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-pytorch-profiler
+    --profile-ranks 0 3
+    --profile-dir prof_data
+)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
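For orientation, the step indices in PROFILE_ARGS feed the torch.profiler schedule built in megatron/training/training.py further down in this commit. A minimal sketch of that arithmetic, using the values above (the variable names here are only for illustration):

```python
# How --profile-step-start 4 / --profile-step-end 5 translate into a
# torch.profiler schedule (mirrors the expressions in the training.py hunk below).
profile_step_start = 4
profile_step_end = 5

wait = max(profile_step_start - 1, 0)            # 3 iterations skipped
warmup = 1 if profile_step_start > 0 else 0      # 1 warmup iteration
active = profile_step_end - profile_step_start   # 1 iteration actively traced

print(wait, warmup, active)  # -> 3 1 1
```

With repeat=1 this means roughly one training iteration around step 4/5 ends up in the exported trace.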
@@ -122,47 +158,51 @@ APP="python -u pretrain_gpt.py \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
+    ${PROFILE_ARGS[@]} \
    "

export HIP_VISIBLE_DEVICES=4,5,6,7   # 0,1,2,3 # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    ${APP}
    # numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # [4])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [5])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [6])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # # numactl --cpunodebind=0 --membind=0 ${APP}
    # ;;
    # [7])
    # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
esac
megatron/training/arguments.py
@@ -643,7 +643,7 @@ def validate_args(args, defaults={}):
            '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'

    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_ck or args.use_flash_attn_triton
+    args.use_flash_attn = args.use_flash_attn_cutlass or args.use_flash_attn_triton

    # Legacy RoPE arguments
    if args.use_rotary_position_embeddings:
@@ -1265,6 +1265,8 @@ def _add_training_args(parser):
                       dest='use_pytorch_profiler')
    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                       help='Global ranks to profile.')
+    group.add_argument('--profile-dir', type=str, default="./",
+                       help='profile dir to save.')
    group.add_argument('--record-memory-history', action="store_true", default=False,
                       help='Record memory history in last rank.')
    group.add_argument('--memory-snapshot-path', type=str, default="snapshot.pickle",
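As a quick illustration, a throwaway parser (not the project's; only the two option definitions are copied from the diff above) shows how these flags parse the values used in PROFILE_ARGS in Llama_pretraining.sh:

```python
import argparse

# Stand-in parser: only --profile-ranks and --profile-dir mirror the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                    help='Global ranks to profile.')
parser.add_argument('--profile-dir', type=str, default="./",
                    help='Directory for profiler output.')

args = parser.parse_args(['--profile-ranks', '0', '3', '--profile-dir', 'prof_data'])
print(args.profile_ranks, args.profile_dir)  # -> [0, 3] prof_data
```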
@@ -1358,7 +1360,7 @@ def _add_training_args(parser):
    group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                       help='Enabled fusion of cross entropy loss calculation.',
                       dest='cross_entropy_loss_fusion')
-    group.add_argument('--use-flash-attn-ck', action='store_true',
+    group.add_argument('--use-flash-attn-cutlass', action='store_true',
                       help='use FlashAttention implementation of attention. '
                       'https://arxiv.org/abs/2205.14135')
    group.add_argument('--use-flash-attn-triton', action='store_true',
megatron/training/training.py
@@ -135,6 +135,13 @@ def num_floating_point_operations(args, batch_size):
    # - 2x: A GEMM of a m*n tensor with a n*k tensor requires 2mnk floating-point operations.
    expansion_factor = 3 * 2 * 2
+    # print(f"batch_size: {batch_size}, \
+    #     query_projection_to_hidden_size_ratio: {query_projection_to_hidden_size_ratio}, \
+    #     num_experts_routed_to: {num_experts_routed_to}, \
+    #     gated_linear_multiplier: {gated_linear_multiplier}, \
+    #     shared_expert_ffn_hidden_size: {shared_expert_ffn_hidden_size}, \
+    #     gated_linear_multiplier: {gated_linear_multiplier}, \
+    #     ")

    return (
        expansion_factor
        * batch_size
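For context on the 2mnk rule cited in the comment above, a tiny worked example; the shapes are illustrative only, borrowed from GPT_MODEL_ARGS in Llama_pretraining.sh:

```python
# A (m x n) @ (n x k) GEMM does n multiplies and n adds for each of the
# m*k outputs, i.e. 2*m*n*k floating-point operations in total.
m, n, k = 4096, 4096, 11008   # e.g. token count x hidden_size x ffn_hidden_size
flops = 2 * m * n * k
print(f"{flops:.3e}")         # -> 3.694e+11
```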
@@ -1214,8 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteratio
        if args.use_pytorch_profiler:
            assert prof is not None
            prof.stop()
        else:
            torch.cuda.cudart().cudaProfilerStop()
        print_rank_0(f"prof stop!")

    # Manual garbage collection.
    if args.manual_gc:
@@ -1401,15 +1408,27 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
    prof = None
    if args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_pytorch_profiler:
+        def trace_handler(p):
+            from pathlib import Path
+            Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
+            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+            p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))
        prof = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
            schedule=torch.profiler.schedule(
                wait=max(args.profile_step_start - 1, 0),
                warmup=1 if args.profile_step_start > 0 else 0,
                active=args.profile_step_end - args.profile_step_start,
                repeat=1),
-            on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir),
-            record_shapes=True,
-            with_stack=True)
+            # record_shapes=True,
+            # with_stack=True,
+            on_trace_ready=trace_handler,
+        )
        prof.start()

    # Run training iterations till done.
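A self-contained sketch of the same torch.profiler pattern introduced above: a custom on_trace_ready handler that prints a summary table and exports a Chrome trace. The toy model, step count, and output directory are placeholders, not part of the commit:

```python
import os
import torch

def trace_handler(p):
    # Mirrors the handler in the diff: dump a summary table and a Chrome trace.
    os.makedirs("prof_data", exist_ok=True)
    print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))
    p.export_chrome_trace(f"prof_data/trace_step{p.step_num}.json")

model = torch.nn.Linear(512, 512)
opt = torch.optim.SGD(model.parameters(), lr=1e-3)

prof = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],  # add CUDA when a GPU is present
    schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=1),
    on_trace_ready=trace_handler,
)
prof.start()
for step in range(6):          # stands in for the training loop
    loss = model(torch.randn(8, 512)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()
    prof.step()                # advances the profiler schedule each iteration
prof.stop()
```

The handler fires once the active window completes, i.e. after the single traced iteration under this schedule.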
@@ -1417,9 +1436,6 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
        if args.profile and torch.distributed.get_rank() in args.profile_ranks:
            if args.use_pytorch_profiler:
                prof.step()
            elif iteration == args.profile_step_start:
                torch.cuda.cudart().cudaProfilerStart()
                torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()

        maybe_finalize_async_save(blocking=False)