evt_fugx1 / dcu_megatron · Commit e2157771

Authored May 13, 2025 by wangxj

Add a reproduce feature: when enabled, the training loss can be reproduced exactly across runs, at a small cost in training performance.

Parent: 59cb262a
Showing 4 changed files with 69 additions and 8 deletions:
- dcu_megatron/adaptor/megatron_adaptor.py (+3, -0)
- dcu_megatron/training/arguments.py (+19, -0)
- dcu_megatron/training/initialize.py (+29, -1)
- examples/llama/Llama2_7b.sh (+18, -7)
dcu_megatron/adaptor/megatron_adaptor.py
@@ -201,6 +201,7 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..training.initialize import _initialize_distributed
         from ..training.initialize import _compile_dependencies
         from ..training.training import train
+        from ..training.initialize import _set_random_seed

         MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
                                     build_tokenizer)
@@ -208,6 +209,8 @@ class CoreAdaptation(MegatronAdaptationABC):
                                     _initialize_distributed)
         MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                     _compile_dependencies)
+        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
+                                    _set_random_seed)
         # training.train
         MegatronAdaptation.register('megatron.training.training.train',
         ...
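The diff does not show how MegatronAdaptation applies these registrations, so as orientation only: a register-style patcher typically queues dotted import paths and later overwrites the target attributes. The sketch below is hypothetical; the class name, method names, and apply step are assumptions, not the adaptor's actual code.

```python
# Hypothetical sketch of a register-style patcher; the real MegatronAdaptation
# class is not shown in this diff, so all names here are illustrative.
import importlib


class PatchRegistry:
    def __init__(self):
        self._patches = {}

    def register(self, target_path, replacement):
        """Queue `replacement` for the attribute at dotted path `target_path`."""
        self._patches[target_path] = replacement

    def apply(self):
        """Import each target module and overwrite the patched attribute."""
        for target_path, replacement in self._patches.items():
            module_path, attr = target_path.rsplit('.', 1)
            module = importlib.import_module(module_path)
            setattr(module, attr, replacement)
```

Under that reading, registering 'megatron.training.initialize._set_random_seed' makes upstream Megatron call the adaptor's own seeding routine, defined in dcu_megatron/training/initialize.py below.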
dcu_megatron/training/arguments.py
@@ -56,6 +56,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_training_args(parser)
     parser = _add_extra_training_args(parser)
     parser = _add_initialization_args(parser)
+    parser = _add_extra_initialization_args(parser)
     parser = _add_learning_rate_args(parser)
     parser = _add_checkpointing_args(parser)
     parser = _add_mixed_precision_args(parser)
@@ -63,6 +64,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_extra_distributed_args(parser)
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
+    parser = _add_extra_data_args(parser)
     parser = _add_tokenizer_args(parser)
     parser = _add_extra_tokenizer_args(parser)
     parser = _add_autoresume_args(parser)
@@ -140,7 +142,24 @@ def _add_extra_training_args(parser):
     return parser


+def _add_extra_initialization_args(parser):
+    group = parser.add_argument_group(title='extra initialization args')
+
+    group.add_argument('--reproduce', action='store_true',
+                       help='reproduce train loss, need set --seed > 0.')
+    return parser
+
+
+def _add_extra_data_args(parser):
+    # remove the original argument
+    remove_original_params(parser, ["num_workers"])
+
+    # redefine it
+    group = parser.add_argument_group(title='extra data args')
+    group.add_argument('--num-workers', type=int, default=0,
+                       help="Dataloader number of workers.")
+    return parser


 def _add_extra_tokenizer_args(parser):
     # remove the original argument
     remove_original_params(parser, ["tokenizer_type"])
     ...
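remove_original_params is referenced here but is not part of this diff. For orientation only: dropping an already-registered argparse option so it can be redefined usually means reaching into argparse internals, roughly as in the hedged sketch below (not the repository's actual helper).

```python
# Hedged sketch of remove_original_params; the real helper is not shown in
# this diff. Relies on argparse internals (_actions, _option_string_actions,
# _remove_action), which are private but stable in practice.
def remove_original_params(parser, dest_names):
    for action in list(parser._actions):
        if action.dest in dest_names:
            # unregister the option strings (e.g. '--num-workers') ...
            for option_string in action.option_strings:
                parser._option_string_actions.pop(option_string, None)
            # ... then drop the action itself so it can be re-added
            parser._remove_action(action)
```

This is what lets --num-workers be re-declared with default=0: with zero dataloader workers, batches are produced in the main process, which removes one common source of run-to-run nondeterminism.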
dcu_megatron/training/initialize.py
"""Megatron initialization."""
import
random
import
time
import
numpy
as
np
import
torch
from
datetime
import
timedelta
from
megatron.training
import
get_args
from
megatron.core
import
mpu
from
megatron.core
import
mpu
,
tensor_parallel
def
_initialize_distributed
(
get_embedding_ranks
,
get_position_embedding_ranks
):
...
...
@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
f
"
{
mpu
.
get_pipeline_model_parallel_world_size
()
}
"
)
def
_set_random_seed
(
seed_
,
data_parallel_random_init
=
False
,
te_rng_tracker
=
False
,
inference_rng_tracker
=
False
):
"""Set random seed for reproducability."""
args
=
get_args
()
if
seed_
is
not
None
and
seed_
>
0
:
# Ensure that different pipeline MP stages get different seeds.
seed
=
seed_
+
(
100
*
mpu
.
get_pipeline_model_parallel_rank
())
# Ensure different data parallel ranks get different seeds
if
data_parallel_random_init
:
seed
=
seed
+
(
10
*
mpu
.
get_data_parallel_rank
())
# 设置cpu随机种子
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
if
torch
.
cuda
.
device_count
()
>
0
:
# 设置gpu随机种子
tensor_parallel
.
model_parallel_cuda_manual_seed
(
seed
,
te_rng_tracker
,
inference_rng_tracker
)
if
args
.
reproduce
:
assert
(
args
.
attention_dropout
>
0
)
is
False
,
f
"To utilize the reproduction function, args.attention_dropout =
{
args
.
attention_dropout
}
must be set to 0."
assert
(
args
.
hidden_dropout
>
0
)
is
False
,
f
"To utilize the reproduction function, args.hidden_dropout =
{
args
.
hidden_dropout
}
must be set to 0."
torch
.
backends
.
cudnn
.
deterministic
=
True
# 设置cudnn后端为确定性算法
torch
.
backends
.
cudnn
.
benchmark
=
False
# 固定卷积算法
torch
.
use_deterministic_algorithms
(
True
)
# 使用torch的deterministic算子 避免不确定性
else
:
raise
ValueError
(
"Seed ({}) should be a positive integer."
.
format
(
seed_
))
def
_compile_dependencies
():
...
...
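To see the reproduce switches in isolation, here is a minimal, standalone distillation of what the new _set_random_seed enforces when args.reproduce is set; the Megatron pipeline/data-parallel seed offsets are omitted, and a CUDA/ROCm PyTorch build is assumed.

```python
# Standalone distillation of the --reproduce determinism switches above;
# not the adaptor's exact code (parallel-state seed offsets are omitted).
import random

import numpy as np
import torch


def enable_reproducibility(seed: int):
    assert seed > 0, "reproduce mode requires --seed > 0"
    random.seed(seed)                           # Python RNG
    np.random.seed(seed)                        # NumPy RNG
    torch.manual_seed(seed)                     # torch CPU and GPU RNGs
    torch.backends.cudnn.deterministic = True   # deterministic cuDNN/MIOpen kernels
    torch.backends.cudnn.benchmark = False      # no convolution algorithm auto-tuning
    torch.use_deterministic_algorithms(True)    # raise on nondeterministic ops
```

Note that on CUDA builds, torch.use_deterministic_algorithms(True) additionally requires CUBLAS_WORKSPACE_CONFIG=:4096:8 (or :16:8) in the environment; on this ROCm/DCU stack, the example script below instead sets the analogous MIOPEN_* and ROCBLAS_* determinism variables when --reproduce is passed.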
examples/llama/Llama2_7b.sh
 #!/bin/bash
 # set -eux

-for para in $*
+for para in $@
 do
     if [[ $para == --profiling* ]]; then
         profiling=${para#*=}
+    elif [[ $para == --reproduce ]]; then
+        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1  # enable MIOpen's deterministic algorithms
+        export ROCBLAS_ATOMICS_MOD=0                     # disable rocBLAS atomic operations
+        # disable the MIOpen convolution algorithms that use atomics; keep only GEMM
+        export MIOPEN_DEBUG_CONV_FFT=0
+        export MIOPEN_DEBUG_CONV_DIRECT=0
+        export MIOPEN_DEBUG_CONV_GEMM=1
+        export MIOPEN_DEBUG_CONV_WINOGRAD=0
+        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
     fi
 done
 ...
@@ -31,15 +40,15 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
 export GLOG_minloglevel=3  # print only error-level NCCL logs
 source /opt/dtk/env.sh

 # hipblaslt library
-export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0507-1/lib:$LD_LIBRARY_PATH
 # rocblas
-export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH

 # torch: collapse multiple streams into a single stream
 export ALLREDUCE_STREAM_WITH_COMPUTE=1
 ...
@@ -51,7 +60,7 @@ export cache_size_limit=64
 # CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
 SAVE_PATH=./tmp_7b
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
-DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document

 GPT_MODEL_ARGS=(
     --num-layers 32
 ...
@@ -60,7 +69,7 @@ GPT_MODEL_ARGS=(
     --num-attention-heads 32
     --max-position-embeddings 4096
-    --normalization RMSNorm # LightopRMSNorm
+    --normalization LightopRMSNorm # RMSNorm
     --position-embedding-type rope # none #
     --untie-embeddings-and-output-weights # untie the embedding and output weights for more flexibility
 )
 ...
@@ -104,6 +113,8 @@ TRAINING_ARGS=(
     # --tp-comm-overlap           # overlap tensor-parallel communication with GEMM; optimization not yet adapted
     # --tp-comm-overlap-rs-dgrad  # overlap reduce-scatter with the dgrad GEMM
     --use-flash-attn
+    # --reproduce
+    --num-workers 2
 )

 # environment variables for using torch flash attention
 # export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
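Note that --reproduce is left commented out in TRAINING_ARGS: full loss reproduction needs both the script-level flag (which sets the MIOpen/rocBLAS variables in the argument loop at the top of this script) and the Python-side --reproduce argument, together with --seed > 0 and both dropout values at 0. Restricting MIOpen to the GEMM path and disabling atomics is presumably the main source of the small training-performance cost the commit message mentions.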
@@ -130,7 +141,7 @@ DATA_ARGS=(
     --seq-length 4096 #4096
     --split 949,50,1
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
 )

 EVAL_AND_LOGGING_ARGS=(
 ...