Commit e2157771
authored May 13, 2025 by wangxj

Add a reproduce option: when enabled, the training loss can be reproduced exactly, at a small cost in training performance.

parent 59cb262a
Showing 4 changed files with 69 additions and 8 deletions.
dcu_megatron/adaptor/megatron_adaptor.py   +3  −0
dcu_megatron/training/arguments.py        +19  −0
dcu_megatron/training/initialize.py       +29  −1
examples/llama/Llama2_7b.sh               +18  −7
dcu_megatron/adaptor/megatron_adaptor.py

@@ -201,6 +201,7 @@ class CoreAdaptation(MegatronAdaptationABC):
         from ..training.initialize import _initialize_distributed
         from ..training.initialize import _compile_dependencies
         from ..training.training import train
+        from ..training.initialize import _set_random_seed

         MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
                                     build_tokenizer)
@@ -208,6 +209,8 @@ class CoreAdaptation(MegatronAdaptationABC):
                                     _initialize_distributed)
         MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                     _compile_dependencies)
+        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
+                                    _set_random_seed)
         # training.train
         MegatronAdaptation.register('megatron.training.training.train',
...
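For context: MegatronAdaptation.register is defined elsewhere in the repository and is not part of this diff. A minimal sketch of how such a monkey-patch registry can work, assuming it resolves the dotted upstream path and rebinds the attribute (names and behavior here are illustrative, not the project's actual implementation):

import importlib

_PATCHES = {}  # dotted upstream path -> replacement object

def register(target, replacement):
    # Queue `replacement` for the attribute named by `target`,
    # e.g. 'megatron.training.initialize._set_random_seed'.
    _PATCHES[target] = replacement

def apply_patches():
    # Rebind each upstream attribute to its registered replacement.
    for target, replacement in _PATCHES.items():
        module_path, attr = target.rsplit('.', 1)
        module = importlib.import_module(module_path)
        setattr(module, attr, replacement)

One caveat any such registry must handle: call sites that ran `from module import name` before the patch keep the original binding, so patching has to happen before the rest of Megatron is imported.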
dcu_megatron/training/arguments.py

@@ -56,6 +56,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_training_args(parser)
     parser = _add_extra_training_args(parser)
     parser = _add_initialization_args(parser)
+    parser = _add_extra_initialization_args(parser)
     parser = _add_learning_rate_args(parser)
     parser = _add_checkpointing_args(parser)
     parser = _add_mixed_precision_args(parser)
@@ -63,6 +64,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_extra_distributed_args(parser)
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
+    parser = _add_extra_data_args(parser)
     parser = _add_tokenizer_args(parser)
     parser = _add_extra_tokenizer_args(parser)
     parser = _add_autoresume_args(parser)
@@ -140,7 +142,24 @@ def _add_extra_training_args(parser):
     return parser


+def _add_extra_initialization_args(parser):
+    group = parser.add_argument_group(title='extra initialization args')
+    group.add_argument('--reproduce', action='store_true',
+                       help='Reproduce the training loss exactly; requires --seed > 0.')
+    return parser
+
+
+def _add_extra_data_args(parser):
+    # Remove the original argument
+    remove_original_params(parser, ["num_workers"])
+    # Redefine it
+    group = parser.add_argument_group(title='extra data args')
+    group.add_argument('--num-workers', type=int, default=0,
+                       help="Dataloader number of workers.")
+    return parser
+
+
 def _add_extra_tokenizer_args(parser):
     # Remove the original argument
     remove_original_params(parser, ["tokenizer_type"])
...
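remove_original_params is an existing helper in this module and is not shown in the diff. A plausible sketch of what it does, assuming it drops a previously registered argparse option (by dest) so the option string can be re-registered with new defaults; the traversal below relies on argparse's private internals and is an illustration, not the project's code:

import argparse

def remove_original_params(parser: argparse.ArgumentParser, dests):
    """Remove registered options whose `dest` is in `dests` so they can be re-added."""
    for action in list(parser._actions):          # private argparse API
        if action.dest in dests:
            parser._remove_action(action)         # drop the action itself
            for opt in action.option_strings:     # e.g. '--num-workers'
                parser._option_string_actions.pop(opt, None)

Redefining --num-workers with default=0 presumably favors reproducibility: with zero workers, data loading stays in the main process, which makes worker-related randomness easier to control.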
dcu_megatron/training/initialize.py

 """Megatron initialization."""
+import random
 import time

+import numpy as np
 import torch
 from datetime import timedelta

 from megatron.training import get_args
-from megatron.core import mpu
+from megatron.core import mpu, tensor_parallel


 def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
...
@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
         f"{mpu.get_pipeline_model_parallel_world_size()}"
     )


+def _set_random_seed(seed_, data_parallel_random_init=False,
+                     te_rng_tracker=False, inference_rng_tracker=False):
+    """Set random seed for reproducibility."""
+    args = get_args()
+    if seed_ is not None and seed_ > 0:
+        # Ensure that different pipeline MP stages get different seeds.
+        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
+        # Ensure different data parallel ranks get different seeds
+        if data_parallel_random_init:
+            seed = seed + (10 * mpu.get_data_parallel_rank())
+        # Seed the CPU RNGs
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.device_count() > 0:
+            # Seed the GPU RNGs
+            tensor_parallel.model_parallel_cuda_manual_seed(
+                seed, te_rng_tracker, inference_rng_tracker)
+        if args.reproduce:
+            assert args.attention_dropout == 0, (
+                f"To use the reproduce feature, args.attention_dropout = "
+                f"{args.attention_dropout} must be set to 0.")
+            assert args.hidden_dropout == 0, (
+                f"To use the reproduce feature, args.hidden_dropout = "
+                f"{args.hidden_dropout} must be set to 0.")
+            torch.backends.cudnn.deterministic = True  # force deterministic cuDNN/MIOpen algorithms
+            torch.backends.cudnn.benchmark = False     # pin the convolution algorithm choice
+            torch.use_deterministic_algorithms(True)   # use deterministic torch operators to avoid nondeterminism
+    else:
+        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))


 def _compile_dependencies():
...
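The effect of these settings can be sanity-checked outside Megatron. A self-contained sketch follows (illustrative, not part of the commit). Note that on CUDA builds, torch.use_deterministic_algorithms(True) additionally requires CUBLAS_WORKSPACE_CONFIG=:4096:8 (or :16:8) for some cuBLAS kernels; on this ROCm/DCU stack, the analogous switches are the MIOpen/rocBLAS environment variables set in the launch script below.

import random
import numpy as np
import torch

def seed_everything(seed):
    # Mirror the CPU/GPU seeding and determinism flags from _set_random_seed.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

def run_once(seed):
    seed_everything(seed)
    x = torch.randn(64, 64)
    w = torch.randn(64, 64, requires_grad=True)
    (x @ w).sum().backward()  # matmul + reduction: a typical nondeterminism source on GPU
    return w.grad.clone()

# Two runs with the same seed should match bit-for-bit.
assert torch.equal(run_once(1234), run_once(1234))
print("deterministic: OK")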
examples/llama/Llama2_7b.sh

 #!/bin/bash
 # set -eux

-for para in $*
+for para in $@
 do
     if [[ $para == --profiling* ]]; then
         profiling=${para#*=}
+    elif [[ $para == --reproduce ]]; then
+        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1  # enable deterministic MIOpen algorithms
+        export ROCBLAS_ATOMICS_MOD=0                     # disable rocBLAS atomic operations
+        # Disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm
+        export MIOPEN_DEBUG_CONV_FFT=0
+        export MIOPEN_DEBUG_CONV_DIRECT=0
+        export MIOPEN_DEBUG_CONV_GEMM=1
+        export MIOPEN_DEBUG_CONV_WINOGRAD=0
+        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
     fi
 done
...
@@ -31,15 +40,15 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
 export GLOG_minloglevel=3  # print error-level NCCL logs

 source /opt/dtk/env.sh
 # hipblaslt library
-export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0507-1/lib:$LD_LIBRARY_PATH
 # rocblas
-export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
 # torch: run allreduce on the compute stream (multi-stream -> single-stream)
 export ALLREDUCE_STREAM_WITH_COMPUTE=1
...
@@ -51,7 +60,7 @@ export cache_size_limit=64
 # CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
 SAVE_PATH=./tmp_7b
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
-DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document

 GPT_MODEL_ARGS=(
     --num-layers 32
...
@@ -60,7 +69,7 @@ GPT_MODEL_ARGS=(
     --num-attention-heads 32
     --max-position-embeddings 4096
-    --normalization RMSNorm # LightopRMSNorm
+    --normalization LightopRMSNorm # RMSNorm
     --position-embedding-type rope # none #
     --untie-embeddings-and-output-weights # keep the embedding and output weights separate; adds flexibility
 )
...
@@ -104,6 +113,8 @@ TRAINING_ARGS=(
     # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; optimization not yet adapted
     # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
     --use-flash-attn
+    # --reproduce
+    --num-workers 2
 )
 # Environment variables for the torch flash-attention path
 # export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
...
@@ -130,7 +141,7 @@ DATA_ARGS=(
     --seq-length 4096 #4096
     --split 949,50,1
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
 )

 EVAL_AND_LOGGING_ARGS=(
...
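Usage note (not part of the diff): passing --reproduce to this script, e.g. bash examples/llama/Llama2_7b.sh --reproduce, makes the argument loop at the top export the deterministic MIOpen/rocBLAS settings before training starts; separately, uncommenting --reproduce in TRAINING_ARGS forwards the flag to Megatron itself, where the patched _set_random_seed then enforces a positive --seed and zero attention/hidden dropout.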