Commit eb4333f0 authored by wangxj's avatar wangxj
Browse files

Add a `reproduce` option: when enabled, the training loss becomes fully reproducible, at a small cost in training performance.

parent 57944e55
......@@ -191,6 +191,7 @@ class CoreAdaptation(MegatronAdaptationABC):
from ..training.initialize import _initialize_distributed
from ..training.initialize import _compile_dependencies
from ..training.training import train
from ..training.initialize import _set_random_seed
MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
build_tokenizer)
......@@ -200,6 +201,9 @@ class CoreAdaptation(MegatronAdaptationABC):
# remove fused_kernels
MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
_compile_dependencies)
# 添加固定seed
MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
_set_random_seed)
# add trace_handler
MegatronAdaptation.register('megatron.training.training.train',
......
......@@ -24,6 +24,7 @@ def add_megatron_arguments_patch(parser: argparse.ArgumentParser):
# add extra arguments
parser = _add_extra_network_size_args(parser)
parser = _add_extra_training_args(parser)
parser = _add_extra_initialization_args(parser)
parser = _add_extra_distributed_args(parser)
parser = _add_extra_tokenizer_args(parser)
......@@ -101,6 +102,14 @@ def _add_extra_training_args(parser):
return parser
def _add_extra_initialization_args(parser):
group = parser.add_argument_group(title='extra initialization args')
group.add_argument('--reproduce', action='store_true',
help='reproduce train loss, need set --seed > 0.')
return parser
def _add_extra_tokenizer_args(parser):
# 删除原参数
remove_original_params(parser, ["tokenizer_type"])
......
"""Megatron initialization."""
import time
import torch
import random
import numpy as np
from datetime import timedelta
from megatron.training import get_args
from megatron.core import mpu
from megatron.core import mpu, tensor_parallel
def _compile_dependencies():
......@@ -149,3 +151,36 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
f"> initialized pipeline model parallel with size "
f"{mpu.get_pipeline_model_parallel_world_size()}"
)
def _set_random_seed(
    seed_: int,
    data_parallel_random_init: bool = False,
    te_rng_tracker: bool = False,
    inference_rng_tracker: bool = False,
    use_cudagraphable_rng: bool = False,
):
    """Set random seed for reproducability.

    Seeds the Python, NumPy and torch RNGs, offsetting the base seed per
    pipeline-parallel (and optionally data-parallel) rank so that ranks
    draw distinct random streams.  When ``args.reproduce`` is set, also
    forces deterministic cuDNN/torch kernels so the training loss is
    exactly reproducible, at some performance cost.

    Args:
        seed_: base random seed; must be a positive integer.
        data_parallel_random_init: additionally offset the seed by the
            data-parallel rank.
        te_rng_tracker, inference_rng_tracker, use_cudagraphable_rng:
            forwarded to ``tensor_parallel.model_parallel_cuda_manual_seed``.

    Raises:
        ValueError: if ``seed_`` is ``None`` or not positive.
        AssertionError: if ``args.reproduce`` is set while attention or
            hidden dropout is non-zero (dropout randomness would break
            exact reproduction).
    """
    args = get_args()
    # Guard clause: reject a missing or non-positive seed up front.
    if seed_ is None or seed_ <= 0:
        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))
    # Ensure that different pipeline MP stages get different seeds.
    seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
    # Ensure different data parallel ranks get different seeds.
    if data_parallel_random_init:
        seed += 10 * mpu.get_data_parallel_rank()
    # Seed the CPU-side RNGs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.device_count() > 0:
        # Seed the per-rank GPU RNG trackers.
        tensor_parallel.model_parallel_cuda_manual_seed(
            seed, te_rng_tracker, inference_rng_tracker, use_cudagraphable_rng)
    if args.reproduce:
        # Dropout injects per-step randomness, so exact loss reproduction
        # requires both dropouts to be disabled.
        assert args.attention_dropout <= 0, f"To utilize the reproduction function, args.attention_dropout = {args.attention_dropout} must be set to 0."
        assert args.hidden_dropout <= 0, f"To utilize the reproduction function, args.hidden_dropout = {args.hidden_dropout} must be set to 0."
        torch.backends.cudnn.deterministic = True  # force deterministic cuDNN algorithms
        torch.backends.cudnn.benchmark = False     # disable algorithm autotuning (keeps algo choice fixed)
        # NOTE(review): some CUDA ops additionally need CUBLAS_WORKSPACE_CONFIG
        # set in the environment for this to succeed — confirm in the launch script.
        torch.use_deterministic_algorithms(True)   # error out on nondeterministic torch ops
node036 slots=8
\ No newline at end of file
#!/bin/bash
INITIALIZATION_ARGS=( --num-workers 2)
for para in $*
do
if [[ $para == --data_path* ]];then
......@@ -10,6 +12,16 @@ do
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
elif [[ $para == --reproduce* ]];then
INITIALIZATION_ARGS=( --reproduce --num-workers 0)
export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # miopen 确定算法打开
export ROCBLAS_ATOMICS_MOD=0 # rocblas 关闭原子操作
# 关闭miopen中的atomic操作算法, 只保留gemm算法
export MIOPEN_DEBUG_CONV_FFT=0
export MIOPEN_DEBUG_CONV_DIRECT=0
export MIOPEN_DEBUG_CONV_GEMM=1
export MIOPEN_DEBUG_CONV_WINOGRAD=0
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
fi
done
......@@ -63,7 +75,7 @@ TRAINING_ARGS=(
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256
--train-iters 10
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
......@@ -134,6 +146,7 @@ APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
${INITIALIZATION_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment