Commit e2157771 authored by wangxj

Add a reproduce option; when enabled, the training loss can be reproduced exactly, at a small cost to training performance.

parent 59cb262a
......@@ -201,6 +201,7 @@ class CoreAdaptation(MegatronAdaptationABC):
from ..training.initialize import _initialize_distributed
from ..training.initialize import _compile_dependencies
from ..training.training import train
from ..training.initialize import _set_random_seed
MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
build_tokenizer)
......@@ -208,6 +209,8 @@ class CoreAdaptation(MegatronAdaptationABC):
_initialize_distributed)
MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
_compile_dependencies)
MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
_set_random_seed)
# training.train
MegatronAdaptation.register('megatron.training.training.train',
......
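MegatronAdaptation.register maps a dotted Megatron path to a replacement callable; its implementation is not part of this diff. A minimal sketch of the monkey-patching pattern such a registry could follow (patch_attribute is a hypothetical helper, not the project's actual API, and a real adaptation layer would also have to patch before the call sites import the original symbols):

import importlib

def patch_attribute(dotted_path, replacement):
    """Replace the object named by `dotted_path` with `replacement`.

    Example: patch_attribute('megatron.training.initialize._set_random_seed',
                             _set_random_seed)
    """
    module_path, attr_name = dotted_path.rsplit('.', 1)
    module = importlib.import_module(module_path)
    setattr(module, attr_name, replacement)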
......@@ -56,6 +56,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_training_args(parser)
parser = _add_extra_training_args(parser)
parser = _add_initialization_args(parser)
parser = _add_extra_initialization_args(parser)
parser = _add_learning_rate_args(parser)
parser = _add_checkpointing_args(parser)
parser = _add_mixed_precision_args(parser)
......@@ -63,6 +64,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_extra_distributed_args(parser)
parser = _add_validation_args(parser)
parser = _add_data_args(parser)
parser = _add_extra_data_args(parser)
parser = _add_tokenizer_args(parser)
parser = _add_extra_tokenizer_args(parser)
parser = _add_autoresume_args(parser)
......@@ -140,7 +142,24 @@ def _add_extra_training_args(parser):
return parser
def _add_extra_initialization_args(parser):
    group = parser.add_argument_group(title='extra initialization args')
    group.add_argument('--reproduce', action='store_true',
                       help='Reproduce the training loss exactly; requires --seed > 0.')
    return parser
def _add_extra_data_args(parser):
    # Remove the original argument so it can be redefined below
    # (a sketch of remove_original_params follows after this hunk).
    remove_original_params(parser, ["num_workers"])
    # Redefine the argument with a new default.
    group = parser.add_argument_group(title='extra data args')
    group.add_argument('--num-workers', type=int, default=0,
                       help="Dataloader number of workers.")
    return parser
def _add_extra_tokenizer_args(parser):
    # Remove the original argument so it can be redefined.
    remove_original_params(parser, ["tokenizer_type"])
......
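remove_original_params, used by the two helpers above to drop Megatron's original --num-workers and --tokenizer-type definitions before redefining them, is not shown in this diff. A rough sketch of how such a helper could be written on top of argparse internals (_actions, _option_string_actions and _remove_action are private CPython attributes, so treat this as illustrative only):

def remove_original_params(parser, dest_names):
    """Drop previously registered options (matched by dest) so they can be re-added."""
    for dest in dest_names:
        for action in list(parser._actions):
            if action.dest == dest:
                # Unregister the option strings, then the action itself.
                for option_string in action.option_strings:
                    parser._option_string_actions.pop(option_string, None)
                parser._remove_action(action)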
"""Megatron initialization."""
import random
import time
import numpy as np
import torch
from datetime import timedelta
from megatron.training import get_args
from megatron.core import mpu
from megatron.core import mpu, tensor_parallel
def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
......@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
f"{mpu.get_pipeline_model_parallel_world_size()}"
)
def _set_random_seed(seed_, data_parallel_random_init=False, te_rng_tracker=False, inference_rng_tracker=False):
    """Set random seed for reproducibility."""
    args = get_args()
    if seed_ is not None and seed_ > 0:
        # Ensure that different pipeline MP stages get different seeds.
        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
        # Ensure different data parallel ranks get different seeds.
        if data_parallel_random_init:
            seed = seed + (10 * mpu.get_data_parallel_rank())
        # Set the CPU-side random seeds.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.device_count() > 0:
            # Set the GPU-side random seeds.
            tensor_parallel.model_parallel_cuda_manual_seed(seed, te_rng_tracker, inference_rng_tracker)
        if args.reproduce:
            assert args.attention_dropout == 0, \
                f"To use the reproduce option, args.attention_dropout = {args.attention_dropout} must be set to 0."
            assert args.hidden_dropout == 0, \
                f"To use the reproduce option, args.hidden_dropout = {args.hidden_dropout} must be set to 0."
            torch.backends.cudnn.deterministic = True  # force deterministic cuDNN/MIOpen algorithms
            torch.backends.cudnn.benchmark = False  # disable auto-tuning so the same kernel is always selected
            torch.use_deterministic_algorithms(True)  # use deterministic torch operators to avoid nondeterminism
    else:
        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))
def _compile_dependencies():
......
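The reproduce branch above fixes the seeds and forces deterministic kernels. A small standalone check of that idea (a CPU-only sketch, independent of Megatron; on GPU the MIOpen/rocBLAS environment variables set in the launch script below are also required):

import torch

def run_once(seed):
    """One seeded forward/backward pass; returns the weight gradient."""
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(True)
    layer = torch.nn.Linear(256, 256)
    x = torch.randn(8, 256)
    layer(x).sum().backward()
    return layer.weight.grad.clone()

# Two runs with the same seed must produce bit-identical gradients.
assert torch.equal(run_once(1234), run_once(1234))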
#!/bin/bash
# set -eux
for para in $*
for para in $@
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
elif [[ $para == --reproduce ]];then
export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable MIOpen deterministic convolution algorithms
export ROCBLAS_ATOMICS_MOD=0 # disable rocBLAS atomic operations
# Disable the MIOpen convolution algorithms that use atomics; keep only the GEMM algorithm.
export MIOPEN_DEBUG_CONV_FFT=0
export MIOPEN_DEBUG_CONV_DIRECT=0
export MIOPEN_DEBUG_CONV_GEMM=1
export MIOPEN_DEBUG_CONV_WINOGRAD=0
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
fi
done
......@@ -31,15 +40,15 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipBLASLt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0507-1/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: switch from multiple streams to a single stream (allreduce shares the compute stream)
export ALLREDUCE_STREAM_WITH_COMPUTE=1
......@@ -51,7 +60,7 @@ export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 32
......@@ -60,7 +69,7 @@ GPT_MODEL_ARGS=(
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--normalization LightopRMSNorm # RMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)
......@@ -104,6 +113,8 @@ TRAINING_ARGS=(
# --tp-comm-overlap # overlap tensor-parallel communication with GEMM; this optimization has not been adapted yet
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
# --reproduce
--num-workers 2
)
# Environment variables for using torch flash attention
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
......@@ -130,7 +141,7 @@ DATA_ARGS=(
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
--tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
......