Commit e2157771 authored by wangxj

Add a reproduce option: when enabled, the training loss is exactly reproducible, at a small cost in training performance.

parent 59cb262a
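In essence, the new flag seeds every RNG and forces deterministic kernel selection. A minimal standalone sketch of that recipe, illustrative only and not code from this commit (make_deterministic is a hypothetical name):

    import random
    import numpy as np
    import torch

    def make_deterministic(seed: int):
        """Seed all RNGs and force deterministic kernel selection."""
        assert seed > 0, "reproducibility requires a positive seed"
        random.seed(seed)                         # Python RNG
        np.random.seed(seed)                      # NumPy RNG
        torch.manual_seed(seed)                   # torch CPU (and current GPU) RNGs
        torch.backends.cudnn.deterministic = True  # deterministic conv algorithms
        torch.backends.cudnn.benchmark = False     # no autotuning between runs
        torch.use_deterministic_algorithms(True)   # error out on nondeterministic ops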
@@ -201,6 +201,7 @@ class CoreAdaptation(MegatronAdaptationABC):
        from ..training.initialize import _initialize_distributed
        from ..training.initialize import _compile_dependencies
        from ..training.training import train
        from ..training.initialize import _set_random_seed
        MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
                                    build_tokenizer)
@@ -208,6 +209,8 @@ class CoreAdaptation(MegatronAdaptationABC):
                                    _initialize_distributed)
        MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                    _compile_dependencies)
        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
                                    _set_random_seed)
        # training.train
        MegatronAdaptation.register('megatron.training.training.train',
...
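MegatronAdaptation.register presumably swaps the symbol at the given dotted path for the local replacement. A sketch of one way such a patcher can work (hypothetical; register_patch is not this repo's API):

    import importlib

    def register_patch(dotted_path: str, replacement) -> None:
        """Illustrative only: rebind the attribute named by a dotted path."""
        module_path, attr = dotted_path.rsplit('.', 1)
        module = importlib.import_module(module_path)
        setattr(module, attr, replacement)  # later lookups of module.attr see the patch

A plain setattr does not reach call sites that already did `from module import name`, so this kind of patching has to run before those imports do.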
@@ -56,6 +56,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
    parser = _add_training_args(parser)
    parser = _add_extra_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_extra_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
@@ -63,6 +64,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
    parser = _add_extra_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_extra_data_args(parser)
    parser = _add_tokenizer_args(parser)
    parser = _add_extra_tokenizer_args(parser)
    parser = _add_autoresume_args(parser)
@@ -140,6 +142,23 @@ def _add_extra_training_args(parser):
    return parser
def _add_extra_initialization_args(parser):
    group = parser.add_argument_group(title='extra initialization args')
    group.add_argument('--reproduce', action='store_true',
                       help='Reproduce the training loss exactly; requires --seed > 0.')
    return parser
def _add_extra_data_args(parser):
    # Remove the original argument so it can be redefined.
    remove_original_params(parser, ["num_workers"])
    # Redefine the argument with a new default.
    group = parser.add_argument_group(title='extra data args')
    group.add_argument('--num-workers', type=int, default=0,
                       help='Dataloader number of workers.')
    return parser
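remove_original_params is defined elsewhere in the repo. For illustration, a helper like it can be built on argparse internals (_remove_action and the option-string table); this is a hedged sketch, not the repo's implementation:

    def remove_original_params(parser, dests):
        """Illustrative sketch: drop argparse options whose dest is in `dests`."""
        for action in list(parser._actions):
            if action.dest in dests:
                parser._remove_action(action)  # detach the action from the parser
                for opt in action.option_strings:
                    # free the option strings ('--num-workers', ...) for redefinition
                    parser._option_string_actions.pop(opt, None)
        # note: argument-group bookkeeping (help formatting) is elided here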
def _add_extra_tokenizer_args(parser):
    # Remove the original arguments
...
"""Megatron initialization.""" """Megatron initialization."""
import random
import time import time
import numpy as np
import torch import torch
from datetime import timedelta from datetime import timedelta
from megatron.training import get_args from megatron.training import get_args
from megatron.core import mpu from megatron.core import mpu, tensor_parallel
def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
...@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): ...@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
f"{mpu.get_pipeline_model_parallel_world_size()}" f"{mpu.get_pipeline_model_parallel_world_size()}"
) )
def _set_random_seed(seed_, data_parallel_random_init=False, te_rng_tracker=False, inference_rng_tracker=False):
    """Set random seed for reproducibility."""
    args = get_args()
    if seed_ is not None and seed_ > 0:
        # Ensure that different pipeline MP stages get different seeds.
        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
        # Ensure different data parallel ranks get different seeds.
        if data_parallel_random_init:
            seed = seed + (10 * mpu.get_data_parallel_rank())
        # Seed the CPU RNGs.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.device_count() > 0:
            # Seed the GPU RNGs.
            tensor_parallel.model_parallel_cuda_manual_seed(seed, te_rng_tracker, inference_rng_tracker)
        if args.reproduce:
            assert args.attention_dropout == 0, f"To use the reproduce feature, args.attention_dropout = {args.attention_dropout} must be set to 0."
            assert args.hidden_dropout == 0, f"To use the reproduce feature, args.hidden_dropout = {args.hidden_dropout} must be set to 0."
            torch.backends.cudnn.deterministic = True  # force deterministic cuDNN/MIOpen algorithms
            torch.backends.cudnn.benchmark = False  # disable autotuning so the convolution algorithm stays fixed
            torch.use_deterministic_algorithms(True)  # use deterministic torch kernels to avoid nondeterminism
    else:
        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))
def _compile_dependencies():
...
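A quick way to sanity-check the deterministic path (illustrative, not part of the commit): run the same seeded computation twice and require bit-identical loss and gradients.

    import random
    import numpy as np
    import torch

    def check_repeatable(seed: int = 1234):
        """The same seed must produce bit-identical loss and gradients."""
        outs = []
        for _ in range(2):
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.use_deterministic_algorithms(True)
            x = torch.randn(8, 16)
            w = torch.randn(16, 4, requires_grad=True)
            loss = (x @ w).square().mean()
            loss.backward()
            outs.append((loss.detach().clone(), w.grad.clone()))
        assert torch.equal(outs[0][0], outs[1][0]), "loss differs between runs"
        assert torch.equal(outs[0][1], outs[1][1]), "gradients differ between runs"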
#!/bin/bash
# set -eux
for para in $@
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
    elif [[ $para == --reproduce ]];then
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0 # disable rocBLAS atomic operations
        # Disable the MIOpen convolution algorithms that use atomics; keep only GEMM.
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done
@@ -31,15 +40,15 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipBLASLt library
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0507-1/lib:$LD_LIBRARY_PATH
# rocBLAS library
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: run all-reduce on the compute stream (multi-stream -> single stream)
export ALLREDUCE_STREAM_WITH_COMPUTE=1
@@ -51,7 +60,7 @@ export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
    --num-layers 32
@@ -60,7 +69,7 @@ GPT_MODEL_ARGS=(
    --num-attention-heads 32
    --max-position-embeddings 4096
    --normalization LightopRMSNorm # RMSNorm
    --position-embedding-type rope # none #
    --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)
@@ -104,6 +113,8 @@ TRAINING_ARGS=(
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not yet adapted
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
    --use-flash-attn
    # --reproduce
    --num-workers 2
)
# Environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
@@ -130,7 +141,7 @@ DATA_ARGS=(
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
...