Commit e2157771 authored by wangxj

Add a reproduce option: when enabled, the training loss is exactly reproducible across runs, at a small cost in training performance.

parent 59cb262a
@@ -201,6 +201,7 @@ class CoreAdaptation(MegatronAdaptationABC):
        from ..training.initialize import _initialize_distributed
        from ..training.initialize import _compile_dependencies
        from ..training.training import train
        from ..training.initialize import _set_random_seed

        MegatronAdaptation.register('megatron.training.tokenizer.tokenizer.build_tokenizer',
                                    build_tokenizer)
@@ -208,6 +209,8 @@ class CoreAdaptation(MegatronAdaptationABC):
                                    _initialize_distributed)
        MegatronAdaptation.register('megatron.training.initialize._compile_dependencies',
                                    _compile_dependencies)
        MegatronAdaptation.register('megatron.training.initialize._set_random_seed',
                                    _set_random_seed)
        # training.train
        MegatronAdaptation.register('megatron.training.training.train',
...
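The register calls above rely on MegatronAdaptation's patching mechanism. As a rough sketch of the idiom (assumed behavior; the real implementation lives elsewhere in this repo's adaptation layer), registration resolves the dotted path and swaps the attribute in place, so later lookups of the Megatron symbol hit the replacement:

```python
# Hypothetical sketch of the patching idiom, not the repo's actual code.
import importlib

def register(dotted_path, replacement):
    module_path, _, attr = dotted_path.rpartition('.')
    module = importlib.import_module(module_path)
    setattr(module, attr, replacement)  # later lookups of module.attr hit the patch
```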
@@ -56,6 +56,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
    parser = _add_training_args(parser)
    parser = _add_extra_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_extra_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
@@ -63,6 +64,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
    parser = _add_extra_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_extra_data_args(parser)
    parser = _add_tokenizer_args(parser)
    parser = _add_extra_tokenizer_args(parser)
    parser = _add_autoresume_args(parser)
@@ -140,7 +142,24 @@ def _add_extra_training_args(parser):
    return parser

def _add_extra_initialization_args(parser):
    group = parser.add_argument_group(title='extra initialization args')
    group.add_argument('--reproduce', action='store_true',
                       help='Reproduce the training loss exactly; requires --seed > 0.')
    return parser

def _add_extra_data_args(parser):
    # Remove the original argument so it can be redefined below.
    remove_original_params(parser, ["num_workers"])
    # Redefine the argument with a new default.
    group = parser.add_argument_group(title='extra data args')
    group.add_argument('--num-workers', type=int, default=0,
                       help='Dataloader number of workers.')
    return parser
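For context, remove_original_params is the repo's helper for dropping an already registered argparse option so it can be re-added with a new default. Its implementation is not part of this diff; a minimal sketch of what such a helper can look like, using argparse's private _remove_action API (an assumption, not the repo's actual code):

```python
import argparse

def remove_original_params(parser: argparse.ArgumentParser, param_names):
    """Drop previously registered options so they can be re-added with new defaults."""
    for action in list(parser._actions):
        if action.dest in param_names:
            parser._remove_action(action)  # detach the action from the parser
            for opt in action.option_strings:
                parser._option_string_actions.pop(opt, None)  # free the '--flag' name
```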

def _add_extra_tokenizer_args(parser):
    # Remove the original argument so it can be redefined.
    remove_original_params(parser, ["tokenizer_type"])
...
"""Megatron initialization.""" """Megatron initialization."""
import random
import time import time
import numpy as np
import torch import torch
from datetime import timedelta from datetime import timedelta
from megatron.training import get_args from megatron.training import get_args
from megatron.core import mpu from megatron.core import mpu, tensor_parallel
def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
@@ -79,6 +81,32 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
        f"{mpu.get_pipeline_model_parallel_world_size()}"
    )

def _set_random_seed(seed_, data_parallel_random_init=False, te_rng_tracker=False, inference_rng_tracker=False):
    """Set random seed for reproducibility."""
    args = get_args()
    if seed_ is not None and seed_ > 0:
        # Ensure that different pipeline MP stages get different seeds.
        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
        # Ensure different data parallel ranks get different seeds.
        if data_parallel_random_init:
            seed = seed + (10 * mpu.get_data_parallel_rank())
        # Seed the CPU-side generators.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.device_count() > 0:
            # Seed the GPU-side generators.
            tensor_parallel.model_parallel_cuda_manual_seed(seed, te_rng_tracker, inference_rng_tracker)
        if args.reproduce:
            assert args.attention_dropout == 0, f"To use --reproduce, attention_dropout must be 0 (got {args.attention_dropout})."
            assert args.hidden_dropout == 0, f"To use --reproduce, hidden_dropout must be 0 (got {args.hidden_dropout})."
            torch.backends.cudnn.deterministic = True  # use deterministic cuDNN algorithms
            torch.backends.cudnn.benchmark = False     # do not autotune convolution algorithms
            torch.use_deterministic_algorithms(True)   # force deterministic torch operators
    else:
        raise ValueError("Seed ({}) should be a positive integer.".format(seed_))
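As a sanity check of what these settings buy: with the same seed and deterministic algorithms enabled, two runs of the same computation should be bit-identical. A minimal standalone sketch (not part of this commit; requires a GPU):

```python
import os
import torch

# cuBLAS needs a fixed workspace size for deterministic GEMMs on CUDA builds;
# harmless on ROCm. Must be set before the first GEMM is launched.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

def run(seed):
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(True)
    x = torch.randn(128, 128, device="cuda")
    w = torch.randn(128, 128, device="cuda")
    return x @ w

assert torch.equal(run(42), run(42))  # bitwise-identical given the same seed
```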

def _compile_dependencies():
...
#!/bin/bash
# set -eux
for para in $@
do
    if [[ $para == --profiling* ]];then
        profiling=${para#*=}
    elif [[ $para == --reproduce ]];then
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable MIOpen deterministic algorithms
        export ROCBLAS_ATOMICS_MOD=0 # disable rocBLAS atomic operations
        # Disable the MIOpen convolution algorithms that use atomics; keep only GEMM.
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done
@@ -31,15 +40,15 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipBLASLt library
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0507-1/lib:$LD_LIBRARY_PATH
# rocBLAS library
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: fold the multi-stream allreduce into the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
@@ -51,7 +60,7 @@ export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
    --num-layers 32
@@ -60,7 +69,7 @@ GPT_MODEL_ARGS=(
    --num-attention-heads 32
    --max-position-embeddings 4096
    --normalization LightopRMSNorm # RMSNorm
    --position-embedding-type rope # none
    --untie-embeddings-and-output-weights # untie the embedding and output weights for more flexibility
)
@@ -104,6 +113,8 @@ TRAINING_ARGS=(
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; optimization not yet adapted
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
    --use-flash-attn
    # --reproduce
    --num-workers 2
)
# Environment variables for using torch flash attention
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
@@ -130,7 +141,7 @@ DATA_ARGS=(
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
...