"examples/vscode:/vscode.git/clone" did not exist on "bde4880c9cceada20b387d3110061c65249dabcc"
Commit 88cfdb04 authored by liangjing

Add an online job submission workflow

parent 298890be
@@ -61,8 +61,33 @@ Python dependency installation (skip the dependency installation steps below if the image has already been pulled)
#Configuration and data paths differ between environments; adjust the following in run_benchmark_8gpu.sh to match your setup:
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert"}  # change to the actual data path
### Test Scale
Performance and accuracy testing at two-node, eight-card scale on the online resources.
### Environment Setup
Use the provided, pre-packaged environment, which can be downloaded from the network drive at the link below:
Link: https://pan.baidu.com/s/1YI4SPDEzb4F5_oJaEuhWEw?pwd=a9xx
Extraction code: a9xx
### Pretrained Model
The /workspace/bert_data directory holds the pretrained model as follows:
└── /workspace/bert_data/phase1
    └── model.ckpt-28252.tf_pickled  # pretrained model
### Training
Training procedure (a minimal example is sketched after these steps):
step1. Unpack the downloaded environment tarball to a suitable location, then edit env.sh to match that location and your platform.
step2. Edit the run_sbatch.sh job submission script to match your platform.
step3. Run: sbatch run_sbatch.sh
#Configuration and data paths differ between environments; adjust the following in run_benchmark.sh to match your setup:
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert"}  # change to the actual data path
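A minimal sketch of the three steps (the tarball name and target directory are placeholders; substitute your own):
tar -xf bert_env.tar -C /path_to_env     #step1: unpack the packaged environment
vi env.sh run_sbatch.sh                  #step1/2: point paths, modules and the SBATCH partition at your platform
sbatch run_sbatch.sh                     #step3: submit the two-node job to Slurm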
## Test Results
With the input data above, on Z100L*8 accelerator cards as well as Z100*8 on the online resources, the run ultimately reaches the official convergence requirement.
## Source Repository and Issue Reporting
* https://developer.hpccube.com/codes/modelzoo/mlperf_bert-large
#!/bin/bash
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/2.4.1/gcc-7.3.1
module load compiler/rocm/dtk-21.04
#for mlperf_bert
export PYTHONPATH=/path_to_env
export PATH=/path_to_env/bin:$PATH
export LD_LIBRARY_PATH=/path_to_env/lib/python3.6:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/path_to_env/lib:$LD_LIBRARY_PATH
export INCLUDE=/path_to_env/include/python3.6m:$INCLUDE
export ROCM_DIR=/public/software/compiler/rocm/dtk-21.04
export PATH=${ROCM_DIR}/bin:${ROCM_DIR}/hcc/bin:${ROCM_DIR}/hip/bin:${ROCM_DIR}/miopen/bin:${ROCM_DIR}/opencl/bin/x86_64:$PATH
export LD_LIBRARY_PATH=${ROCM_DIR}/lib:${ROCM_DIR}/lib64:${ROCM_DIR}/miopen/lib:${ROCM_DIR}/opencl/lib/x86_64:/public/software/compiler/rocm/dtk-21.04/roctracer/lib:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=${ROCM_DIR}/include:${ROCM_DIR}/opencl/include:${ROCM_DIR}/hiprand/include:${ROCM_DIR}/miopen/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${ROCM_DIR}/include:${ROCM_DIR}/miopen/include:$CPLUS_INCLUDE_PATH
export ROCM_PATH=${ROCM_DIR}
export HIP_PATH=${ROCM_DIR}/hip/
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch/gflags-2.1.2-build/include:/public/software/apps/DeepLearning/PyTorch/glog-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch/gflags-2.1.2-build/include:/public/software/apps/DeepLearning/PyTorch/glog-build/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/glog-build/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/lmdb-0.9.24-build/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/opencv-2.4.13.6-build/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/openblas-0.3.7-build/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/gflags-2.1.2-build/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/software/apps/DeepLearning/PyTorch/openmp-build/lib:$LD_LIBRARY_PATH
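# Example sanity check after sourcing this file (illustrative; assumes the packaged
# environment under /path_to_env ships Python 3.6 with PaddlePaddle):
#   source ./env.sh
#   python3 -c "import paddle; print(paddle.__version__)"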
#!/bin/bash
################################################################################
# Copyright 2016-2021 by SW Group, Chengdu Hygon IC Design Co., Ltd.
# All rights reserved. See COPYRIGHT for detailed information.
#
# @file set_env.sh
# @brief set env variables for running test.
#
# @author wangmingliang <wangmingliang@hygon.cn>
# @date 2022/03/23
# @history 1.0
################################################################################
export HYGON_ROCM_INSTALL=/opt/dtk-21.04
# library path
# fix LD_LIBRARY_PATH begin or end with colon
export LD_LIBRARY_PATH=$(echo ${LD_LIBRARY_PATH} | sed 's/:$//; s/^://;')
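# The ${VAR:+:${VAR}} expansions below append the previous value only when it is
# non-empty, so LD_LIBRARY_PATH never ends up with a leading or trailing colon
# (an empty path component would make the dynamic linker search the current directory).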
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/lib:${HYGON_ROCM_INSTALL}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hip/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipcub/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hiprand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hipsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/hsa/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/llvm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopen/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/miopengemm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/oam/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/opencl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocalution/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocclr/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocfft/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocm_smi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprim/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocprofiler/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocrand/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsolver/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocsparse/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocthrust/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/roctracer/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export LD_LIBRARY_PATH=${HYGON_ROCM_INSTALL}/rocblas/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/rocrand/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=${HYGON_ROCM_INSTALL}/hiprand/include:$CPLUS_INCLUDE_PATH
# executable path
# fix PATH begin or end with colon
export PATH=$(echo ${PATH} | sed 's/:$//; s/^://;')
export PATH=${HYGON_ROCM_INSTALL}/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/hip/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/llvm/lib/clang/13.0.0/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/miopen/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/opencl/bin${PATH:+:${PATH}}
export PATH=${HYGON_ROCM_INSTALL}/rocprofiler/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hmmer/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/hh-suite-master/build/bin${PATH:+:${PATH}}
export PATH=/public/home/zhangqha/bladisc/kalign/build/bin${PATH:+:${PATH}}
# component path
export ROCM_PATH=${HYGON_ROCM_INSTALL}
export HSA_PATH=${HYGON_ROCM_INSTALL}/hsa
export HIP_PATH=${HYGON_ROCM_INSTALL}/hip
export HIP_ROCCLR_HOME=${HYGON_ROCM_INSTALL}/hip
export HIP_LIB_PATH=${HYGON_ROCM_INSTALL}/hip/lib
export DEVICE_LIB_PATH=${HYGON_ROCM_INSTALL}/amdgcn/bitcode
export HIP_CLANG_PATH=${HYGON_ROCM_INSTALL}/llvm/bin
export HIP_RUNTIME="rocclr"
export HIP_COMPILER="clang"
#export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$CPLUS_INCLUDE_PATH
#export INCLUDE=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$INCLUDE
#export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:/public/home/zhangqha/openssl_install/include:/public/home/zhangqha/openssl_install/include/openssl:/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:${HIP_PATH}/include:$C_INCLUDE_PATH
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
export FLAGS_rocm_dir=/public/software/compiler/rocm/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/software/apps/DeepLearning/Data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
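# Select the input layout: switch to the varlength shards only when NV-style input
# is enabled and the uncompressed dataset is explicitly disabled; otherwise keep
# (and force) the uncompressed shards.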
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
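# Note: get_device_id is defined above but not invoked below; CUDA_VISIBLE_DEVICES
# is set explicitly in the branch that follows.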
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
batch_size=4
eval_batch_size=63
use_amp=True
use_pure_fp16=True
max_steps=7100
log_freq=50
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
export FLAGS_max_inplace_grad_add=2
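# With CPU-side exchange padding enabled, the uncompressed shards are used
# regardless of the dataset selection made earlier.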
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
APP="python3 -u $BERT_CMD"
case $(expr $lrank % 4) in
[0])
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3
export FLAGS_selected_gpus=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP} >& $LOG_FILE
;;
[1])
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3
export FLAGS_selected_gpus=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
numactl --cpunodebind=1 --membind=1 ${APP} >& $LOG_FILE
;;
[2])
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3
export FLAGS_selected_gpus=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
numactl --cpunodebind=2 --membind=2 ${APP} >& $LOG_FILE
;;
[3])
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3
export FLAGS_selected_gpus=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP} >& $LOG_FILE
;;
esac
#!/bin/bash
#SBATCH -p xxx
#SBATCH -N 2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=4
#SBATCH --gres=dcu:4
#SBATCH -J mlperf
source ./env.sh
cp ./rundir_8gpu/init_env.py .
export HSA_FORCE_FINE_GRAIN_PCIE=1
export PYTHON=python3
export PADDLE_TRAINER_ENDPOINTS=`$PYTHON -c "import list;print(list.get_list())"`
echo $PADDLE_TRAINER_ENDPOINTS
#set -e
hostfile=./hostfile
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_DCU=$(($num_node*4))
export LD_LIBRARY_PATH=/public/software/compiler/rocm/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
export NCCL_IB_HCA=mlx5_0:1
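# Build the Open MPI rankfile: the first pass places ranks 0..(4*nodes-1) on
# slots 0-3 of each node (the trainer processes); the second pass continues the
# numbering on slots 4-7 (the reader processes), matching
# num_process = PADDLE_TRAINERS_NUM * 2 below.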
rm -f hosts  # -f avoids an error on the first run, when the file does not yet exist
p=0
for i in `cat hostfile`;do
for j in `seq 1 4`;do
num=$[ $p * 4 + $j - 1]
echo "rank ${num}=${i} slot=$[j-1]" >> hosts
done
p=$(expr $p + 1)
done
for i in `cat hostfile`;do
for j in `seq 1 4`;do
num=$[ $p * 4 + $j - 1]
echo "rank ${num}=${i} slot=$[j+3]" >> hosts
done
p=$(expr $p + 1)
done
export HIP_LAUNCH_BLOCKING=1
export PADDLE_TRAINERS_NUM=$num_DCU
export SEED=${SEED:-"$RANDOM"}
echo "PADDLE_TRAINER_ENDPOINTS " $PADDLE_TRAINER_ENDPOINTS
num_process=$(($PADDLE_TRAINERS_NUM*2))
if [[ $num_process -gt 1 ]]; then
ORTERUN=`which orterun`
mpirun="mpirun --allow-run-as-root -np $num_process --rankfile hosts -mca btl_tcp_if_exclude ib0 --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark.sh"
else
mpirun=""
fi
for NPROC_PER_NODE in 4; do
echo "command is " $mpirun $CMD
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done