#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Environment setup for the BERT training launch.
#
# Inputs read from the caller's environment (all optional):
#   BASE_DATA_DIR            root of the dataset/checkpoint tree
#   OMPI_COMM_WORLD_RANK     rank of this process; used as the trainer id
#   PADDLE_TRAINERS_NUM      number of trainers (default "1")
#   PADDLE_TRAINER_ENDPOINTS trainer endpoints (default "localhost:60045")

# Abort on the first failing command and trace every command that is run.
set -ex

# Runtime flags for the ROCm/DTK stack and the collective library.
export FLAGS_rocm_dir=/opt/dtk-21.04
export HIP_LAUNCH_BLOCKING=1
export FLAGS_max_inplace_grad_add=2
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export NCCL_DEBUG=INFO
export NCCL_P2P_LEVEL=5

# Input pipeline selection. Both switches are hard-coded here.
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1

# Dataset locations, all derived from BASE_DATA_DIR (overridable by the caller).
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR="$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed"
VARLENGTH_DATA_DIR="$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength"

export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
export EVAL_DIR="$BASE_DATA_DIR/hdf5/eval"

# The variable-length shards are selected only when NV input is on AND the
# uncompressed dataset is switched off. With the values assigned above the
# else-branch is the one taken, which pins USE_UNCOMPRESSED_DATASET to 1.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
  export DATA_DIR="$VARLENGTH_DATA_DIR"
  export EVAL_DIR="$BASE_DATA_DIR/hdf5/eval"
else
  export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET

# Checkpoint and model configuration paths.
export TF_CKPT_PATH="$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled"
export BERT_CONFIG_PATH="$BASE_DATA_DIR/phase1/bert_config.json"
export PYTHON=python3

# Trainer topology. The rank falls back to 0 when OMPI_COMM_WORLD_RANK is not
# set, so PADDLE_TRAINER_ID is never exported as an empty string.
export PADDLE_TRAINER_ID="${OMPI_COMM_WORLD_RANK:-0}"
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-"localhost:60045"}

# Commented-out RCCL timeline settings, kept for reference.
#export RCCL_TIMELINE_EXPORT=1
#export HIP_KERNEL_PRINTF=1
#export RCCL_TIMELINE_CFG_FILENAME=.timeline.cfg
#export RCCL_TIMELINE_DUMP_DIR=/public/home/zhangqha/mlperf_last/bert/paddle/fp16/performance.test.fp16.4096gbs/timeline
# NOTE: the variable name for the commented-out export below sits at the start
# of the following line of this file.
#export
RCCL_TIMELINE_MAXPROFILING_THREADS=8 #export RCCL_TIMELINE_EVENT_SKIP=398 #export RCCL_TIMELINE_MAXPROFILING_EVENTS=2048 OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"} lrank=$OMPI_COMM_WORLD_LOCAL_RANK function get_device_id() { $PYTHON <&1 | tee $LOG_FILE" #Run experiments python3 -u $BERT_CMD #echo ${APP} #export NCCL_IB_HCA=mlx5_0:1 #case ${lrank} in #case $(expr $lrank % 4) in #[0]) # echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)" # export HIP_VISIBLE_DEVICES=0,1,2,3 # export FLAGS_selected_gpus=0 # export UCX_NET_DEVICES=mlx5_0:1 # export UCX_IB_PCI_BW=mlx5_0:50Gbs # numactl --cpunodebind=0 --membind=0 ${APP} >& $LOG_FILE # ;; #[1]) # echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)" # export HIP_VISIBLE_DEVICES=0,1,2,3 # export FLAGS_selected_gpus=1 # export UCX_NET_DEVICES=mlx5_1:1 # export UCX_IB_PCI_BW=mlx5_1:50Gbs # numactl --cpunodebind=1 --membind=1 ${APP} >& $LOG_FILE # ;; #[2]) # echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)" # export HIP_VISIBLE_DEVICES=0,1,2,3 # export FLAGS_selected_gpus=2 # export UCX_NET_DEVICES=mlx5_2:1 # export UCX_IB_PCI_BW=mlx5_2:50Gbs # numactl --cpunodebind=2 --membind=2 ${APP} >& $LOG_FILE # ;; #[3]) # echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)" # export HIP_VISIBLE_DEVICES=0,1,2,3 # export FLAGS_selected_gpus=3 # export UCX_NET_DEVICES=mlx5_3:1 # export UCX_IB_PCI_BW=mlx5_3:50Gbs # numactl --cpunodebind=3 --membind=3 ${APP} >& $LOG_FILE # ;; #esac