# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
get_timestamp() {
local log_file="$1"
cat "$log_file" | grep -E 'run_start|run_stop' | awk '{print $5}' | awk -F',' '{print $1}'
}
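# Assumption: the MLPerf-style run_start/run_stop log lines carry a millisecond
# timestamp in the fifth whitespace-separated field, possibly followed by a
# comma; get_timestamp extracts just the numeric part.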
unset GREP_OPTIONS
for i in $(seq 1 10); do
    log_file="$1$i/worker.0"
    start_t=$(get_timestamp "$log_file" | head -n 1)
    end_t=$(get_timestamp "$log_file" | tail -n 1)
    time_cost=$(python -c "print(($end_t - $start_t) / 60.0 / 1000.0)")
echo "$time_cost"
done
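# Usage sketch: invoked as `<script> <log_dir_prefix>`, this reads
# <log_dir_prefix>1/worker.0 through <log_dir_prefix>10/worker.0 and prints
# each run's wall-clock duration in minutes (timestamps are in milliseconds).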
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from mpi4py import MPI
import numpy as np
import time
import paddle
from pybind.functions import process_allgathered_inputs as process_bert_inputs
from pybind.functions import process_eval_inputs as process_bert_eval_inputs
import h5py
import random
global_comm = MPI.COMM_WORLD
global_rank = global_comm.rank
global_world_size = global_comm.size
assert global_world_size % 2 == 0
def create_group_comm(ranks):
ranks = list(ranks)
new_group = global_comm.group.Incl(ranks)
new_comm = global_comm.Create_group(new_group)
return new_comm
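# create_group_comm returns a communicator restricted to `ranks`; it is used
# below to build the trainer-only, reader-only, and per-pair trainer/reader
# communicators.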
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(comm, seeds, root=0):
seeds = np.array(seeds).astype(np.int64)
comm.Bcast(seeds, root=root)
return seeds.tolist()
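# Seeds are broadcast from the root as a contiguous int64 NumPy buffer so the
# uppercase mpi4py Bcast (which requires buffer-like objects) can be used and
# every rank ends up with an identical seed list.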
def select_dataset_file_for_each_worker(files, f_start_id, worker_num,
worker_index):
"""
Spliting the train file according to the worker index.
"""
num_files = len(files)
if worker_num > num_files:
remainder = worker_num % num_files
data_file = files[(
f_start_id * worker_num + worker_index + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * worker_num + worker_index) % num_files]
# limin-todo:
#data_file = "/data2/zengjinle/dataset/bert_data/hdf5/training-4320/hdf5_4320_shards_uncompressed/part_01799_of_04320.hdf5"
#print("data_file: ", data_file)
return data_file
def read_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.array(f[key], dtype=dtype) for key in keys]
n = outputs[0].shape[0]
masked_lm_labels = np.zeros(outputs[0].shape, dtype=dtype)
lengths = np.zeros(n, dtype=dtype)
for i in range(n):
masked_lm_positions = outputs[3][i]
masked_lm_ids = outputs[4][i]
length = np.count_nonzero(masked_lm_positions)
masked_lm_labels[i][
masked_lm_positions[:length]] = masked_lm_ids[:length]
lengths[i] = np.count_nonzero(outputs[1][i])
outputs = [
outputs[0], outputs[2], outputs[1], masked_lm_labels, outputs[-1],
lengths
]
idx = np.random.choice(np.arange(n), n, replace=False)
for i in range(len(outputs)):
outputs[i] = outputs[i][idx]
return outputs
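# read_hdf5_file returns [input_ids, segment_ids, input_mask, masked_lm_labels,
# next_sentence_labels, lengths], all shuffled by one shared random permutation
# so that corresponding rows stay aligned across the six arrays.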
def read_eval_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.asarray(f[key][:]) for key in keys]
nsamples = outputs[0].shape[0]
all_data = []
for index in range(nsamples):
        [
            input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, next_sentence_labels
        ] = [np.asarray(arr[index].astype(dtype)) for arr in outputs]
length = np.count_nonzero(masked_lm_positions)
masked_lm_positions = masked_lm_positions[:length]
masked_lm_ids = masked_lm_ids[:length]
masked_lm_labels = np.zeros(input_ids.shape, dtype=dtype)
masked_lm_labels[masked_lm_positions] = masked_lm_ids
#if index == 0:
# print("masked_lm_labels = ", masked_lm_labels)
# print("masked_lm_positions = ", masked_lm_positions)
# print("masked_lm_ids = ", masked_lm_ids)
seq_len = np.asarray(np.count_nonzero(input_mask))
data = [
input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
seq_len,
]
# (2050, ), i.e., 512 * 4 + 1 + 1
one_sample_data = np.concatenate([d.flatten() for d in data])
all_data.extend(one_sample_data)
# (2050000, ) -> (10000, 2050)
return np.asarray(all_data).reshape((nsamples, -1))
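# Each row of the returned matrix is one flattened evaluation sample
# (4 * max_seq_length + 2 values); process_bert_eval_inputs below unpacks it.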
class WorkerInitObj(object):
    """Construct the object with a different seed so that the DataLoader
    generates data with a different seed in each worker."""
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
class Context:
def __init__(self):
half_size = int(global_world_size / 2)
self.trainer_id = global_rank % half_size
self.trainer_num = half_size
self.is_trainer = (global_rank < half_size)
self.reader_id = self.trainer_id
self.reader_num = self.trainer_num
self.is_reader = not self.is_trainer
self.trainer_comm = create_group_comm(range(0, half_size))
self.reader_comm = create_group_comm(
range(half_size, global_world_size))
self.trainer_reader_comm = create_group_comm(
[self.trainer_id, self.trainer_id + half_size])
self.global_comm = global_comm
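    # Rank layout: ranks [0, world_size/2) act as trainers and ranks
    # [world_size/2, world_size) as readers; trainer i is paired with reader
    # i + world_size/2 via trainer_reader_comm (local ranks 0 and 1).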
def init_args(self, args, dtype=np.int16):
self.args = args
self.files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f
]
self.files.sort()
self.fid_buf = np.array([1], dtype=np.int64)
with h5py.File(self.files[0], 'r') as f:
self.num_samples = np.array(f["next_sentence_labels"][:]).size
self.batch_size = args.train_batch_size
self.max_seq_length = args.max_seq_length
self.worker_seeds, self.shuffling_seeds = self._setup_seeds(
args.seed, args.num_epochs_to_generate_seeds_for)
self.epoch_idx = 0
data_buf_size = self.num_samples * 4 * self.max_seq_length + self.num_samples * 2
self.data_buf = np.empty(
shape=[self.trainer_num * data_buf_size], dtype=dtype)
self.eval_dir = args.eval_dir
self.num_eval_examples = args.num_eval_examples
self.eval_batch_size = args.eval_batch_size
cur_seed = self.worker_seeds[self.trainer_id]
np.random.seed(cur_seed)
random.seed(cur_seed)
paddle.seed(cur_seed)
self.worker_init = WorkerInitObj(cur_seed)
self.barrier()
def shuffle_files(self):
random.Random(self.shuffling_seeds[self.epoch_idx]).shuffle(self.files)
self.epoch_idx += 1
def _setup_seeds(self, master_seed, epochs):
if master_seed is None:
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if self.trainer_id == 0:
print('Using random master seed: {}'.format(master_seed))
else:
print('Using master seed from command line: {}'.format(master_seed))
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, self.trainer_num)
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
worker_seeds = broadcast_seeds(self.global_comm, worker_seeds)
shuffling_seeds = broadcast_seeds(self.global_comm, shuffling_seeds)
return worker_seeds, shuffling_seeds
def worker_seed(self):
return self.worker_seeds[self.trainer_id]
def barrier(self):
self.global_comm.barrier()
def stop_reader(self):
if self.is_trainer:
self.read_file(-1)
def file_num(self):
return len(self.files)
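    # read_file protocol: the trainer sends a file id to its paired reader and
    # blocks on Recv; each reader loads its HDF5 shard, Allgathers the raw
    # sample buffers across all readers, and sends the gathered buffer back to
    # its trainer, which converts it to batches via process_bert_inputs.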
def read_file(self, f_id=None):
if self.is_trainer:
self.fid_buf[0] = f_id
self.trainer_reader_comm.Isend(self.fid_buf, dest=1)
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return
self.trainer_reader_comm.Recv(self.data_buf, source=1)
results = process_bert_inputs(self.data_buf, self.num_samples,
self.max_seq_length, self.batch_size,
self.trainer_id, self.trainer_num)
return results
else:
            self.trainer_reader_comm.Recv(self.fid_buf, source=0)
f_id = self.fid_buf[0]
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return False
fname = select_dataset_file_for_each_worker(
self.files, f_id, self.trainer_num, self.trainer_id)
data = read_hdf5_file(fname, dtype=self.data_buf.dtype)
send_buf = np.concatenate([d.flatten() for d in data])
self.reader_comm.Allgather(send_buf, self.data_buf)
self.trainer_reader_comm.Send(self.data_buf, dest=0)
return True
def read_eval_file(self):
if self.is_trainer:
eval_data = []
for eval_file in sorted(os.listdir(self.eval_dir)):
eval_file_path = os.path.join(self.eval_dir, eval_file)
if os.path.isfile(eval_file_path) and 'part' in eval_file_path:
data = read_eval_hdf5_file(
eval_file_path, dtype=self.data_buf.dtype)
eval_data.extend(data)
if len(eval_data) > self.num_eval_examples:
break
chunk_size = self.num_eval_examples // self.trainer_num
rank = self.trainer_id
remainder = self.num_eval_examples % self.trainer_num
if rank < remainder:
eval_data = eval_data[(chunk_size + 1) * rank:(chunk_size + 1) *
(rank + 1)]
else:
eval_data = eval_data[chunk_size * rank + remainder:chunk_size *
(rank + 1) + remainder]
results = process_bert_eval_inputs(eval_data, self.max_seq_length,
self.eval_batch_size,
self.args.sort_eval_data)
return results
_context = Context()
def get_context():
return _context
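# A minimal driver sketch (an assumption for illustration, not part of this
# module): trainers iterate over file ids while readers loop until they
# receive a negative id.
#
#   ctx = get_context()
#   ctx.init_args(args)
#   if ctx.is_trainer:
#       for f_id in range(ctx.file_num()):
#           batches = ctx.read_file(f_id)  # blocks until the paired reader replies
#           # ... run training steps on `batches` ...
#       ctx.stop_reader()                  # tells the paired reader to exit
#   else:
#       while ctx.read_file():             # returns False once f_id < 0 arrives
#           pass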
#!/bin/bash
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export NCCL_P2P_LEVEL=5
#export use_hierarchical_allreduce=True
export num_process=16
if [[ $num_process -gt 1 ]]; then
    ORTERUN=$(which orterun)
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x UCX_IB_ADDR_TYPE=ib_global -x UCX_TLS=rc_x,rocm_copy -mca btl_tcp_if_exclude ib0 -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark.sh"
else
mpirun=""
fi
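# With num_process=16 and PADDLE_TRAINERS_NUM=8, mpirun launches 8 trainer
# ranks plus 8 reader ranks, matching the even world size asserted by the MPI
# data-reader module.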
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
echo "command is " $mpirun $CMD
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#mpirun -np 8 --allow-run-as-root --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON ./run_benchmark.sh
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert/"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
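# get_device_id maps this rank to an entry of CUDA_VISIBLE_DEVICES (falling
# back to the rank itself); it is currently unused because the device list is
# hard-coded below.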
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
#export FLAGS_check_nan_inf=1
#export PADDLE_INF_NAN_SKIP_OP="matmul_v2_grad,reshape2_grad,cast,elementwise_add_grad,reduce_mean_grad,softmax_with_cross_entropy_grad"
#export CUDA_LAUNCH_BLOCKING=1
#export FLAGS_benchmark=1
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=True
max_steps=7100
log_freq=50
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
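# Assumption: the CPU exchange-padding path reads raw fixed-length samples, so
# it forces DATA_DIR back to the uncompressed shards even when the varlength
# dataset was selected above.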
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# Select this local rank's device and bind the process to the matching NUMA node.
dev=$(expr $lrank % 8)
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$dev
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_$(expr $lrank % 4):50Gbs
numactl --cpunodebind=$dev --membind=$dev ${APP} >& $LOG_FILE
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> AccMergeInferShape(
const std::vector<int64_t> &acc, const std::vector<int64_t> &total) {
return {{2}, {2}};
}
std::vector<paddle::DataType> AccMergeInferDType(paddle::DataType acc,
paddle::DataType total) {
return {paddle::DataType::FLOAT64, paddle::DataType::INT64};
}
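// acc_merge contract (see the kernel file below): "Acc" is a scalar accuracy
// fraction and "Total" the sample count for the current evaluation chunk;
// "Out" is a double[2] = {correct, total} running sum and "Step" an int64[2]
// {current_step, accumulation_period} counter maintained on the CPU side.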
PD_BUILD_OP(acc_merge)
.Inputs({"Acc", "Total"})
.Outputs({"Out", "Step"})
.SetInferShapeFn(PD_INFER_SHAPE(AccMergeInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AccMergeInferDType));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
template <typename T, bool NeedAccumulate>
static __device__ __forceinline__ void AccMerge(T acc, T total, T *out) {
auto correct = static_cast<int64_t>(acc * total + 0.5);
if (NeedAccumulate) {
out[0] += correct;
out[1] += total;
} else {
out[0] = correct;
out[1] = total;
}
}
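// Rounding acc * total + 0.5 recovers the integer number of correct
// predictions from the fractional accuracy before it is either written or
// accumulated into the running {correct, total} pair.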
template <typename T1, typename T2, bool NeedAccumulate>
static __global__ void AccMergeKernelCPUTotal(const T1 *acc,
int64_t total,
T2 *out) {
AccMerge<T2, NeedAccumulate>(*acc, total, out);
}
template <typename T1, typename T2, bool NeedAccumulate>
static __global__ void AccMergeKernelGPUTotal(const T1 *acc,
const T1 *total,
T2 *out) {
AccMerge<T2, NeedAccumulate>(*acc, *total, out);
}
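// Step layout: step[1] is the accumulation period and step[0] cycles through
// 0..step[1]-1; the first launch of each cycle overwrites Out
// (NeedAccumulate = false) and later launches accumulate into it.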
__PD_DEFINE_RAW_OP_KERNEL_FUNC(acc_merge, ctx) {
namespace f = paddle::framework;
namespace p = paddle::platform;
auto &step_t = *ctx.Output<f::Tensor>("Step");
auto *step = step_t.data<int64_t>();
if (step[1] <= 0) return;
const auto &total_t = *ctx.Input<f::Tensor>("Total");
bool is_cpu_place = p::is_cpu_place(total_t.place());
using Type1 = float;
using Type2 = double;
const auto &acc_t = *ctx.Input<f::Tensor>("Acc");
auto *acc = acc_t.data<Type1>();
auto &out_t = *ctx.Output<f::Tensor>("Out");
out_t.Resize({2});
auto *out = out_t.mutable_data<Type2>(acc_t.place());
auto stream = ctx.cuda_device_context().stream();
if (step[0] == 0) {
if (is_cpu_place) {
AccMergeKernelCPUTotal<Type1, Type2, false><<<1, 1, 0, stream>>>(
acc, *total_t.data<int64_t>(), out);
} else {
AccMergeKernelGPUTotal<Type1, Type2, false><<<1, 1, 0, stream>>>(
acc, total_t.data<float>(), out);
}
} else {
if (is_cpu_place) {
AccMergeKernelCPUTotal<Type1, Type2, true><<<1, 1, 0, stream>>>(
acc, *total_t.data<int64_t>(), out);
} else {
AccMergeKernelGPUTotal<Type1, Type2, true><<<1, 1, 0, stream>>>(
acc, total_t.data<Type1>(), out);
}
}
step[0] = (step[0] + 1) % step[1];
}
{
"custom_setup_ops.so": "56632beeaffe91ec64ae156669db712a",
"define_macros": [
[
"PADDLE_WITH_HIP",
null
],
[
"PADDLE_WITH_RCCL",
null
]
],
"extra_compile_args": [
"-w"
],
"extra_link_args": [
"-l:core_avx.so",
"-lamdhip64"
],
"include_dirs": [
"/public/home/zhangqha/.conda/envs/hhenv/include/python3.6m",
"/public/home/zhangqha/.conda/envs/hhenv/lib/python3.6/site-packages/numpy/core/include",
"/public/home/zhangqha/dtk-21.04/hipcub/include",
"/public/home/zhangqha/dtk-21.04/hiprand/include",
"/public/home/zhangqha/dtk-21.04/hipsparse/include",
"/public/home/zhangqha/dtk-21.04/include",
"/public/home/zhangqha/dtk-21.04/miopen/include",
"/public/home/zhangqha/dtk-21.04/rccl/include",
"/public/home/zhangqha/dtk-21.04/rocblas/include",
"/public/home/zhangqha/dtk-21.04/rocfft/include",
"/public/home/zhangqha/dtk-21.04/rocprim/include",
"/public/home/zhangqha/dtk-21.04/rocrand/include",
"/public/home/zhangqha/dtk-21.04/rocsparse/include",
"/public/home/zhangqha/dtk-21.04/rocthrust/include",
"/public/home/zhangqha/for_baidu/Paddle-develop",
"/public/home/zhangqha/for_baidu/Paddle-develop/build",
"/public/home/zhangqha/for_baidu/Paddle-develop/build",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/paddle/fluid/platform",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/dlpack/src/extern_dlpack/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/eigen3/src/extern_eigen3",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/brpc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gflags/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/glog/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gloo/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gtest/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/leveldb/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/libmct/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/mklml/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/mklml/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/protobuf/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/rocksdb/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/snappy/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/utf8proc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/warpctc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include/xbyak",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xxhash/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xxhash/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/zlib/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pocketfft/src",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pybind/src/extern_pybind/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/threadpool/src/extern_threadpool",
"/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/framework/io",
"/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/platform",
"/public/home/zhangqha/for_baidu/Paddle-develop/patches/thrust",
"/usr/local/lib/python3.6/site-packages/paddle/include",
"/usr/local/lib/python3.6/site-packages/paddle/include/third_party",
"/opt/dtk-21.04/include"
],
"library_dirs": [
"/usr/local/lib/python3.6/site-packages/paddle/libs",
"/opt/dtk-21.04/lib",
"/usr/local/lib/python3.6/site-packages/paddle/fluid"
],
"runtime_library_dirs": [
"/usr/local/lib/python3.6/site-packages/paddle/libs",
"/opt/dtk-21.04/lib",
"/usr/local/lib/python3.6/site-packages/paddle/fluid"
],
"sources": [
"./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc",
"./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu",
"./lr_op/lr_op_cuda.cc",
"./lr_op/lr_op_cuda.cu",
"./acc_merge/acc_merge.cc",
"./acc_merge/acc_merge.cu"
],
"undef_macros": []
}
Metadata-Version: 1.0
Name: custom-setup-ops
Version: 0.0.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
setup.py
./acc_merge/acc_merge.cc
./acc_merge/acc_merge.cu
./lr_op/lr_op_cuda.cc
./lr_op/lr_op_cuda.cu
./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc
./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu
custom_setup_ops.egg-info/PKG-INFO
custom_setup_ops.egg-info/SOURCES.txt
custom_setup_ops.egg-info/dependency_links.txt
custom_setup_ops.egg-info/not-zip-safe
custom_setup_ops.egg-info/top_level.txt