Commit 581b8d15 authored by liangjing's avatar liangjing
Browse files

version 1

parents
Pipeline #169 failed with stages
in 0 seconds
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# (CMakeDirectoryInformation.cmake: per-directory settings consumed by the
# generated Makefiles; regenerated on every CMake configure run.)
# Relative path conversion top directories.
set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/public/home/zhangqha/bert/pybind")
set(CMAKE_RELATIVE_PATH_TOP_BINARY "/public/home/zhangqha/bert/pybind/build")
# Force unix paths in dependencies.
set(CMAKE_FORCE_UNIX_PATHS 1)
# The C and CXX include file regular expressions for this directory.
set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
# CXX reuses the C regexes so both languages scan includes identically.
set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# NOTE(review): recipe lines below are TAB-indented as make requires; the
# pasted copy had lost the leading tabs, which makes GNU make fail with
# "missing separator".  Content is otherwise unchanged.
# Default target executed when no arguments are given to make.
default_target: all
.PHONY : default_target
# Allow only one "make -f Makefile2" at a time, but pass parallelism.
.NOTPARALLEL:
#=============================================================================
# Special targets provided by cmake.
# Disable implicit rules so canonical targets will work.
.SUFFIXES:
# Remove some rules from gmake that .SUFFIXES does not remove.
SUFFIXES =
.SUFFIXES: .hpux_make_needs_suffix_list
# Suppress display of executed commands (active only when VERBOSE is unset).
$(VERBOSE).SILENT:
# A target that is always out of date.
cmake_force:
.PHONY : cmake_force
#=============================================================================
# Set environment variables for the build.
# The shell in which to execute make rules.
SHELL = /bin/sh
# The CMake executable.
CMAKE_COMMAND = /opt/cmake/bin/cmake
# The command to remove a file.
RM = /opt/cmake/bin/cmake -E remove -f
# Escaping for special characters.
EQUALS = =
# The top-level source directory on which CMake was run.
CMAKE_SOURCE_DIR = /public/home/zhangqha/bert/pybind
# The top-level build directory on which CMake was run.
CMAKE_BINARY_DIR = /public/home/zhangqha/bert/pybind/build
#=============================================================================
# Targets provided globally by CMake.
# Special rule for the target rebuild_cache
rebuild_cache:
	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
	/opt/cmake/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : rebuild_cache
# Special rule for the target rebuild_cache
rebuild_cache/fast: rebuild_cache
.PHONY : rebuild_cache/fast
# Special rule for the target edit_cache
edit_cache:
	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
	/opt/cmake/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : edit_cache
# Special rule for the target edit_cache
edit_cache/fast: edit_cache
.PHONY : edit_cache/fast
# The main all target
all: cmake_check_build_system
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -E cmake_progress_start /public/home/zhangqha/bert/pybind/build/CMakeFiles /public/home/zhangqha/bert/pybind/build/pybind11/CMakeFiles/progress.marks
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/all
	$(CMAKE_COMMAND) -E cmake_progress_start /public/home/zhangqha/bert/pybind/build/CMakeFiles 0
.PHONY : all
# The main clean target
clean:
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/clean
.PHONY : clean
# The main clean target
clean/fast: clean
.PHONY : clean/fast
# Prepare targets for installation.
preinstall: all
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/preinstall
.PHONY : preinstall
# Prepare targets for installation.
preinstall/fast:
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/preinstall
.PHONY : preinstall/fast
# clear depends
depend:
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
.PHONY : depend
# Help Target
help:
	@echo "The following are some of the valid targets for this Makefile:"
	@echo "... all (the default if no target is provided)"
	@echo "... clean"
	@echo "... depend"
	@echo "... rebuild_cache"
	@echo "... edit_cache"
.PHONY : help
#=============================================================================
# Special targets to cleanup operation of make.
# Special rule to run CMake to check the build system integrity.
# No rule that depends on this can have commands that come from listfiles
# because they might be regenerated.
cmake_check_build_system:
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
.PHONY : cmake_check_build_system
# Install script for directory: /public/home/zhangqha/bert/pybind/pybind11
# (CMake-generated cmake_install.cmake fragment: establishes the install
# prefix, configuration name, component, and permission defaults before the
# actual install() commands run.)
# Set the install prefix
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
set(CMAKE_INSTALL_PREFIX "/usr/local")
endif()
# Strip a trailing slash so later path concatenation is consistent.
string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
# Set the install configuration name.
if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
if(BUILD_TYPE)
# BUILD_TYPE may arrive with a leading separator (e.g. ".Release"); strip it.
string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
else()
set(CMAKE_INSTALL_CONFIG_NAME "Release")
endif()
message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
endif()
# Set the component getting installed.
if(NOT CMAKE_INSTALL_COMPONENT)
if(COMPONENT)
message(STATUS "Install component: \"${COMPONENT}\"")
set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
else()
set(CMAKE_INSTALL_COMPONENT)
endif()
endif()
# Install shared libraries without execute permission?
if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
set(CMAKE_INSTALL_SO_NO_EXE "0")
endif()
# Is this installation the result of a crosscompile?
if(NOT DEFINED CMAKE_CROSSCOMPILING)
set(CMAKE_CROSSCOMPILING "FALSE")
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import sys
import paddle.fluid.core as core
from paddle.utils.cpp_extension.extension_utils import _get_include_dirs_when_compiling

# Build script for the "functions" pybind11 extension: generates a
# CMakeLists.txt against the installed Paddle headers/library, symlinks the
# pybind11 sources out of the Paddle build tree (COMPILE_DIR), builds with
# CMake + make, and copies the resulting .so next to this script.

cur_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(cur_dir)

CMAKELISTS_TEMPLATE = '''
cmake_minimum_required(VERSION 3.4...3.18)
project(functions LANGUAGES CXX)
add_subdirectory(pybind11)
%s
%s
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -D__HIP_PLATFORM_HCC__=1 ")
set(extension_name "functions")
add_definitions("-DMLPERF_EXTENSION_NAME=${extension_name}")
pybind11_add_module(${extension_name} functions.cc)
target_link_libraries(${extension_name} PRIVATE %s)
'''


def _run(cmd):
    """Echo `cmd`, run it through the shell, and abort on failure.

    FIX: the original used ``assert os.system(cmd) == 0``; asserts are
    stripped under ``python -O``, which would let a failed compile pass
    silently.  Raise explicitly instead.
    """
    print(cmd)
    if os.system(cmd) != 0:
        raise RuntimeError("command failed: {}".format(cmd))


compile_dir = os.environ["COMPILE_DIR"]
dir_lists = _get_include_dirs_when_compiling(compile_dir)
dirs = ["include_directories({})".format(d) for d in dir_lists]

# Preprocessor macros to bake into the generated CMakeLists.  A value of
# None emits a bare -DNAME (no =value).
macros = {}
#if core.is_compiled_with_cuda():
#    macros['PADDLE_WITH_CUDA'] = None
#    macros['EIGEN_USE_GPU'] = None
if core.is_compiled_with_mkldnn():
    macros['PADDLE_WITH_MKLDNN'] = None
if core.is_compiled_with_nccl():
    # NOTE(review): both NCCL and RCCL macros are defined together here --
    # presumably the ROCm build reports its RCCL backend through this
    # check; confirm before porting to a CUDA build.
    macros['PADDLE_WITH_NCCL'] = None
    macros['PADDLE_WITH_RCCL'] = None
macros['EIGEN_USE_HIP'] = None
macros['THRUST_IGNORE_CUB_VERSION_CHECK'] = None
macros = "\n".join([
    'add_definitions(-D{}{})'.format(k, '=' + str(v) if v is not None else "")
    for k, v in macros.items()
])

# assumes the installed Paddle wheel ships fluid/core_avx.so -- TODO
# confirm for non-AVX builds (which ship core_noavx.so instead).
paddle_so = os.path.join(os.path.dirname(paddle.__file__), "fluid/core_avx.so")
cmakelists_context = CMAKELISTS_TEMPLATE % ("\n".join(dirs), macros, paddle_so)
with open("CMakeLists.txt", "w") as f:
    f.write(cmakelists_context)

cmake_args = {
    'CMAKE_BUILD_TYPE': 'Release',
    'PYBIND11_PYTHON_VERSION':
    '{}.{}'.format(sys.version_info.major, sys.version_info.minor),
}
_run("rm -rf pybind11 && ln -s {}/third_party/pybind/src/extern_pybind pybind11".format(
    compile_dir))
_run("rm -rf build && mkdir -p build && cd build && cmake .. {} && make -j `nproc`".format(
    " ".join(["-D{}={}".format(k, v) for k, v in cmake_args.items()])))

# Exactly one extension .so is expected in build/; copy it alongside this
# script so it can be imported directly.
so_file = [f for f in os.listdir('build') if f.endswith('.so')]
assert len(so_file) == 1
so_file = so_file[0]
_run("cp build/{} .".format(so_file))
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdint>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace framework = paddle::framework;
namespace platform = paddle::platform;
// Expose LoDTensorArray to Python as an opaque type (no automatic list
// conversion), so the arrays built below are passed by reference.
PYBIND11_MAKE_OPAQUE(framework::LoDTensorArray);
// Slot indices of the per-minibatch tensor array produced by the
// Process*Inputs functions below.
constexpr int kInputIdsIdx = 0;             // token ids, [bs, max_seq_length]
constexpr int kSegmentIdsIdx = 1;           // segment/type ids
constexpr int kInputMaskIdx = 2;            // attention mask (nonzero = valid)
constexpr int kMaskedLmLabelsIdx = 3;       // MLM labels (0 = not masked)
constexpr int kNextSentenceLabelsIdx = 4;   // NSP label per sample, [bs]
constexpr int kSeqLenIdx = 5;               // per-sample sequence length, [bs]
constexpr int kPrefixSumSeqLenIdx = 6;      // prefix sum of seq lens, [bs + 1]
constexpr int kNonZerosIndicesIdx = 7;      // flat indices of nonzero mask
constexpr int kMaskedLmIdsIdx = 8;          // labels at masked positions
constexpr int kMaskedLmPositionIdx = 9;     // flat positions of masked tokens
constexpr int kNumValidIdx = 10;            // scalar: count of masked tokens
constexpr int kNumTensors = 11;             // total slots per minibatch
// Re-batches BERT pretraining inputs that were all-gathered from
// `num_trainers` devices into per-minibatch tensor arrays for trainer
// `trainer_id` ("exchange padding").
//
// `array` is a flat 1-D buffer: num_trainers consecutive per-device
// segments of num_per_device elements each; a segment holds input_ids,
// segment_ids, input_mask and masked_lm_labels (each
// num_samples * max_seq_length values), then next_sentence_labels
// (num_samples) and seq_len (num_samples).
//
// For every minibatch, the samples of all devices are sorted by
// descending sequence length and dealt out round-robin; this trainer
// takes every num_trainers-th entry so all trainers receive similarly
// sized sequences.
//
// Returns a 2-element vector: [0] = per-minibatch arrays of the
// kNumTensors device-feed inputs; [1] = per-minibatch single-tensor
// arrays holding a host copy of the seq-len prefix sum.
template <typename T>
std::vector<std::vector<framework::LoDTensorArray>>
ProcessAllGatheredBERTInputs(
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    size_t num_samples,
    size_t max_seq_length,
    size_t batch_size,
    size_t trainer_id,
    size_t num_trainers) {
  using TensorT = framework::LoDTensor;
  PADDLE_ENFORCE_EQ(array.ndim(), 1);
  size_t length = array.shape()[0];
  const T *arr = array.data();
  // Everything below is pure C++; release the GIL for the duration.
  py::gil_scoped_release gil_release_guard;
  const size_t nbatch = (num_samples + batch_size - 1) / batch_size;
  // FIX: this scratch index buffer used to be T[]; with T = int16_t any
  // index >= 32768 would overflow and scramble the shuffle.  Indices are
  // positions, not payload, so store them as size_t.
  std::unique_ptr<size_t[]> seq_indices(new size_t[batch_size * num_trainers]);
  const size_t numel = num_samples * max_seq_length;
  const size_t num_per_device = numel * 4 + num_samples * 2;
  PADDLE_ENFORCE_EQ(num_per_device * num_trainers, length);
  // Allocation helpers for the three element types produced below.
  auto resize_and_alloc = [](TensorT *t, const framework::DDim &dim) -> T * {
    t->Resize(dim);
    return t->mutable_data<T>(platform::CPUPlace());
  };
  auto resize_and_alloc_int = [](TensorT *t,
                                 const framework::DDim &dim) -> int * {
    t->Resize(dim);
    return t->mutable_data<int>(platform::CPUPlace());
  };
  auto resize_and_alloc_float32 = [](TensorT *t,
                                     const framework::DDim &dim) -> float * {
    t->Resize(dim);
    return t->mutable_data<float>(platform::CPUPlace());
  };
  VLOG(10) << "num_samples = " << num_samples;
  VLOG(10) << "max_seq_length = " << max_seq_length;
  VLOG(10) << "batch_size = " << batch_size;
  VLOG(10) << "trainer_id = " << trainer_id;
  VLOG(10) << "num_trainers = " << num_trainers;
  VLOG(10) << "nbatch = " << nbatch;
  VLOG(10) << "length= " << length;
  std::vector<std::vector<framework::LoDTensorArray>> gpu_cpu_tensors;
  std::vector<framework::LoDTensorArray> tensors(nbatch);
  std::vector<framework::LoDTensorArray> tensors_2(nbatch);
  for (size_t i = 0; i < nbatch; ++i) {
    // The last minibatch may be short.
    const size_t cur_bs =
        std::min((i + 1) * batch_size, num_samples) - i * batch_size;
    VLOG(10) << "Mini batch " << i << " " << cur_bs;
    // Offset of this minibatch's seq_len entries within one device segment.
    const size_t seq_length_offset =
        num_samples * max_seq_length * 4 + num_samples + i * batch_size;
    const size_t total_seq_length = cur_bs * num_trainers;
    std::iota(seq_indices.get(),
              seq_indices.get() + total_seq_length,
              static_cast<size_t>(0));
    // Sort sample indices of ALL devices by descending sequence length.
    // idx % num_trainers selects the source device, idx / num_trainers
    // the row inside this minibatch on that device.
    std::sort(seq_indices.get(),
              seq_indices.get() + total_seq_length,
              [&](size_t idx1, size_t idx2) {
                size_t real_idx1 = (idx1 % num_trainers) * num_per_device +
                                   (idx1 / num_trainers) + seq_length_offset;
                size_t real_idx2 = (idx2 % num_trainers) * num_per_device +
                                   (idx2 / num_trainers) + seq_length_offset;
                return arr[real_idx1] > arr[real_idx2];
              });
    tensors[i].resize(kNumTensors);
    tensors_2[i].resize(1);
    auto *input_ids = resize_and_alloc(
        &tensors[i][kInputIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *segment_ids = resize_and_alloc(
        &tensors[i][kSegmentIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *input_mask = resize_and_alloc(
        &tensors[i][kInputMaskIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *masked_lm_labels = resize_and_alloc(
        &tensors[i][kMaskedLmLabelsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *next_sentence_labels = resize_and_alloc(
        &tensors[i][kNextSentenceLabelsIdx], {static_cast<int64_t>(cur_bs)});
    auto *seq_len = resize_and_alloc_int(&tensors[i][kSeqLenIdx],
                                         {static_cast<int64_t>(cur_bs)});
    auto *prefix_sum_seq_len = resize_and_alloc_int(
        &tensors[i][kPrefixSumSeqLenIdx], {static_cast<int64_t>(cur_bs + 1)});
    auto *num_valid = resize_and_alloc_float32(&tensors[i][kNumValidIdx],
                                               {static_cast<int64_t>(1)});
    // cpu tensor: host-side copy of the prefix sum for the data feeder.
    auto *host_prefix_sum_seq_len = resize_and_alloc_int(
        &tensors_2[i][0], {static_cast<int64_t>(cur_bs + 1)});
    prefix_sum_seq_len[0] = 0;
    int sum_seq_len = 0;
    // Deal sorted samples round-robin: this trainer takes entries
    // trainer_id, trainer_id + num_trainers, ...
    for (size_t j = 0; j < cur_bs; ++j) {
      const size_t idx = seq_indices.get()[j * num_trainers + trainer_id];
      const size_t dev_id = idx % num_trainers;
      const T *data = arr + dev_id * num_per_device;
      const size_t sample_id = idx / num_trainers + i * batch_size;
      std::memcpy(input_ids + j * max_seq_length,
                  data + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(segment_ids + j * max_seq_length,
                  data + numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(input_mask + j * max_seq_length,
                  data + 2 * numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(masked_lm_labels + j * max_seq_length,
                  data + 3 * numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      next_sentence_labels[j] = data[4 * numel + sample_id];
      seq_len[j] = data[4 * numel + num_samples + sample_id];
      sum_seq_len += seq_len[j];
      if (j > 0) {
        prefix_sum_seq_len[j] = prefix_sum_seq_len[j - 1] + seq_len[j - 1];
      }
    }
    prefix_sum_seq_len[cur_bs] =
        prefix_sum_seq_len[cur_bs - 1] + seq_len[cur_bs - 1];
    std::memcpy(host_prefix_sum_seq_len,
                prefix_sum_seq_len,
                sizeof(int) * (cur_bs + 1));
    PADDLE_ENFORCE_LE(sum_seq_len, cur_bs * max_seq_length);
    auto *nonzeros_indices = resize_and_alloc_int(
        &tensors[i][kNonZerosIndicesIdx], {static_cast<int64_t>(sum_seq_len)});
    // First pass: collect flat indices of valid (nonzero-mask) tokens and
    // count the masked-LM positions so their tensors can be sized exactly.
    int cur_nonzero_ind = 0;
    int cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (input_mask[ids] != 0) {
          nonzeros_indices[cur_nonzero_ind++] = static_cast<int>(ids);
        }
        if (masked_lm_labels[ids] != 0) {
          cur_num_valid += 1;
        }
      }
    }
    PADDLE_ENFORCE_EQ(cur_nonzero_ind, sum_seq_len);
    *num_valid = static_cast<float>(cur_num_valid);
    auto *masked_lm_ids = resize_and_alloc_int(
        &tensors[i][kMaskedLmIdsIdx], {static_cast<int64_t>(cur_num_valid)});
    auto *masked_lm_positions =
        resize_and_alloc_int(&tensors[i][kMaskedLmPositionIdx],
                             {static_cast<int64_t>(cur_num_valid)});
    // Second pass: fill positions and label ids of the masked tokens.
    cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (masked_lm_labels[ids] != 0) {
          masked_lm_positions[cur_num_valid] = ids;
          masked_lm_ids[cur_num_valid] = masked_lm_labels[ids];
          cur_num_valid += 1;
        }
      }
    }
  }
  gpu_cpu_tensors.push_back(tensors);
  gpu_cpu_tensors.push_back(tensors_2);
  return gpu_cpu_tensors;
}
// Splits a 2-D eval dataset into per-minibatch tensor arrays (single
// trainer, no exchange padding).  Each row of `array` is one sample of
// one_sample_len elements: input_ids, segment_ids, input_mask and
// masked_lm_labels (max_seq_length each), then next_sentence_label and
// seq_len.  When `need_sort` is set, samples are processed in ascending
// seq_len order (seq_len is read from the LAST element of each row).
// Returns {device_tensors, host_prefix_sum_tensors} like
// ProcessAllGatheredBERTInputs.
template <typename T>
std::vector<std::vector<framework::LoDTensorArray>> ProcessBERTEvalInputs(
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    size_t max_seq_length,
    size_t batch_size,
    bool need_sort) {
  using TensorT = framework::LoDTensor;
  PADDLE_ENFORCE_EQ(array.ndim(), 2);
  size_t num_samples = array.shape()[0];
  size_t one_sample_len = array.shape()[1];
  const T *arr = array.data();
  // Everything below is pure C++; release the GIL for the duration.
  py::gil_scoped_release gil_release_guard;
  std::unique_ptr<size_t[]> seq_indices;
  if (need_sort) {
    seq_indices.reset(new size_t[num_samples]);
    std::iota(seq_indices.get(),
              seq_indices.get() + num_samples,
              static_cast<size_t>(0));
    // Ascending order of each row's last element (the sequence length).
    std::sort(seq_indices.get(),
              seq_indices.get() + num_samples,
              [arr, one_sample_len](size_t idx1, size_t idx2) {
                idx1 = (idx1 + 1) * one_sample_len - 1;
                idx2 = (idx2 + 1) * one_sample_len - 1;
                return arr[idx1] < arr[idx2];
              });
  }
  const size_t nbatch = (num_samples + batch_size - 1) / batch_size;
  // Allocation helpers for the three element types produced below.
  auto resize_and_alloc = [](TensorT *t, const framework::DDim &dim) -> T * {
    t->Resize(dim);
    return t->mutable_data<T>(platform::CPUPlace());
  };
  auto resize_and_alloc_int = [](TensorT *t,
                                 const framework::DDim &dim) -> int * {
    t->Resize(dim);
    return t->mutable_data<int>(platform::CPUPlace());
  };
  auto resize_and_alloc_float32 = [](TensorT *t,
                                     const framework::DDim &dim) -> float * {
    t->Resize(dim);
    return t->mutable_data<float>(platform::CPUPlace());
  };
  VLOG(10) << "one_sample_len = " << one_sample_len;
  VLOG(10) << "num_samples = " << num_samples;
  VLOG(10) << "max_seq_length = " << max_seq_length;
  VLOG(10) << "batch_size = " << batch_size;
  VLOG(10) << "nbatch = " << nbatch;
  std::vector<std::vector<framework::LoDTensorArray>> gpu_cpu_tensors;
  std::vector<framework::LoDTensorArray> tensors(nbatch);
  std::vector<framework::LoDTensorArray> tensors_2(nbatch);
  for (size_t i = 0; i < nbatch; ++i) {
    // The last minibatch may be short.
    const size_t cur_bs =
        std::min((i + 1) * batch_size, num_samples) - i * batch_size;
    VLOG(10) << "Mini batch " << i << " " << cur_bs;
    tensors[i].resize(kNumTensors);
    tensors_2[i].resize(1);
    auto *input_ids = resize_and_alloc(
        &tensors[i][kInputIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *segment_ids = resize_and_alloc(
        &tensors[i][kSegmentIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *input_mask = resize_and_alloc(
        &tensors[i][kInputMaskIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *masked_lm_labels = resize_and_alloc(
        &tensors[i][kMaskedLmLabelsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *next_sentence_labels = resize_and_alloc(
        &tensors[i][kNextSentenceLabelsIdx], {static_cast<int64_t>(cur_bs)});
    auto *seq_len = resize_and_alloc_int(&tensors[i][kSeqLenIdx],
                                         {static_cast<int64_t>(cur_bs)});
    auto *prefix_sum_seq_len = resize_and_alloc_int(
        &tensors[i][kPrefixSumSeqLenIdx], {static_cast<int64_t>(cur_bs + 1)});
    auto *num_valid = resize_and_alloc_float32(&tensors[i][kNumValidIdx],
                                               {static_cast<int64_t>(1)});
    // cpu tensor: host-side copy of the prefix sum for the data feeder.
    auto *host_prefix_sum_seq_len = resize_and_alloc_int(
        &tensors_2[i][0], {static_cast<int64_t>(cur_bs + 1)});
    prefix_sum_seq_len[0] = 0;
    int sum_seq_len = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      const T *data = arr;
      size_t sample_id = j + i * batch_size;
      if (need_sort) sample_id = seq_indices.get()[sample_id];
      // Row layout: [ids | segment | mask | mlm_labels | nsp | seq_len].
      std::memcpy(input_ids + j * max_seq_length,
                  data + sample_id * one_sample_len,
                  max_seq_length * sizeof(T));
      std::memcpy(segment_ids + j * max_seq_length,
                  data + sample_id * one_sample_len + max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(input_mask + j * max_seq_length,
                  data + sample_id * one_sample_len + 2 * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(masked_lm_labels + j * max_seq_length,
                  data + sample_id * one_sample_len + 3 * max_seq_length,
                  max_seq_length * sizeof(T));
      next_sentence_labels[j] =
          data[sample_id * one_sample_len + 4 * max_seq_length];
      seq_len[j] = data[sample_id * one_sample_len + 4 * max_seq_length + 1];
      sum_seq_len += seq_len[j];
      if (j > 0) {
        prefix_sum_seq_len[j] = prefix_sum_seq_len[j - 1] + seq_len[j - 1];
      }
    }
    prefix_sum_seq_len[cur_bs] =
        prefix_sum_seq_len[cur_bs - 1] + seq_len[cur_bs - 1];
    std::memcpy(host_prefix_sum_seq_len,
                prefix_sum_seq_len,
                sizeof(int) * (cur_bs + 1));
    PADDLE_ENFORCE_LE(sum_seq_len, cur_bs * max_seq_length);
    auto *nonzeros_indices = resize_and_alloc_int(
        &tensors[i][kNonZerosIndicesIdx], {static_cast<int64_t>(sum_seq_len)});
    // First pass: collect flat indices of valid (nonzero-mask) tokens and
    // count masked-LM positions so their tensors can be sized exactly.
    int cur_nonzero_ind = 0;
    int cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (input_mask[ids] != 0) {
          nonzeros_indices[cur_nonzero_ind++] = static_cast<int>(ids);
        }
        if (masked_lm_labels[ids] != 0) {
          cur_num_valid += 1;
        }
      }
    }
    PADDLE_ENFORCE_EQ(cur_nonzero_ind, sum_seq_len);
    // Implicit int -> float conversion (the sibling training-path function
    // casts explicitly); same value either way.
    *num_valid = cur_num_valid;
    auto *masked_lm_ids = resize_and_alloc_int(
        &tensors[i][kMaskedLmIdsIdx], {static_cast<int64_t>(cur_num_valid)});
    auto *masked_lm_positions =
        resize_and_alloc_int(&tensors[i][kMaskedLmPositionIdx],
                             {static_cast<int64_t>(cur_num_valid)});
    // Second pass: fill positions and label ids of the masked tokens.
    cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (masked_lm_labels[ids] != 0) {
          masked_lm_positions[cur_num_valid] = ids;
          masked_lm_ids[cur_num_valid] = masked_lm_labels[ids];
          cur_num_valid += 1;
        }
      }
    }
  }
  gpu_cpu_tensors.push_back(tensors);
  gpu_cpu_tensors.push_back(tensors_2);
  return gpu_cpu_tensors;
}
// Module entry point; MLPERF_EXTENSION_NAME is injected by the generated
// CMakeLists.  Each function is registered for three integer dtypes.
// NOTE(review): since the arrays are declared with py::array::forcecast,
// the first registered overload (int16_t) may accept and silently cast
// arrays of any dtype -- verify the intended dispatch with callers.
PYBIND11_MODULE(MLPERF_EXTENSION_NAME, m) {
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int16_t>);
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int32_t>);
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int64_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int16_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int32_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int64_t>);
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Build the NCCL LD_PRELOAD wrapper and install the torch extension.
# FIX: the dirname command substitution was unquoted (`dirname "$0"` inside
# an unquoted backtick expansion) and would word-split on paths containing
# spaces; quote the whole substitution.
DIR=$(readlink -f "$(dirname "$0")")
g++ "$DIR/nccl.cc" -std=c++17 -fPIC -shared -o "$DIR/libnccl_wrapper.so" -I/usr/local/cuda/include -ldl -lnccl
# Build/install in a subshell so the caller's working directory is untouched.
(cd "$DIR" && rm -rf build && python3.8 setup.py install --force)
echo "Set the following env before run:"
echo ""
echo 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:'$DIR" LD_PRELOAD=$DIR/libnccl_wrapper.so"
echo ""
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Throws std::runtime_error carrying the failing expression text.  Used
// instead of assert() so the checks survive NDEBUG builds.
#define ASSERT_CHECK(__cond) \
do { \
if (!(__cond)) throw std::runtime_error(#__cond); \
} while (0)
#include "nccl.h" // NOLINT
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include "dlfcn.h" // NOLINT
// Sentinel reduction op: ncclProd is repurposed to mean "replace me with a
// freshly created PreMulSum op" (see NCCLPreMulSumInfo::CreateOrReturn).
// Assumes callers never issue a genuine ncclProd reduction -- confirm.
constexpr ncclRedOp_t UNUSED = ncclProd;
// Function-pointer types of the real NCCL symbols resolved via dlsym.
using AllReduceT = decltype(&ncclAllReduce);
using ReduceScatterT = decltype(&ncclReduceScatter);
using RedOpCreatePreMulSumT = decltype(&ncclRedOpCreatePreMulSum);
using RedOpDestroyT = decltype(&ncclRedOpDestroy);
// Path of the real NCCL shared object to dlopen: the NCCL_SO_PATH
// environment variable when set, otherwise the default "libnccl.so".
static std::string GetNCCLSoPath() {
  if (const char *override_path = std::getenv("NCCL_SO_PATH")) {
    return override_path;
  }
  return "libnccl.so";
}
struct NCCLHandle {
NCCLHandle() {
auto so_path = GetNCCLSoPath();
void *handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL);
this->ncclAllReduce =
reinterpret_cast<AllReduceT>(dlsym(handle, "ncclAllReduce"));
ASSERT_CHECK(this->ncclAllReduce != nullptr);
this->ncclReduceScatter =
reinterpret_cast<ReduceScatterT>(dlsym(handle, "ncclReduceScatter"));
ASSERT_CHECK(this->ncclReduceScatter != nullptr);
this->ncclRedOpCreatePreMulSum = reinterpret_cast<RedOpCreatePreMulSumT>(
dlsym(handle, "ncclRedOpCreatePreMulSum"));
ASSERT_CHECK(this->ncclRedOpCreatePreMulSum != nullptr);
this->ncclRedOpDestroy =
reinterpret_cast<RedOpDestroyT>(dlsym(handle, "ncclRedOpDestroy"));
ASSERT_CHECK(this->ncclRedOpDestroy != nullptr);
fprintf(stderr, "%s loaded successfully\n", so_path.c_str());
}
AllReduceT ncclAllReduce = nullptr;
ReduceScatterT ncclReduceScatter = nullptr;
RedOpCreatePreMulSumT ncclRedOpCreatePreMulSum = nullptr;
RedOpDestroyT ncclRedOpDestroy = nullptr;
} g_nccl_handle;
// Holds the scalar/dtype/residence parameters for a pending
// ncclRedOpCreatePreMulSum call.  InitNCCLPreMulSum() stashes them; the
// intercepted collectives then create the custom op on demand
// (CreateOrReturn) and free it after the collective is issued (Destroy).
// NOTE(review): a single global instance (g_info) -- assumes collectives
// are not issued concurrently from multiple threads; confirm with callers.
struct NCCLPreMulSumInfo {
  // Record the premul-sum scalar to apply on the next collective.
  void Init(const void *scalar,
            ncclDataType_t dtype,
            ncclScalarResidence_t residence) {
    scalar_ = const_cast<void *>(scalar);
    dtype_ = dtype;
    residence_ = residence;
  }
  // If `op` is the UNUSED sentinel, create a PreMulSum op for `comm` from
  // the recorded parameters and return it; otherwise pass `op` through.
  ncclRedOp_t CreateOrReturn(ncclRedOp_t op, ncclComm_t comm) {
    if (op != UNUSED) return op;
    ASSERT_CHECK(ncclSuccess ==
                 g_nccl_handle.ncclRedOpCreatePreMulSum(
                     &op_, scalar_, dtype_, residence_, comm));
    comm_ = comm;
    is_created_ = true;
    return op_;
  }
  // Release the op created by CreateOrReturn (no-op when none was created).
  void Destroy() {
    if (is_created_) {
      ASSERT_CHECK(ncclSuccess == g_nccl_handle.ncclRedOpDestroy(op_, comm_));
      op_ = UNUSED;
      comm_ = nullptr;
      is_created_ = false;
    }
  }
 private:
  ncclRedOp_t op_ = UNUSED;
  ncclComm_t comm_ = nullptr;
  bool is_created_ = false;  // true between CreateOrReturn and Destroy
  void *scalar_ = nullptr;   // premul scalar (host or device pointer)
  ncclDataType_t dtype_ = ncclFloat16;
  ncclScalarResidence_t residence_ = ncclScalarDevice;
} g_info;
// Exported with C linkage so these definitions interpose the real NCCL
// symbols when this library is LD_PRELOADed (see build.sh).
extern "C" {
// Called by the torch extension to register the scalar for the next
// PreMulSum reduction (see torch_ex.cc).
void InitNCCLPreMulSum(const void *scalar,
                       ncclDataType_t dtype,
                       ncclScalarResidence_t residence) {
  g_info.Init(scalar, dtype, residence);
}
// Interposed ncclAllReduce: swaps the UNUSED sentinel op for a freshly
// created PreMulSum op, forwards to the real NCCL, then destroys the op.
ncclResult_t ncclAllReduce(const void *sendbuff,
                           void *recvbuff,
                           size_t count,
                           ncclDataType_t datatype,
                           ncclRedOp_t op,
                           ncclComm_t comm,
                           cudaStream_t stream) {
  op = g_info.CreateOrReturn(op, comm);
  auto ret = g_nccl_handle.ncclAllReduce(
      sendbuff, recvbuff, count, datatype, op, comm, stream);
  // NOTE(review): the op is destroyed right after the (asynchronous)
  // collective is enqueued -- ncclRedOpDestroy is assumed safe here;
  // confirm against the NCCL documentation for the targeted version.
  g_info.Destroy();
  return ret;
}
// Interposed ncclReduceScatter: same sentinel-substitution scheme.
ncclResult_t ncclReduceScatter(const void *sendbuff,
                               void *recvbuff,
                               size_t recvcount,
                               ncclDataType_t datatype,
                               ncclRedOp_t op,
                               ncclComm_t comm,
                               cudaStream_t stream) {
  op = g_info.CreateOrReturn(op, comm);
  auto ret = g_nccl_handle.ncclReduceScatter(
      sendbuff, recvbuff, recvcount, datatype, op, comm, stream);
  g_info.Destroy();
  return ret;
}
}  // extern "C"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import os
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
from setuptools import setup, find_packages

# Builds the "torch_ex" CUDA extension from torch_ex.cc and links it
# against the libnccl_wrapper.so produced by build.sh.
# NOTE(review): torch, CppExtension, CUDA_HOME and find_packages are
# imported but not used below -- confirm before removing.

# Restrict compilation to compute capability 8.0 (A100); extend for other
# target GPUs.
os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0'
setup(
    name="torch_ex",
    version="0.1",
    description="PyTorch Extensions written by Baidu",
    ext_modules=[
        CUDAExtension(
            name="torch_ex",
            sources=['torch_ex.cc'],
            extra_compile_args={
                "cxx": [
                    "-O3", "-DVERSION_GE_1_1", "-DVERSION_GE_1_3",
                    "-DVERSION_GE_1_5", "-fPIC"
                ],
                "nvcc": [
                    "-O3", "-DVERSION_GE_1_1", "-DVERSION_GE_1_3",
                    "-DVERSION_GE_1_5", "-Xcompiler='-fPIC'"
                ],
            },
            # -lnccl_wrapper: requires libnccl_wrapper.so on the linker
            # search path (built alongside by build.sh).
            extra_link_args=['-fPIC', '-lnccl_wrapper'])
    ],
    cmdclass={"build_ext": BuildExtension})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/extension.h>
#include "nccl.h" // NOLINT
// Throws std::runtime_error carrying the failing expression text.  Used
// instead of assert() so the checks survive NDEBUG builds.
#define ASSERT_CHECK(__cond) \
do { \
if (!(__cond)) throw std::runtime_error(#__cond); \
} while (0)
// Provided by libnccl_wrapper.so (nccl.cc), which the run environment is
// expected to LD_PRELOAD; stores the PreMulSum scalar used by the
// intercepted NCCL collectives.
extern "C" {
extern void InitNCCLPreMulSum(const void *scalar,
                              ncclDataType_t dtype,
                              ncclScalarResidence_t residence);
}
// Registers tensor `t`'s data pointer as the scalar for the next NCCL
// PreMulSum reduction (forwarded to the preloaded wrapper library).
// Supports half/float/double tensors; any other dtype throws.
void InitNCCLPreMulSumByTensor(const at::Tensor &t) {
  const void *scalar = t.data_ptr();
  auto dtype = t.options().dtype();
  ncclDataType_t nccl_dtype;
  if (dtype == at::ScalarType::Half) {
    nccl_dtype = ncclFloat16;
  } else if (dtype == at::ScalarType::Float) {
    nccl_dtype = ncclFloat32;
  } else if (dtype == at::ScalarType::Double) {
    nccl_dtype = ncclFloat64;
  } else {
    ASSERT_CHECK(false);  // unsupported scalar dtype
  }
  // Device tensors pass a device pointer; host tensors use the
  // host-immediate residence mode.
  auto residence =
      (t.device().is_cuda() ? ncclScalarDevice : ncclScalarHostImmediate);
  InitNCCLPreMulSum(scalar, nccl_dtype, residence);
}
// Torch extension entry point: exposes make_nccl_premul_sum(tensor).
// The GIL is released around the call (it touches no Python state).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  namespace py = pybind11;
  m.def("make_nccl_premul_sum",
        &InitNCCLPreMulSumByTensor,
        py::call_guard<py::gil_scoped_release>());
}
# Bert介绍
## 应用领域:
自然语言理解大模型
## 目标精度
Mask-LM accuracy 达到0.72
## 模型基本参数设置
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
# 测试前准备
## 数据集准备
### progress bars in model download and training scripts
boto3==1.14.0
gdown==3.13.0
git+https://github.com/mlcommons/logging.git@2.0.0-rc2
h5py==2.10.0
html2text==2020.1.16
ipdb==0.13.2
nltk==3.5
onnxruntime==1.3.0
parameterized
progressbar==2.5
requests==2.23.0
six==1.15.0
tensorflow==2.2.0
数据预处理时尽量将所有采用库的版本号对齐,以免出现md5码不一致问题
参见bert目录下 README.md制作数据
## 环境部署
1、准备dtk 21.04环境
2、Mlperf bert文件夹内包含paddlepaddle_rocm-0.0.0-cp36-cp36m-linux_x86_64.whl
python3 -m pip install paddlepaddle_rocm-0.0.0-cp36-cp36m-linux_x86_64.whl
## 安装python依赖包
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
# 测试脚本
## 8卡打开exchange padding测试
cp rundir_8gpu_exchange/* .
sbatch run_sbatch.sh
## 1024卡大规模并发测试
cp rundir_8gpu_exchange/* .
sbatch run_sbatch.sh
输出结果见worker.*文件
# 优化测试结果整理
测试数据存放目录:result.log
## 扩展性测试
| GPU卡数 | 单卡batch_size | gradient_accumulation | 吞吐量(seq/s) | 并行效率 |
|-------|--------------|-----------------------|------------------|--------------------------|
| 4 | 4 | 1 | 36.69 | 100% |
| 8 | 4 | 1 | 65.7 | 89.53% |
| 1024 | 4 | 1 | 7723.38 | 82.23% |
| 1024 | 8 | 1 | 9362.93-9416.84 | 99.6%-100.25%(以单节点4卡为基准) |
## 性能优化测试
| GPU卡数 | 单卡batch_size | gradient_accumulation_steps | global batch size | |  混精度 | gemm优化 | softmax+softmax_cross_entropy  | distributed_fused_lamb | GeLU近似算法 | exchange padding | 收敛global_steps | walltime(s) |
|-------|--------------|-----------------------------|-------------------|------|---------------|--------------|--------------------------------|------------------------|---------------|----------------------|----------------|-------------|
| 8 | 4 | 14 | 448 | 优化前: | | 51.26seq/s | 85.3seq/s | 89.59seq/s | | 6697 (global steps) | 6697 | 32522.67 |
| | | | | 优化后: | 91.92seq/s  | 85.3seq/s | 89.59seq/s | 91.92seq/s  | 91.92seq/s  |  5692 (global steps) | 5692 | |
| 1024 | 4 | 1 | 4096 | 优化前: | 4458.04seq/s | | 7461seq/s | 5174.44seq/s | 7353.08seq/s | 必须off | 684 | 369.325 |
| | | | | 优化后: | 7723.38seq/s | 5174.44seq/s | 7723.38seq/s | 7461seq/s | 7723.38seq/s | | | |
| 1024 | 8 | 2 | 16384 | 优化前: |  --- | | 10634seq/s | 9083seq/s | | 必须off | 794 | 580.618 |
| | | | | 优化后: | 11330.07seq/s | 9083seq/s | 11330.07seq/s | 10634seq/s | 11330.07seq/s |
jieba
h5py
colorlog
colorama
seqeval
multiprocess
mpi4py
paddlenlp
git+https://github.com/mlperf/logging.git@2.0.0-rc1
#!/bin/bash
# Single-GPU benchmark launcher: stages the 1-GPU init_env.py and runs the
# benchmark as one MPI rank.
cp rundir_1gpu/init_env.py .
export PADDLE_TRAINERS_NUM=1
export PADDLE_TRAINER_ENDPOINTS=localhost:60001
# A fixed seed may be supplied via SEED; otherwise a random one is chosen.
export SEED=${SEED:-"$RANDOM"}
mpirun -np 1 --allow-run-as-root -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS ./run_benchmark_1gpu.sh
#!/bin/bash
# 8-GPU launcher: stage the run-specific environment module, then start the
# benchmark under MPI with the variables below forwarded to every rank.
cp rundir_8gpu/init_env.py .
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
# One endpoint per trainer process (8 trainers on localhost).
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
# Keep a caller-provided SEED; otherwise pick a random one for this run.
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
# 16 MPI ranks total: ranks >= PADDLE_TRAINERS_NUM act as reader processes
# (see the IS_TRAINER/IS_READER split inside run_benchmark_8gpu.sh).
export num_process=16
if [[ $num_process -gt 1 ]]; then
# NOTE: a dead `ORTERUN=`which orterun`` assignment was removed here;
# the variable was never referenced anywhere in this script.
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark_8gpu.sh"
else
mpirun=""
fi
# NOTE(review): $CMD is never assigned in this script -- presumably it is
# meant to come from the caller's environment (and is usually empty). Confirm.
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#!/bin/bash
# 8-GPU FP32 launcher: identical to the mixed-precision launcher except that
# it invokes run_benchmark_8gpu_fp32.sh. Variables below are forwarded to
# every MPI rank with -x.
cp rundir_8gpu/init_env.py .
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
# One endpoint per trainer process (8 trainers on localhost).
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
# Keep a caller-provided SEED; otherwise pick a random one for this run.
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
# 16 MPI ranks total: ranks >= PADDLE_TRAINERS_NUM act as reader processes
# (see the IS_TRAINER/IS_READER split inside run_benchmark_8gpu_fp32.sh).
export num_process=16
if [[ $num_process -gt 1 ]]; then
# NOTE: a dead `ORTERUN=`which orterun`` assignment was removed here;
# the variable was never referenced anywhere in this script.
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark_8gpu_fp32.sh"
else
mpirun=""
fi
# NOTE(review): $CMD is never assigned in this script -- presumably it is
# meant to come from the caller's environment (and is usually empty). Confirm.
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- Single-GPU BERT pretraining benchmark: environment and data setup ---
set -ex
export FLAGS_rocm_dir=/opt/dtk-21.04
export FLAGS_max_inplace_grad_add=2
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-"localhost:60045"}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called in this script, and
# the Python in the heredoc appears to have lost its indentation in this
# copy of the file -- confirm against the original source.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
# --- Hyperparameters and feature toggles for the 1-GPU run ---
batch_size=4
eval_batch_size=9
#use_amp=True
#use_pure_fp16=True
use_amp=True
use_pure_fp16=False
max_steps=820
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain.py below -- confirm whether they are read via
# the environment or are simply dead.
eval_iter_start_samples=175000
eval_iter_samples=175000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
###fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination.
# NOTE(review): LOG_FILE is unused here because the redirecting launch line
# at the bottom is commented out.
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.00255 \
--weight_decay 0.0 \
--lamb_epsilon 1e-06 \
--start_warmup_step -76 \
--warmup_proportion 0.0 \
--warmup_steps 256 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 1 \
--opt_lamb_beta_1 0.71 \
--opt_lamb_beta_2 0.88 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
#Run experiments
#python3 -u $BERT_CMD >& $LOG_FILE
python3 -u $BERT_CMD
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- 8-GPU BERT pretraining benchmark (mixed precision): env and command ---
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called (its call site is
# commented out below), and the Python in the heredoc appears to have lost
# its indentation in this copy of the file -- confirm against the original.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
# --- Hyperparameters and feature toggles for the 8-GPU run ---
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=True
max_steps=7100
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain.py below -- confirm whether they are read via
# the environment or are simply dead.
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination (used by the numactl launch below).
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# ---------------------------------------------------------------------------
# NUMA/DCU binding for this local MPI rank.
# The original code spelled out eight identical `case` branches whose only
# real difference was the branch index itself; compute the index once
# instead. Behavior is unchanged for local ranks 0-7. As a robustness
# improvement, an unset lrank now fails fast via ${lrank:?} instead of
# silently matching no branch and running nothing.
# ---------------------------------------------------------------------------
bind_id=$((${lrank:?OMPI_COMM_WORLD_LOCAL_RANK not set} % 8))
# NOTE(review): the `% 4` in this message looks like a leftover from a
# 4-device-per-node setup; the output text is kept byte-identical.
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
# All 8 devices stay visible; each rank selects its own via
# FLAGS_selected_gpus.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$bind_id
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_0:50Gbs
# Pin CPU and memory allocation to the NUMA node matching the device,
# redirecting stdout+stderr to this rank's log file.
numactl --cpunodebind=$bind_id --membind=$bind_id ${APP} >& $LOG_FILE
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- 8-GPU BERT pretraining benchmark (FP32 variant): env and command ---
# Differs from the mixed-precision script in use_pure_fp16=False and the
# run_pretrain_fp32.py entry point.
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called (its call site is
# commented out below), and the Python in the heredoc appears to have lost
# its indentation in this copy of the file -- confirm against the original.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
# --- Hyperparameters and feature toggles for the 8-GPU FP32 run ---
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=False
max_steps=7100
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain_fp32.py below -- confirm whether they are
# read via the environment or are simply dead.
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination (used by the numactl launch below).
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the FP32 pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain_fp32.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# ---------------------------------------------------------------------------
# NUMA/DCU binding for this local MPI rank (FP32 variant).
# The original code spelled out eight identical `case` branches whose only
# real difference was the branch index itself; compute the index once
# instead. Behavior is unchanged for local ranks 0-7. As a robustness
# improvement, an unset lrank now fails fast via ${lrank:?} instead of
# silently matching no branch and running nothing.
# ---------------------------------------------------------------------------
bind_id=$((${lrank:?OMPI_COMM_WORLD_LOCAL_RANK not set} % 8))
# NOTE(review): the `% 4` in this message looks like a leftover from a
# 4-device-per-node setup; the output text is kept byte-identical.
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
# All 8 devices stay visible; each rank selects its own via
# FLAGS_selected_gpus.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$bind_id
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_0:50Gbs
# Pin CPU and memory allocation to the NUMA node matching the device,
# redirecting stdout+stderr to this rank's log file.
numactl --cpunodebind=$bind_id --membind=$bind_id ${APP} >& $LOG_FILE
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Containerized-run driver: builds the image when STAGE=build, otherwise
# launches the benchmark inside Docker.
export BASE_DATA_DIR="/home/users/mlperf-workspace/bert_data"
# Taken from the SECOND positional argument, defaulting to 400.
# NOTE(review): $1 is not referenced anywhere in this snippet -- confirm the
# caller's argument convention.
export NEXP="${2:-"400"}"
export STAGE=${STAGE:-"run"}
export CONT="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test"
export CLEAR_CACHES=1
if [[ $STAGE == "build" ]]; then
bash -ex Dockerfiles/build_with_pip_install_whl.sh
else
# rm -rf results
bash -ex run_with_docker.sh
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment