Commit 581b8d15 authored by liangjing

version 1
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#define CHECK_GPU_INPUT(__x) \
do { \
if (__x.place().GetType() != phi::AllocationType::GPU) { \
PD_THROW(#__x " must be GPU Tensor."); \
} \
} while (0)
//#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices);
#endif
static std::vector<paddle::Tensor> SortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
//#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
CHECK_GPU_INPUT(input_ids);
CHECK_GPU_INPUT(segment_ids);
CHECK_GPU_INPUT(input_mask);
CHECK_GPU_INPUT(masked_lm_labels);
CHECK_GPU_INPUT(next_sentence_labels);
return GPUSortBERTInputsAcrossDevices(input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
max_batch_size,
ring_id,
device_id,
num_devices);
#else
  PD_THROW("sort_bert_inputs_across_devices does not support CPU yet.");
#endif
}
static std::vector<std::vector<int64_t>> SortBERTInputsAcrossDevicesInferShape(
const std::vector<int64_t> &input_ids_shape,
const std::vector<int64_t> &segment_ids_shape,
const std::vector<int64_t> &input_mask_shape,
const std::vector<int64_t> &masked_lm_labels_shape,
const std::vector<int64_t> &next_sentence_labels_shape,
const int &max_batch_size,
const int &ring_id,
const int &device_id,
const int &num_devices) {
return {input_ids_shape,
segment_ids_shape,
input_mask_shape,
masked_lm_labels_shape,
next_sentence_labels_shape};
}
static std::vector<paddle::DataType> SortBERTInputsAcrossDevicesInferDType(
paddle::DataType input_ids_dtype,
paddle::DataType segment_ids_dtype,
paddle::DataType input_mask_dtype,
paddle::DataType masked_lm_labels_dtype,
paddle::DataType next_sentence_labels_dtype) {
return {input_ids_dtype,
segment_ids_dtype,
input_mask_dtype,
masked_lm_labels_dtype,
next_sentence_labels_dtype};
}
PD_BUILD_OP(sort_bert_inputs_across_devices)
.Inputs({"InputIds",
"SegmentIds",
"InputMask",
"MaskedLMLabels",
"NextSentenceLabels"})
.Outputs({"InputIdsOut",
"SegmentIdsOut",
"InputMaskOut",
"MaskedLMLabelsOut",
"NextSentenceLabelsOut"})
.Attrs({"max_batch_size: int",
"ring_id: int",
"device_id: int",
"num_devices: int"})
.SetKernelFn(PD_KERNEL(SortBERTInputsAcrossDevices))
.SetInferShapeFn(PD_INFER_SHAPE(SortBERTInputsAcrossDevicesInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(SortBERTInputsAcrossDevicesInferDType));
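// After the custom op library is built, Python loads this op as
// `from custom_setup_ops import sort_bert_inputs_across_devices` (see the
// accompanying test script) and calls it with the five input tensors plus the
// max_batch_size, ring_id, device_id and num_devices attributes; it returns
// the five reordered tensors.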
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdint>
#include <iostream>
#include <sstream>
#include "glog/logging.h"
#include "paddle/extension.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/for_range.h"
namespace framework = paddle::framework;
namespace operators = paddle::operators;
namespace platform = paddle::platform;
namespace memory = paddle::memory;
namespace kps = paddle::operators::kernel_primitives;
template <typename T>
struct NCCLDataTypeTrait;
#define DEFINE_NCCL_DTYPE_TRAIT(__cpp_type, __nccl_dtype) \
template <> \
struct NCCLDataTypeTrait<__cpp_type> { \
static constexpr ncclDataType_t DataType = __nccl_dtype; \
}
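// NOTE: NCCL/RCCL has no 16-bit integer type, so int16_t reuses ncclFloat16
// below. This is safe here because ncclAllGather only moves raw bytes and
// both types are 2 bytes wide.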
DEFINE_NCCL_DTYPE_TRAIT(int16_t, ncclFloat16);
DEFINE_NCCL_DTYPE_TRAIT(int32_t, ncclInt32);
DEFINE_NCCL_DTYPE_TRAIT(int64_t, ncclInt64);
template <typename T>
static std::string GPUTensorToString(const T *ptr, size_t numel) {
platform::CUDAPlace place(platform::GetCurrentDeviceId());
auto &dev_ctx = *platform::DeviceContextPool::Instance().GetByPlace(place);
auto stream = dev_ctx.stream();
std::vector<T> cpu_data(numel);
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
cpu_data.data(), ptr, sizeof(T) * numel, hipMemcpyDeviceToHost, stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
std::stringstream ss;
ss << "[";
for (decltype(numel) i = 0; i < numel; ++i) {
if (i > 0) ss << ",";
ss << cpu_data[i];
}
ss << "]";
return ss.str();
}
template <typename T>
static std::string GPUTensorToString(const paddle::Tensor &t) {
return GPUTensorToString<T>(t.data<T>(), t.numel());
}
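// Pads tensor `x` along the batch dimension from its current batch size up to
// max_batch_size, filling the extra rows with zeros.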
template <typename T>
static paddle::Tensor PadTensor(const paddle::Tensor &x,
int max_batch_size,
hipStream_t stream) {
#define CALL_PAD_ZERO_DATA \
do { \
T *y_data = y.mutable_data<T>(x.place()); \
const T *x_data = x.data<T>(); \
int n = batch_size * seq_len; \
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( \
y_data, x_data, n * sizeof(T), hipMemcpyDeviceToDevice, stream)); \
PADDLE_ENFORCE_GPU_SUCCESS( \
hipMemsetAsync(y_data + n, \
0, \
(max_batch_size - batch_size) * seq_len * sizeof(T), \
stream)); \
return y; \
} while (0)
const auto &x_dim = x.dims();
int batch_size = x_dim[0];
if (x_dim.size() == 2) {
int seq_len = x_dim[1];
paddle::Tensor y(x.place(), {max_batch_size, seq_len});
CALL_PAD_ZERO_DATA;
} else {
PADDLE_ENFORCE_EQ(x_dim.size(), 1);
int seq_len = 1;
paddle::Tensor y(x.place(), {max_batch_size});
CALL_PAD_ZERO_DATA;
}
}
template <typename T>
struct RetrieveAllGatheredInputMaskFunctor {
RetrieveAllGatheredInputMaskFunctor(
const T *x, T *y, int fused_numel, int each_mask_numel, int mask_offset)
: x_(x),
y_(y),
fused_numel_(fused_numel),
each_mask_numel_(each_mask_numel),
mask_offset_(mask_offset) {}
HOSTDEVICE void operator()(int idx) const {
int rank = idx / each_mask_numel_;
int offset = mask_offset_ + idx % each_mask_numel_;
int x_idx = rank * fused_numel_ + offset;
y_[idx] = x_[x_idx];
}
private:
const T *x_;
T *y_;
int fused_numel_;
int each_mask_numel_;
int mask_offset_;
};
template <typename T>
struct IotaFunctor {
explicit IotaFunctor(T *x) : x_(x) {}
HOSTDEVICE void operator()(int idx) const { x_[idx] = static_cast<T>(idx); }
private:
T *x_;
};
template <typename T, typename IndexT>
struct IsNonZeroFunctor {
HOSTDEVICE IndexT operator()(T x) const {
return static_cast<IndexT>(x != 0);
}
};
template <typename T, typename IndexT>
struct ReorderBERTInputTensorsFunctor {
public:
ReorderBERTInputTensorsFunctor(const T *fused_inputs,
const IndexT *indices,
int device_id,
int num_devices,
int max_batch_size,
int seq_len,
T *input_ids_out,
T *segment_ids_out,
T *input_mask_out,
T *masked_lm_labels_out,
T *next_sentence_labels_out)
: fused_inputs_(fused_inputs),
indices_(indices),
device_id_(device_id),
num_devices_(num_devices),
max_batch_size_(max_batch_size),
seq_len_(seq_len),
input_ids_out_(input_ids_out),
segment_ids_out_(segment_ids_out),
input_mask_out_(input_mask_out),
masked_lm_labels_out_(masked_lm_labels_out),
next_sentence_labels_out_(next_sentence_labels_out) {}
// idx is in range [0, new_bs * seq_len)
HOSTDEVICE void operator()(int idx) const {
int out_idx_i = idx / seq_len_;
int seq_len_idx = idx % seq_len_;
auto index_per_device = indices_[device_id_ + out_idx_i * num_devices_];
auto gpu_idx = index_per_device / max_batch_size_;
auto bs_idx = index_per_device % max_batch_size_;
// [gpu_idx, bs_idx, seq_len_idx]
int device_stride = 4 * max_batch_size_ * seq_len_ + max_batch_size_;
int tensor_offset = max_batch_size_ * seq_len_;
int device_offset = gpu_idx * device_stride;
int offset = device_offset + bs_idx * seq_len_ + seq_len_idx;
input_ids_out_[idx] = fused_inputs_[offset];
segment_ids_out_[idx] = fused_inputs_[offset + tensor_offset];
input_mask_out_[idx] = fused_inputs_[offset + 2 * tensor_offset];
masked_lm_labels_out_[idx] = fused_inputs_[offset + 3 * tensor_offset];
if (seq_len_idx == 0) {
next_sentence_labels_out_[out_idx_i] =
fused_inputs_[device_offset + 4 * max_batch_size_ * seq_len_ +
bs_idx];
}
}
private:
const T *fused_inputs_;
const IndexT *indices_;
int device_id_;
int num_devices_;
int max_batch_size_;
int seq_len_;
T *input_ids_out_;
T *segment_ids_out_;
T *input_mask_out_;
T *masked_lm_labels_out_;
T *next_sentence_labels_out_;
};
/**
* input_ids: shape: [bs, seq_len], range: [0, vocab_size)
* segment_ids: shape: [bs, seq_len], range: {0, 1}
* input_mask: shape: [bs, seq_len], range: {0, 1}
* masked_lm_labels: shape: [bs, seq_len], range: [0, vocab_size)
 * next_sentence_labels: shape: [bs] or [bs, 1], range: {0, 1}
*/
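// Per device, the five tensors are packed into one contiguous buffer of
// 4 * max_batch_size * seq_len + max_batch_size elements, in the order
// input_ids | segment_ids | input_mask | masked_lm_labels |
// next_sentence_labels; this is the layout that ReorderBERTInputTensorsFunctor
// (see `device_stride` above) indexes into after the allgather.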
template <typename T>
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevicesWithDType(
paddle::Tensor input_ids,
paddle::Tensor segment_ids,
paddle::Tensor input_mask,
paddle::Tensor masked_lm_labels,
paddle::Tensor next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
const auto &dim = input_ids.dims();
int batch_size = dim[0];
int seq_len = dim[1];
PADDLE_ENFORCE_EQ(dim.size(), 2);
PADDLE_ENFORCE_LE(batch_size, max_batch_size);
PADDLE_ENFORCE_EQ(dim, segment_ids.dims());
PADDLE_ENFORCE_EQ(dim, input_mask.dims());
PADDLE_ENFORCE_EQ(dim, masked_lm_labels.dims());
const auto &nsl_dim = next_sentence_labels.dims();
if (nsl_dim.size() == 2) {
PADDLE_ENFORCE_EQ(nsl_dim[0], batch_size);
PADDLE_ENFORCE_EQ(nsl_dim[1], 1);
} else if (nsl_dim.size() == 1) {
PADDLE_ENFORCE_EQ(nsl_dim[0], batch_size);
} else {
PADDLE_THROW("invalid next_sentence_labels rank, should be 1 or 2.");
}
bool need_pad = (batch_size < max_batch_size);
// NOTE: device_id may be different from platform::GetCurrentDeviceId()!
platform::CUDAPlace place(platform::GetCurrentDeviceId());
auto &dev_ctx = *platform::DeviceContextPool::Instance().GetByPlace(place);
auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place)->comm();
auto stream = dev_ctx.stream();
// Step 1: pad to max_batch_size
if (need_pad) {
input_ids = PadTensor<T>(input_ids, max_batch_size, stream);
segment_ids = PadTensor<T>(segment_ids, max_batch_size, stream);
input_mask = PadTensor<T>(input_mask, max_batch_size, stream);
masked_lm_labels = PadTensor<T>(masked_lm_labels, max_batch_size, stream);
next_sentence_labels =
PadTensor<T>(next_sentence_labels, max_batch_size, stream);
}
VLOG(10) << "input_ids = " << GPUTensorToString<T>(input_ids);
VLOG(10) << "segment_ids = " << GPUTensorToString<T>(segment_ids);
VLOG(10) << "input_mask = " << GPUTensorToString<T>(input_mask);
VLOG(10) << "masked_lm_labels = " << GPUTensorToString<T>(masked_lm_labels);
VLOG(10) << "next_sentence_labels = "
<< GPUTensorToString<T>(next_sentence_labels);
  // Step 2: fuse into a contiguous buffer
int n = max_batch_size * seq_len;
int numel = 4 * n + max_batch_size;
auto buffer = memory::Alloc(place, numel * sizeof(T));
T *buf_ptr = reinterpret_cast<T *>(buffer->ptr());
const T *input_ids_ptr =
static_cast<const paddle::Tensor &>(input_ids).data<T>();
const T *segment_ids_ptr =
static_cast<const paddle::Tensor &>(segment_ids).data<T>();
const T *input_mask_ptr =
static_cast<const paddle::Tensor &>(input_mask).data<T>();
const T *masked_lm_labels_ptr =
static_cast<const paddle::Tensor &>(masked_lm_labels).data<T>();
const T *next_sentence_labels_ptr =
static_cast<const paddle::Tensor &>(next_sentence_labels).data<T>();
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
buf_ptr, input_ids_ptr, n * sizeof(T), hipMemcpyDeviceToDevice, stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + n,
segment_ids_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 2 * n,
input_mask_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 3 * n,
masked_lm_labels_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 4 * n,
next_sentence_labels_ptr,
max_batch_size * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
VLOG(10) << "fused input = " << GPUTensorToString<T>(buf_ptr, numel);
// Step 3: allgather
auto allgather_buffer = memory::Alloc(place, numel * num_devices * sizeof(T));
T *allgather_buf_ptr = reinterpret_cast<T *>(allgather_buffer->ptr());
auto nccl_dtype = NCCLDataTypeTrait<T>::DataType;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
buf_ptr, allgather_buf_ptr, numel, nccl_dtype, comm, stream));
buffer = nullptr;
VLOG(10) << "allgather fused input = "
<< GPUTensorToString<T>(allgather_buf_ptr, numel * num_devices);
// Step 4: sort by seq_len
framework::Tensor allgather_input_mask;
int gbs = num_devices * max_batch_size;
allgather_input_mask.Resize({gbs, seq_len});
T *allgather_input_mask_ptr = allgather_input_mask.mutable_data<T>(place);
platform::ForRange<platform::CUDADeviceContext> retrieve_mask_for_range(
dev_ctx, num_devices * n);
int input_mask_offset = 2 * n;
retrieve_mask_for_range(
RetrieveAllGatheredInputMaskFunctor<T>(allgather_buf_ptr,
allgather_input_mask_ptr,
numel,
n,
input_mask_offset));
VLOG(10) << "allgather mask = "
<< GPUTensorToString<T>(allgather_input_mask_ptr, gbs * seq_len);
framework::Tensor allgather_seq_len;
allgather_seq_len.Resize({gbs});
allgather_seq_len.mutable_data<T>(place);
operators::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx,
allgather_input_mask,
&allgather_seq_len,
kps::IdentityFunctor<T>(),
{1},
stream);
VLOG(10) << "allgather seq_len = "
<< GPUTensorToString<T>(allgather_seq_len.data<T>(), gbs);
using IndexT = int;
auto indices = memory::Alloc(place, gbs * sizeof(IndexT));
auto *indices_ptr = reinterpret_cast<IndexT *>(indices->ptr());
  platform::ForRange<platform::CUDADeviceContext> iota_for_range(dev_ctx, gbs);
  iota_for_range(IotaFunctor<IndexT>(indices_ptr));
auto sorted_indices = memory::Alloc(place, gbs * sizeof(IndexT));
auto *sorted_indices_ptr = reinterpret_cast<IndexT *>(sorted_indices->ptr());
auto *allgather_seq_len_ptr = allgather_seq_len.data<T>();
auto sorted_allgather_seq_len = memory::Alloc(place, gbs * sizeof(T));
auto *sorted_allgather_seq_len_ptr =
reinterpret_cast<T *>(sorted_allgather_seq_len->ptr());
memory::AllocationPtr tmp_storage;
void *tmp_storage_ptr;
size_t tmp_storage_size = 0;
for (int i = 0; i < 2; ++i) {
if (tmp_storage_size > 0 && tmp_storage == nullptr) {
tmp_storage = memory::Alloc(place, tmp_storage_size);
}
tmp_storage_ptr = tmp_storage ? tmp_storage->ptr() : nullptr;
PADDLE_ENFORCE_GPU_SUCCESS(
hipcub::DeviceRadixSort::SortPairsDescending(tmp_storage_ptr,
tmp_storage_size,
allgather_seq_len_ptr,
sorted_allgather_seq_len_ptr,
indices_ptr,
sorted_indices_ptr,
gbs,
0,
sizeof(T) * 8,
stream));
}
VLOG(10) << "allgather sorted seq_len = "
<< GPUTensorToString<T>(sorted_allgather_seq_len_ptr, gbs);
VLOG(10) << "allgather sorted indices = "
<< GPUTensorToString<IndexT>(sorted_indices_ptr, gbs);
// Step 5: reorder inputs
memory::AllocationPtr ntokens_alloc;
IndexT ntokens;
if (need_pad) {
    // count the number of valid (non-padded) samples across all devices
ntokens_alloc = memory::Alloc(place, sizeof(IndexT));
auto *ntokens_ptr = reinterpret_cast<IndexT *>(ntokens_alloc->ptr());
using CubIterator =
hipcub::TransformInputIterator<IndexT, IsNonZeroFunctor<T, IndexT>, T *>;
CubIterator input_iter(sorted_allgather_seq_len_ptr,
IsNonZeroFunctor<T, IndexT>());
tmp_storage_size = 0;
for (int i = 0; i < 2; ++i) {
if (i > 0 && tmp_storage_size > 0 &&
(tmp_storage == nullptr || tmp_storage->size() < tmp_storage_size)) {
tmp_storage = memory::Alloc(place, tmp_storage_size);
}
tmp_storage_ptr = tmp_storage ? tmp_storage->ptr() : nullptr;
PADDLE_ENFORCE_GPU_SUCCESS(
hipcub::DeviceReduce::Reduce(tmp_storage_ptr,
tmp_storage_size,
input_iter,
ntokens_ptr,
gbs,
hipcub::Sum(),
static_cast<IndexT>(0),
stream));
VLOG(10) << "ntokens_ptr = " << GPUTensorToString<IndexT>(ntokens_ptr, 1);
}
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
&ntokens, ntokens_ptr, sizeof(IndexT), hipMemcpyDeviceToHost, stream));
    // NOTE: This synchronization may be redundant: a D2H copy to pageable
    // (non-pinned) host memory already blocks until the copy completes.
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
} else {
ntokens = gbs;
}
VLOG(10) << "ntokens = " << ntokens;
  // The indices assigned to GPU `device_id` are:
  //   indices[device_id], indices[device_id + num_devices], ...,
  //   indices[device_id + (new_bs - 1) * num_devices]
  // Therefore device_id + (new_bs - 1) * num_devices < ntokens,
  // i.e., new_bs = ceil((ntokens - device_id) / num_devices).
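  // Example: with num_devices = 8 and ntokens = 441 valid samples, device 0
  // gets ceil((441 - 0) / 8) = 56 rows while device 7 gets
  // ceil((441 - 7) / 8) = 55, so all 441 samples are covered exactly once.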
int new_bs = (ntokens - device_id) / num_devices;
if ((ntokens - device_id) % num_devices > 0) {
++new_bs;
}
VLOG(10) << "New batch size is: " << new_bs
<< " , original batch size is: " << batch_size;
std::vector<int64_t> out_shape = {new_bs, seq_len};
std::vector<int64_t> nsl_out_shape = next_sentence_labels.shape();
nsl_out_shape[0] = new_bs;
paddle::Tensor new_input_ids(input_ids.place(), out_shape);
paddle::Tensor new_segment_ids(segment_ids.place(), out_shape);
paddle::Tensor new_input_mask(input_mask.place(), out_shape);
paddle::Tensor new_masked_lm_labels(masked_lm_labels.place(), out_shape);
paddle::Tensor new_next_sentence_labels(next_sentence_labels.place(),
nsl_out_shape);
VLOG(10) << "starts to reorder";
platform::ForRange<platform::CUDADeviceContext> reorder_for_range(
dev_ctx, new_bs * seq_len);
ReorderBERTInputTensorsFunctor<T, IndexT> reorder_functor(
allgather_buf_ptr,
sorted_indices_ptr,
device_id,
num_devices,
max_batch_size,
seq_len,
new_input_ids.mutable_data<T>(input_ids.place()),
new_segment_ids.mutable_data<T>(segment_ids.place()),
new_input_mask.mutable_data<T>(input_mask.place()),
new_masked_lm_labels.mutable_data<T>(masked_lm_labels.place()),
new_next_sentence_labels.mutable_data<T>(next_sentence_labels.place()));
reorder_for_range(reorder_functor);
VLOG(10) << "ends to reorder";
return {new_input_ids,
new_segment_ids,
new_input_mask,
new_masked_lm_labels,
new_next_sentence_labels};
}
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
PADDLE_ENFORCE_GT(num_devices, 1);
auto dtype = input_ids.dtype();
#define CALL_DTYPE_FUNC(__dtype, __cpp_type) \
do { \
if (dtype == paddle::DataType::__dtype) { \
return GPUSortBERTInputsAcrossDevicesWithDType<__cpp_type>( \
input_ids, \
segment_ids, \
input_mask, \
masked_lm_labels, \
next_sentence_labels, \
max_batch_size, \
ring_id, \
device_id, \
num_devices); \
} \
} while (0)
CALL_DTYPE_FUNC(INT16, int16_t);
CALL_DTYPE_FUNC(INT32, int32_t);
CALL_DTYPE_FUNC(INT64, int64_t);
PD_THROW("Unsupported data type: %d", static_cast<int>(dtype));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle
import paddle.distributed.fleet as fleet
from paddle.fluid.layers.collective import _c_broadcast as broadcast
from custom_setup_ops import sort_bert_inputs_across_devices
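# Each rank builds a random, unsorted per-device batch, computes the expected
# globally sorted (by sequence length, descending) per-device slice with numpy,
# and checks that sort_bert_inputs_across_devices returns the same tensors.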
def run_test_case(batch_size, max_batch_size=None):
if max_batch_size is None:
max_batch_size = batch_size
max_seq_len = 512
vocab_size = 30528
dtype = 'int64'
ring_id = 0
rank = paddle.distributed.get_rank()
world_size = paddle.distributed.get_world_size()
device_id = rank
num_devices = world_size
global_batch_size = batch_size * num_devices
seq_lens = np.random.randint(
low=100, high=max_seq_len + 1, size=[num_devices * batch_size])
input_ids = []
segment_ids = []
input_mask = []
masked_lm_labels = []
next_sentence_labels = []
for seq_len in seq_lens:
input_id = np.random.randint(low=1, high=vocab_size, size=[max_seq_len])
input_id[seq_len:] = 0
input_ids.append(input_id)
sentence_1_len = np.random.randint(
low=10, high=seq_len - 10, size=[1])[0]
segment_id = np.zeros([max_seq_len])
segment_id[sentence_1_len:seq_len] = 1
segment_ids.append(segment_id)
mask = np.ones([max_seq_len])
mask[seq_len:] = 0
input_mask.append(mask)
masked_lm_label = np.random.randint(
low=0, high=vocab_size, size=[max_seq_len])
masked_lm_labels.append(masked_lm_label)
next_sentence_label = np.random.randint(low=0, high=2, size=[1])[0]
next_sentence_labels.append(next_sentence_label)
shape = [num_devices, batch_size, max_seq_len]
input_ids = np.reshape(np.array(input_ids, dtype=dtype), shape)
segment_ids = np.reshape(np.array(segment_ids, dtype=dtype), shape)
input_mask = np.reshape(np.array(input_mask, dtype=dtype), shape)
masked_lm_labels = np.reshape(
np.array(
masked_lm_labels, dtype=dtype), shape)
next_sentence_labels = np.reshape(
np.array(
next_sentence_labels, dtype=dtype), [num_devices, batch_size])
seq_lens = np.reshape(np.sum(input_mask, axis=2), [-1])
sorted_indices = np.argsort(-seq_lens, kind='mergesort')
sorted_indices = np.reshape(sorted_indices,
[batch_size, num_devices])[:, rank]
unsorted_indices = np.array(
range(rank * batch_size, (rank + 1) * batch_size),
dtype=sorted_indices.dtype)
def reorder_by_indices(indices):
shape = [num_devices * batch_size, max_seq_len]
input_ids_ret = np.reshape(input_ids, shape)[indices, :]
segment_ids_ret = np.reshape(segment_ids, shape)[indices, :]
input_mask_ret = np.reshape(input_mask, shape)[indices, :]
masked_lm_labels_ret = np.reshape(masked_lm_labels, shape)[indices, :]
next_sentence_labels_ret = np.reshape(next_sentence_labels,
shape[:1])[indices]
return input_ids_ret, segment_ids_ret, input_mask_ret, masked_lm_labels_ret, next_sentence_labels_ret
input_ids_sorted, segment_ids_sorted, input_mask_sorted, masked_lm_labels_sorted, next_sentence_labels_sorted = reorder_by_indices(
sorted_indices)
input_ids_unsorted, segment_ids_unsorted, input_mask_unsorted, masked_lm_labels_unsorted, next_sentence_labels_unsorted = reorder_by_indices(
unsorted_indices)
input_ids_sorted_actual, segment_ids_sorted_actual, input_mask_sorted_actual, masked_lm_labels_sorted_actual, next_sentence_labels_sorted_actual = sort_bert_inputs_across_devices(
paddle.to_tensor(input_ids_unsorted),
paddle.to_tensor(segment_ids_unsorted),
paddle.to_tensor(input_mask_unsorted),
paddle.to_tensor(masked_lm_labels_unsorted),
paddle.to_tensor(next_sentence_labels_unsorted), max_batch_size,
ring_id, rank, world_size)
def assert_equal(x, y):
assert np.array_equal(np.array(x), np.array(y))
assert_equal(input_ids_sorted, input_ids_sorted_actual)
assert_equal(segment_ids_sorted, segment_ids_sorted_actual)
assert_equal(input_mask_sorted, input_mask_sorted_actual)
assert_equal(masked_lm_labels_sorted, masked_lm_labels_sorted_actual)
assert_equal(next_sentence_labels_sorted,
next_sentence_labels_sorted_actual)
print('Test Passed')
def gen_seed_if_not_exists():
seed = os.environ.get('SEED')
if seed:
return int(seed)
with paddle.no_grad():
seed = np.random.randint(low=0, high=10000, size=[1])
seed = paddle.to_tensor(seed)
seed = broadcast(seed, use_calc_stream=True)
seed = seed.numpy()[0]
os.environ['SEED'] = str(seed)
return seed
def run_main():
fleet.init(is_collective=True)
seed = gen_seed_if_not_exists()
np.random.seed(seed)
run_test_case(56)
run_test_case(47, 56)
if __name__ == "__main__":
run_main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_list():
gpus = os.environ.get("CUDA_VISIBLE_DEVICES")
if gpus is None:
output = subprocess.check_output(
["sh", "-c", "nvidia-smi --list-gpus | wc -l"])
gpu_num = int(output.strip())
gpus = list(range(gpu_num))
else:
gpus = [int(g.strip()) for g in gpus.split(",") if g.strip()]
return gpus
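# Map this process's MPI world rank to a local GPU index; the modulo 4 below
# assumes 4 GPUs (and 4 MPI ranks) per node.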
def get_local_rank():
world_rank = os.environ.get("OMPI_COMM_WORLD_RANK")
gpus = get_gpu_list()
if world_rank is None:
local_rank = gpus[0]
else:
local_rank = gpus[int(world_rank) % 4]
return local_rank
if __name__ == "__main__":
local_rank = get_local_rank()
print(local_rank)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from mpi4py import MPI
import numpy as np
import time
import paddle
from pybind.functions import process_allgathered_inputs as process_bert_inputs
from pybind.functions import process_eval_inputs as process_bert_eval_inputs
import h5py
import random
global_comm = MPI.COMM_WORLD
global_rank = global_comm.rank
global_world_size = global_comm.size
assert global_world_size % 2 == 0
def create_group_comm(ranks):
ranks = list(ranks)
new_group = global_comm.group.Incl(ranks)
new_comm = global_comm.Create_group(new_group)
return new_comm
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(comm, seeds, root=0):
seeds = np.array(seeds).astype(np.int64)
comm.Bcast(seeds, root=root)
return seeds.tolist()
def select_dataset_file_for_each_worker(files, f_start_id, worker_num,
worker_index):
"""
    Splitting the training files among workers according to the worker index.
"""
num_files = len(files)
if worker_num > num_files:
remainder = worker_num % num_files
data_file = files[(
f_start_id * worker_num + worker_index + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * worker_num + worker_index) % num_files]
return data_file
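# Read one HDF5 shard and convert the sparse (masked_lm_positions, masked_lm_ids)
# pair into a dense masked_lm_labels array with the same shape as input_ids,
# compute each sample's valid length from input_mask, and shuffle the samples.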
def read_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.array(f[key], dtype=dtype) for key in keys]
n = outputs[0].shape[0]
masked_lm_labels = np.zeros(outputs[0].shape, dtype=dtype)
lengths = np.zeros(n, dtype=dtype)
for i in range(n):
masked_lm_positions = outputs[3][i]
masked_lm_ids = outputs[4][i]
length = np.count_nonzero(masked_lm_positions)
masked_lm_labels[i][
masked_lm_positions[:length]] = masked_lm_ids[:length]
lengths[i] = np.count_nonzero(outputs[1][i])
outputs = [
outputs[0], outputs[2], outputs[1], masked_lm_labels, outputs[-1],
lengths
]
idx = np.random.choice(np.arange(n), n, replace=False)
for i in range(len(outputs)):
outputs[i] = outputs[i][idx]
return outputs
def read_eval_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.asarray(f[key][:]) for key in keys]
nsamples = outputs[0].shape[0]
all_data = []
for index in range(nsamples):
[
input_ids, input_mask, segment_ids, masked_lm_positions,
masked_lm_ids, next_sentence_labels
] = [
input[index].astype(dtype)
if indice < 5 else np.asarray(input[index].astype(dtype))
for indice, input in enumerate(outputs)
]
length = np.count_nonzero(masked_lm_positions)
masked_lm_positions = masked_lm_positions[:length]
masked_lm_ids = masked_lm_ids[:length]
masked_lm_labels = np.zeros(input_ids.shape, dtype=dtype)
masked_lm_labels[masked_lm_positions] = masked_lm_ids
seq_len = np.asarray(np.count_nonzero(input_mask))
data = [
input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
seq_len,
]
# (2050, ), i.e., 512 * 4 + 1 + 1
one_sample_data = np.concatenate([d.flatten() for d in data])
all_data.extend(one_sample_data)
# (2050000, ) -> (10000, 2050)
return np.asarray(all_data).reshape((nsamples, -1))
class WorkerInitObj(object):
"Construct the object with different seed, and the Dataloader will generate the data "
"with different seed in each worker."
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
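# Half of the MPI ranks act as trainers and the other half as data readers:
# rank r < world_size / 2 is trainer r, and rank r + world_size / 2 is its
# paired reader. The communicators created below are used for the
# trainer <-> reader handshake.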
class Context:
def __init__(self):
half_size = int(global_world_size / 2)
self.trainer_id = global_rank % half_size
self.trainer_num = half_size
self.is_trainer = (global_rank < half_size)
self.reader_id = self.trainer_id
self.reader_num = self.trainer_num
self.is_reader = not self.is_trainer
self.trainer_comm = create_group_comm(range(0, half_size))
self.reader_comm = create_group_comm(
range(half_size, global_world_size))
self.trainer_reader_comm = create_group_comm(
[self.trainer_id, self.trainer_id + half_size])
self.global_comm = global_comm
def init_args(self, args, dtype=np.int16):
self.args = args
self.files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f
]
self.files.sort()
self.fid_buf = np.array([1], dtype=np.int64)
with h5py.File(self.files[0], 'r') as f:
self.num_samples = np.array(f["next_sentence_labels"][:]).size
self.batch_size = args.train_batch_size
self.max_seq_length = args.max_seq_length
self.worker_seeds, self.shuffling_seeds = self._setup_seeds(
args.seed, args.num_epochs_to_generate_seeds_for)
self.epoch_idx = 0
data_buf_size = self.num_samples * 4 * self.max_seq_length + self.num_samples * 2
self.data_buf = np.empty(
shape=[self.trainer_num * data_buf_size], dtype=dtype)
self.eval_dir = args.eval_dir
self.num_eval_examples = args.num_eval_examples
self.eval_batch_size = args.eval_batch_size
cur_seed = self.worker_seeds[self.trainer_id]
np.random.seed(cur_seed)
random.seed(cur_seed)
paddle.seed(cur_seed)
self.worker_init = WorkerInitObj(cur_seed)
self.barrier()
def shuffle_files(self):
random.Random(self.shuffling_seeds[self.epoch_idx]).shuffle(self.files)
self.epoch_idx += 1
def _setup_seeds(self, master_seed, epochs):
if master_seed is None:
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if self.trainer_id == 0:
print('Using random master seed: {}'.format(master_seed))
else:
print('Using master seed from command line: {}'.format(master_seed))
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, self.trainer_num)
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
worker_seeds = broadcast_seeds(self.global_comm, worker_seeds)
shuffling_seeds = broadcast_seeds(self.global_comm, shuffling_seeds)
return worker_seeds, shuffling_seeds
def worker_seed(self):
return self.worker_seeds[self.trainer_id]
def barrier(self):
self.global_comm.barrier()
def stop_reader(self):
if self.is_trainer:
self.read_file(-1)
def file_num(self):
return len(self.files)
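    # Trainer/reader handshake: the trainer sends the file id to its paired
    # reader, the reader loads its HDF5 shard, all readers allgather their
    # shards into data_buf, and the paired reader sends the gathered buffer back
    # to the trainer, which extracts its own batches via process_bert_inputs.
    # f_id == 0 also triggers an epoch shuffle; f_id < 0 stops the reader.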
def read_file(self, f_id=None):
if self.is_trainer:
self.fid_buf[0] = f_id
self.trainer_reader_comm.Isend(self.fid_buf, dest=1)
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return
self.trainer_reader_comm.Recv(self.data_buf, source=1)
results = process_bert_inputs(self.data_buf, self.num_samples,
self.max_seq_length, self.batch_size,
self.trainer_id, self.trainer_num)
return results
else:
self.trainer_reader_comm.Recv(self.fid_buf, 0)
f_id = self.fid_buf[0]
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return False
fname = select_dataset_file_for_each_worker(
self.files, f_id, self.trainer_num, self.trainer_id)
data = read_hdf5_file(fname, dtype=self.data_buf.dtype)
send_buf = np.concatenate([d.flatten() for d in data])
self.reader_comm.Allgather(send_buf, self.data_buf)
self.trainer_reader_comm.Send(self.data_buf, dest=0)
return True
def read_eval_file(self):
if self.is_trainer:
eval_data = []
for eval_file in sorted(os.listdir(self.eval_dir)):
eval_file_path = os.path.join(self.eval_dir, eval_file)
if os.path.isfile(eval_file_path) and 'part' in eval_file_path:
data = read_eval_hdf5_file(
eval_file_path, dtype=self.data_buf.dtype)
eval_data.extend(data)
if len(eval_data) > self.num_eval_examples:
break
chunk_size = self.num_eval_examples // self.trainer_num
rank = self.trainer_id
remainder = self.num_eval_examples % self.trainer_num
if rank < remainder:
eval_data = eval_data[(chunk_size + 1) * rank:(chunk_size + 1) *
(rank + 1)]
else:
eval_data = eval_data[chunk_size * rank + remainder:chunk_size *
(rank + 1) + remainder]
results = process_bert_eval_inputs(eval_data, self.max_seq_length,
self.eval_batch_size,
self.args.sort_eval_data)
return results
_context = Context()
def get_context():
return _context
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ps aux | grep "$1" | awk '{print $2}' | xargs kill -9
import hostlist
import os
import json
def get_list():
worker = []
nodelist = os.environ["SLURM_JOB_NODELIST"]
nodelist = hostlist.expand_hostlist(nodelist)
num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
port_number = 60001
worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
for node in worker_nodes:
for index in range(4):
worker_sockets = ":".join([node, str(port_number + index )])
worker.append(worker_sockets)
return ','.join(worker)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import subprocess
from mlperf_logging import mllog
from mlperf_logging.mllog import constants
mllogger = mllog.get_mllogger()
def log_start(*args, **kwargs):
_log_print(mllogger.start, *args, **kwargs)
def log_end(*args, **kwargs):
_log_print(mllogger.end, *args, **kwargs)
def log_event(*args, **kwargs):
_log_print(mllogger.event, *args, **kwargs)
def _log_print(logger, *args, **kwargs):
if kwargs.pop('sync', False):
barrier()
if 'stack_offset' not in kwargs:
kwargs['stack_offset'] = 3
if 'value' not in kwargs:
kwargs['value'] = None
if kwargs.pop('log_all_ranks', False):
log = True
else:
log = (get_rank() == 0)
if log:
logger(*args, **kwargs)
def mlperf_submission_log(benchmark):
num_nodes = os.environ.get('SLURM_NNODES', 1)
mllog.config(filename=os.path.join(
os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
mllogger = mllog.get_mllogger()
mllogger.logger.propagate = False
log_event(
key=constants.SUBMISSION_BENCHMARK,
value=benchmark, )
log_event(key=constants.SUBMISSION_ORG, value='NVIDIA')
log_event(key=constants.SUBMISSION_DIVISION, value='closed')
log_event(key=constants.SUBMISSION_STATUS, value='onprem')
log_event(
key=constants.SUBMISSION_PLATFORM,
value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .modeling import *
from .optimization import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import inspect
import paddle
from paddle.nn import Layer
def fn_args_to_dict(func, *args, **kwargs):
"""
Inspect function `func` and its arguments for running, and extract a
dict mapping between argument names and keys.
"""
if hasattr(inspect, 'getfullargspec'):
(spec_args, spec_varargs, spec_varkw, spec_defaults, _, _,
_) = inspect.getfullargspec(func)
else:
(spec_args, spec_varargs, spec_varkw,
spec_defaults) = inspect.getargspec(func)
# add positional argument values
init_dict = dict(zip(spec_args, args))
# add default argument values
kwargs_dict = dict(zip(spec_args[-len(spec_defaults):],
spec_defaults)) if spec_defaults else {}
kwargs_dict.update(kwargs)
init_dict.update(kwargs_dict)
return init_dict
class InitTrackerMeta(type(Layer)):
"""
    This metaclass wraps the `__init__` method of a class to add an `init_config`
    attribute to instances of that class; `init_config` is a dict that tracks the
    initial configuration. If the class has a `_wrap_init` method, it is hooked
    after `__init__` and called as `_wrap_init(self, init_fn, init_args)`.
    Since InitTrackerMeta is used as the metaclass for pretrained model classes,
    which are always `Layer` subclasses, and `type(Layer)` is not `type`, it uses
    `type(Layer)` rather than `type` as its base class to avoid metaclass
    conflicts.
"""
def __init__(cls, name, bases, attrs):
init_func = cls.__init__
        # If attrs has `__init__`, wrap it using the accessible `_wrap_init`.
        # Otherwise, no need to wrap again since the super cls has been wrapped.
# TODO: remove reduplicated tracker if using super cls `__init__`
help_func = getattr(cls, '_wrap_init',
None) if '__init__' in attrs else None
cls.__init__ = InitTrackerMeta.init_and_track_conf(init_func, help_func)
super(InitTrackerMeta, cls).__init__(name, bases, attrs)
@staticmethod
def init_and_track_conf(init_func, help_func=None):
"""
wraps `init_func` which is `__init__` method of a class to add `init_config`
attribute for instances of that class.
Args:
init_func (callable): It should be the `__init__` method of a class.
            help_func (callable, optional): If provided, it will be hooked after
                `init_func` and called as `_wrap_init(self, init_func, *init_args, **init_kwargs)`.
                Default: None.
Returns:
function: the wrapped function
"""
@functools.wraps(init_func)
def __impl__(self, *args, **kwargs):
# keep full configuration
init_func(self, *args, **kwargs)
            # registered helper via `_wrap_init`
if help_func:
help_func(self, init_func, *args, **kwargs)
self.init_config = kwargs
if args:
kwargs['init_args'] = args
kwargs['init_class'] = self.__class__.__name__
return __impl__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .collate import *
from .vocab import *
from .sampler import *
from .tokenizer import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
__all__ = ['Stack', 'Pad', 'Tuple', 'Dict']
class Stack(object):
"""
Stacks the input data samples to construct the batch. The N input samples
must have the same shape/length and will be stacked to construct a batch.
Args:
axis (int, optional): The axis in the result data along which the input
data are stacked. Default: 0.
dtype (str|numpy.dtype, optional): The value type of the output. If it
is set to None, the type of input data is used. Default: None.
"""
def __init__(self, axis=0, dtype=None):
self._axis = axis
self._dtype = dtype
def __call__(self, data):
"""
Batchifies the input data by stacking.
Args:
data (list[numpy.ndarray]): The input data samples. It is a list.
Each element is a numpy.ndarray or list.
Returns:
numpy.ndarray: Stacked batch data.
Example:
.. code-block:: python
from paddlenlp.data import Stack
a = [1, 2, 3, 4]
b = [3, 4, 5, 6]
c = [5, 6, 7, 8]
result = Stack()([a, b, c])
'''
[[1, 2, 3, 4],
[3, 4, 5, 6],
[5, 6, 7, 8]]
'''
"""
data = np.stack(
data,
axis=self._axis).astype(self._dtype) if self._dtype else np.stack(
data, axis=self._axis)
return data
class Pad(object):
"""
Pads the input data samples to the largest length at `axis`.
Args:
pad_val (float|int, optional): The padding value. Default: 0.
axis (int, optional): The axis to pad the arrays. The arrays will be
padded to the largest length at `axis`. For example, assume the
input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the
axis is 0. Each input will be padded into (10, 8, 5) and then
stacked to form the final output, which has shape (3, 10, 8, 5).
Default: 0.
ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether
to return the valid length in the output, and the data type of
returned length is int32 if True. If it is numpy.dtype, indicate the
data type of returned length. Default: None.
dtype (numpy.dtype, optional): The value type of the output. If it is
set to None, the input data type is used. Default: None.
pad_right (bool, optional): Whether the padding direction is right-side.
If True, it indicates we pad to the right side, while False indicates
we pad to the left side. Default: True.
"""
def __init__(self,
pad_val=0,
axis=0,
ret_length=None,
dtype=None,
pad_right=True):
self._pad_val = pad_val
self._axis = axis
self._ret_length = ret_length
self._dtype = dtype
self._pad_right = pad_right
def __call__(self, data):
"""
Batchifies the input data by padding. The input will be padded to the
largest dimension at `axis` and then stacked to form the final output.
In addition, the function will output the original dimensions at the
`axis` if `ret_length` is not None or False.
Args:
data (list[numpy.ndarray|list]): The input data samples. It is a
list. Each element is a numpy.ndarray or list.
Returns:
numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it
is a numpy.ndarray representing the padded batch data and the
shape is (N, …). Otherwise, it is a tuple, besides the padded batch
data, the tuple also includes a numpy.ndarray representing original
length at `axis` of all input samples, which shaped `(N,)`.
Example:
.. code-block:: python
from paddlenlp.data import Pad
a = [1, 2, 3, 4]
b = [5, 6, 7]
c = [8, 9]
result = Pad(pad_val=0)([a, b, c])
'''
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
'''
"""
arrs = [np.asarray(ele) for ele in data]
original_length = [ele.shape[self._axis] for ele in arrs]
max_size = max(original_length)
ret_shape = list(arrs[0].shape)
ret_shape[self._axis] = max_size
ret_shape = (len(arrs), ) + tuple(ret_shape)
ret = np.full(
shape=ret_shape,
fill_value=self._pad_val,
dtype=arrs[0].dtype if self._dtype is None else self._dtype)
for i, arr in enumerate(arrs):
if arr.shape[self._axis] == max_size:
ret[i] = arr
else:
slices = [slice(None) for _ in range(arr.ndim)]
if self._pad_right:
slices[self._axis] = slice(0, arr.shape[self._axis])
else:
slices[self._axis] = slice(max_size - arr.shape[self._axis],
max_size)
if slices[self._axis].start != slices[self._axis].stop:
slices = [slice(i, i + 1)] + slices
ret[tuple(slices)] = arr
if self._ret_length:
return ret, np.asarray(
original_length,
dtype="int32") if self._ret_length == True else np.asarray(
original_length, self._ret_length)
else:
return ret
class Tuple(object):
"""
Wraps multiple batchify functions together. The input functions will be applied
to the corresponding input fields.
Each sample should be a list or tuple containing multiple fields. The i'th
batchify function stored in Tuple will be applied on the i'th field.
For example, when data sample is (nd_data, label), you can wrap two batchify
functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and
label correspondingly.
Args:
fn (callable|list[callable]|tuple[callable]): The batchify functions to
wrap. It is a callable function or a list/tuple of callable functions.
args (tuple[callable]): The additional batchify functions to wrap.
"""
def __init__(self, fn, *args):
if isinstance(fn, (list, tuple)):
assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \
'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \
'Received fn=%s, args=%s' % (str(fn), str(args))
self._fn = fn
else:
self._fn = (fn, ) + args
for i, ele_fn in enumerate(self._fn):
assert callable(
ele_fn
), 'Batchify functions must be callable! type(fn[%d]) = %s' % (
i, str(type(ele_fn)))
def __call__(self, data):
"""
Batchifies data samples by applying each function on the corresponding
data field, and each data field is produced by stacking the field data
of samples.
Args:
            data (list|tuple): The samples to batchify. Each sample in the
                list/tuple should contain `N` fields.
        Returns:
            tuple: A tuple composed of the results of all the included batchify
                functions.
Example:
.. code-block:: python
from paddlenlp.data import Stack, Pad, Tuple
data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
batchify_fn = Tuple(Pad(pad_val=0), Stack())
ids, label = batchify_fn(data)
'''
ids:
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
label: [[1], [0], [1]]
'''
"""
assert len(data[0]) == len(self._fn),\
            'Each data sample should contain {} fields.'.format(len(self._fn))
ret = []
for i, ele_fn in enumerate(self._fn):
result = ele_fn([ele[i] for ele in data])
if isinstance(result, (tuple, list)):
ret.extend(result)
else:
ret.append(result)
return tuple(ret)
class Dict(object):
"""
Wraps multiple batchify functions together. The input functions will be
applied to the corresponding input fields.
Each sample should be a dict containing multiple fields. Each batchify
function with key stored in `Dict` will be applied on the field which has
the same key.
For example, when data sample is {'tokens': tokens, 'labels': labels}, you
can wrap two batchify functions using
`Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens
and labels correspondingly.
Args:
        fn (dict): The batchify functions to wrap. It is a dict whose values
            are callable batchify functions.
"""
def __init__(self, fn):
assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \
'Received fn=%s' % (str(fn))
self._fn = fn
for col_name, ele_fn in self._fn.items():
assert callable(
ele_fn
            ), 'Batchify functions must be callable! type(fn[%s]) = %s' % (
col_name, str(type(ele_fn)))
def __call__(self, data):
"""
Batchifies data samples by applying each function on the corresponding
data field, and each data field is produced by stacking the field data
with the same key as batchify functions of all samples.
Args:
            data (list[dict]|tuple[dict]): The samples to batchify. Each sample
                in the list/tuple is a dict with `N` key-values.
        Returns:
            tuple: A tuple composed of the results of all the included batchify
                functions.
Example:
.. code-block:: python
from paddlenlp.data import Stack, Pad, Dict
data = [
{'labels':[1], 'token_ids':[1, 2, 3, 4]},
{'labels':[0], 'token_ids':[5, 6, 7]},
{'labels':[1], 'token_ids':[8, 9]},
]
batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()})
ids, label = batchify_fn(data)
'''
ids:
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
label: [[1], [0], [1]]
'''
"""
ret = []
for col_name, ele_fn in self._fn.items():
result = ele_fn([ele[col_name] for ele in data])
if isinstance(result, (tuple, list)):
ret.extend(result)
else:
ret.append(result)
return tuple(ret)
# Iterator for NLP Dataset
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import functools
import math
import six
import numpy as np
import paddle.distributed as dist
class SamplerHelper(object):
"""
The class is to help construct iterable sampler used for
:class:`paddle.io.DataLoader`. It wraps a dataset and uses its
:meth:`__getitem__` method. Every subclass of :class:`SamplerHelper` has
to provide an :meth:`__iter__` method, providing a way to iterate over
indices of dataset elements, and a :meth:`__len__` method that returns the
length of the returned iterators.
    The class can also be used as a batch iterator, rather than an indices
    iterator, when `iterable` yields samples instead of indices, i.e. when it is
    initialized with an iterable dataset.
.. note::
The :meth:`__len__` method isn't strictly required by
:class:`paddle.io.DataLoader`, but is expected in any calculation
involving the length of a :class:`paddle.io.DataLoader`.
Args:
dataset (Dataset): Input dataset for :class:`SamplerHelper`.
iterable (Iterable, optional): Iterator of dataset. Default: None.
"""
# chain sampler
def __init__(self, dataset, iterable=None):
self.data_source = dataset
self.iterable = iterable
        if isinstance(dataset, collections.abc.Iterable) and iterable is None:
# iterable-style datasets
self.iterable = dataset
def __iter__(self):
if self.iterable is None:
return iter(range(len(self.data_source)))
        elif isinstance(self.iterable, collections.abc.Iterable):
return iter(self.iterable)
elif callable(self.iterable):
return self.iterable()
else:
raise ValueError(
"`iterable` should be None, instance of Iterable or callable "
"producing generator.")
def __len__(self):
# Allow some samplers have different length with `len(data_source)`,
# such as batch sampler.
if hasattr(self, "_length"):
return self._length
else:
return len(self.data_source)
@property
def length(self):
"""
Returns the length.
"""
        # Since `len()` can only produce an integer, use the `length` property
        # to get None for uncertain lengths. Samplers can set `length` if needed.
try:
length = len(self)
except Exception:
length = None
return length
@length.setter
def length(self, length):
self._length = length
def apply(self, fn):
# Transformation functions would be performed. It includes
# :meth:`shuffle`, :meth:`sort`, :meth:`fit` and :meth:`shard`.
# Args:
# fn (callable): Transformation functions to be performed.
# Returns:
# SamplerHelper: A new transformed :class:`SamplerHelper` object.
rs = fn(self)
if isinstance(rs, (list, tuple)):
iterable, data_source = rs
else:
iterable, data_source = rs, self.data_source
sampler = type(self)(data_source, iterable)
return sampler
def shuffle(self, buffer_size=-1, seed=None):
"""
Shuffles the dataset according to the given buffer size and random seed.
Args:
buffer_size (int, optional): Buffer size for shuffle. If
`buffer_size < 0` or more than the length of the dataset,
`buffer_size` is the length of the dataset. Default: -1.
seed (int, optional): Seed for the random. Default: None.
Returns:
SamplerHelper: A new shuffled :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.shuffle(seed=2)
print(list(sampler)) # indices of dataset elements
# [2, 1, 0]
"""
if seed is not None:
random_generator = np.random.RandomState(seed)
else: # use the global random generator
random_generator = np.random
def _impl():
buf = []
for idx in iter(self):
buf.append(idx)
if buffer_size > 0 and len(buf) >= buffer_size:
random_generator.shuffle(buf)
for b in buf:
yield b
buf = []
if len(buf) > 0:
random_generator.shuffle(buf)
for b in buf:
yield b
return type(self)(self.data_source, _impl)
def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1):
"""
Sorts the dataset according to given callable :meth:`cmp` or :meth:`key`.
Args:
cmp (callable, optional): The function of comparison. Default: None.
key (callable, optional): The function of key. Default: None.
reverse (bool, optional): Whether to reverse when sorting the data
samples. If True, it means in descending order, and False means
in ascending order. Default: False.
buffer_size (int, optional): Buffer size for sort. If
`buffer_size < 0` or `buffer_size` is more than the length
of the data, `buffer_size` will be set to the length of the data.
Default: -1.
Returns:
SamplerHelper: A new sorted :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
# Sorted in ascending order by the length of the first field
# of the sample
key = (lambda x, data_source: len(data_source[x][0]))
sampler = sampler.sort(key=key)
print(list(sampler)) # indices of dataset elements
# [2, 1, 0]
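# A minimal sketch: sort only within a buffer of 2 samples, which
# yields locally sorted buckets instead of a global sort.
sampler = SamplerHelper(dataset).sort(key=key, buffer_size=2)
print(list(sampler))
# [1, 0, 2]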
"""
if key:
key_wrapper = (lambda x: key(x, self.data_source))
elif cmp:
key_wrapper = functools.cmp_to_key(
lambda x, y: cmp(x, y, self.data_source))
else:
key_wrapper = (lambda x: len(self.data_source[x]))
def _impl():
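# Sort indices inside a bounded buffer with the wrapped key, emitting
# each full buffer in order and flushing the remainder at the end.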
data_source = self.data_source
buf = []
for idx in iter(self):
buf.append(idx)
if buffer_size > 0 and len(buf) >= buffer_size:
buf = sorted(buf, key=key_wrapper, reverse=reverse)
for b in buf:
yield b
buf = []
if len(buf) > 0:
buf = sorted(buf, key=key_wrapper, reverse=reverse)
for b in buf:
yield b
return type(self)(self.data_source, _impl)
def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=None):
"""
Batches the dataset according to the given `batch_size`.
Args:
batch_size (int): The batch size.
drop_last (bool, optional): Whether to drop the last mini batch.
Default: False.
batch_size_fn (callable, optional): It accepts four arguments: the
index of the current sample in the data source, the length of the
minibatch, the size of the minibatch so far, and the data source,
and it returns the new size of the minibatch so far. The returned
value can be anything and is used as the `size_so_far` argument of
`key`. If None, the length of the mini batch is returned.
Default: None.
key (callable, optional): It accepts the size of the minibatch so far
and the length of the minibatch, and returns the value to be
compared with `batch_size`. If None, only the size of the mini batch
so far is compared with `batch_size`. Default: None.
Returns:
SamplerHelper: A new batched :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.batch(batch_size=2)
print(list(sampler)) # indices of dataset elements
# [[0, 1], [2]]
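# A minimal sketch (`token_count_fn` is a hypothetical helper): batch by
# a token budget instead of a sample count, assuming the first field of
# each sample is the token sequence.
def token_count_fn(idx, minibatch_len, size_so_far, data_source):
    return size_so_far + len(data_source[idx][0])
sampler = SamplerHelper(dataset).batch(
    batch_size=6, batch_size_fn=token_count_fn)
print(list(sampler))
# [[0], [1, 2]]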
"""
_key = lambda size_so_far, minibatch_len: size_so_far
ori_batch_size_fn = batch_size_fn
if batch_size_fn is None:
batch_size_fn = lambda new, count, sofar, data_source: count
key = _key if key is None else key
def _impl():
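# Accumulate indices until the running size (as defined by
# `batch_size_fn` and `key`) reaches `batch_size`, then emit a minibatch.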
data_source = self.data_source
minibatch, size_so_far = [], 0
for idx in iter(self):
minibatch.append(idx)
size_so_far = batch_size_fn(idx,
len(minibatch), size_so_far,
data_source)
if key(size_so_far, len(minibatch)) == batch_size:
yield minibatch
minibatch, size_so_far = [], 0
elif key(size_so_far, len(minibatch)) > batch_size:
if len(minibatch) == 1:
raise ValueError(
"Please increase the value of `batch_size`, or limit the max length of batch."
)
yield minibatch[:-1]
minibatch, size_so_far = minibatch[-1:], batch_size_fn(
idx, 1, 0, data_source)
if minibatch and not drop_last:
yield minibatch
sampler = type(self)(self.data_source, _impl)
if ori_batch_size_fn is None and self.length is not None:
sampler.length = (self.length + int(not drop_last) *
(batch_size - 1)) // batch_size
else:
sampler.length = None
return sampler
def shard(self, num_replicas=None, rank=None):
"""
Slices the dataset for multi-GPU training.
Args:
num_replicas (int, optional): The number of training processes, which
is also the number of GPU cards used in training. If None, it
will be set by the :meth:`paddle.distributed.get_world_size` method.
Default: None.
rank (int, optional): The id of the current training process, equal
to the value of the environment variable PADDLE_TRAINER_ID. If
None, it will be initialized by the :meth:`paddle.distributed.get_rank`
method. Default: None.
Returns:
SamplerHelper: A new sliced :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.shard(num_replicas=2)
print(list(sampler)) # indices of dataset elements
# [0, 2]
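# A minimal sketch: the slice seen by the second worker (rank 1). When
# the sample count is not divisible by `num_replicas`, the last sample
# may be repeated so that every rank receives the same number of samples.
sampler = SamplerHelper(dataset).shard(num_replicas=2, rank=1)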
"""
if num_replicas is None:
num_replicas = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
def _impl():
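# Every `num_replicas`-th index (offset by `rank`) belongs to this rank;
# trailing indices may be re-yielded to keep ranks evenly sized.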
for i, idx in enumerate(self):
if i % num_replicas == rank:
yield idx
if i % num_replicas != num_replicas - 1 and rank > i % num_replicas:
# use last samples to make it evenly divisible
yield idx
sampler = type(self)(self.data_source, _impl)
if self.length is not None:
sampler.length = int(math.ceil(self.length * 1.0 / num_replicas))
else:
sampler.length = None
return sampler
def list(self):
# Produces a sampler whose `iter` returns a list iterator. Since `list`
# fetches all contents at once, the exact length can be recorded.
def _impl():
indices = list(iter(self))
self.length = len(indices)
return iter(indices)
return type(self)(self.data_source, _impl)
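# A minimal sketch (`drop_first` is a hypothetical transformation): `list`
# lets a transformed sampler record its exact length once iterated.
#
#     def drop_first(sampler):
#         def _impl():
#             it = iter(sampler)
#             next(it, None)           # skip the first index
#             return it
#         return _impl
#
#     filtered = SamplerHelper(dataset).apply(drop_first)
#     print(filtered.length)           # 3, falls back to len(dataset)
#     _ = list(filtered.list())        # materializing records the real length
#     print(filtered.length)           # 2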
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import jieba
from .vocab import Vocab
def get_idx_from_word(word, word_to_idx, unk_word):
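# Look up `word` in `word_to_idx`; fall back to the id of `unk_word` if absent.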
if word in word_to_idx:
return word_to_idx[word]
return word_to_idx[unk_word]
class BaseTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
def get_tokenizer(self):
return self.tokenizer
def cut(self, sentence):
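# To be overridden by subclasses: split `sentence` into a list of tokens.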
pass
def encode(self, sentence):
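# To be overridden by subclasses: convert `sentence` into a list of token ids.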
pass
class JiebaTokenizer(BaseTokenizer):
"""
Constructs a tokenizer based on `jieba <https://github.com/fxsjy/jieba>`__.
It supports the :meth:`cut` method to split the text into tokens, and the
:meth:`encode` method to convert the text to token ids.
Args:
vocab(paddlenlp.data.Vocab): An instance of :class:`paddlenlp.data.Vocab`.
"""
def __init__(self, vocab):
super(JiebaTokenizer, self).__init__(vocab)
self.tokenizer = jieba.Tokenizer()
# Initialize jieba directly from the vocab: register every vocab token in
# jieba's internal frequency dict and mark the tokenizer as initialized so
# that the default dictionary is not loaded.
self.tokenizer.FREQ = {key: 1 for key in self.vocab.token_to_idx.keys()}
self.tokenizer.total = len(self.tokenizer.FREQ)
self.tokenizer.initialized = True
def cut(self, sentence, cut_all=False, use_hmm=True):
"""
The method used to cut the text to tokens.
Args:
sentence(str): The text to be tokenized.
cut_all(bool, optional): Whether to use the full mode. If True, the
full mode is used, which gets all the possible words from the
sentence and is fast but not accurate. If False, the accurate mode
is used, which attempts to cut the sentence into the most accurate
segmentation and is suitable for text analysis. Default: False.
use_hmm(bool, optional): Whether to use the HMM model. Default: True.
Returns:
list[str]: A list of tokens.
Example:
.. code-block:: python
from paddlenlp.data import Vocab, JiebaTokenizer
# The vocab file. The sample file can be downloaded first:
# wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
vocab_file_path = './senta_word_dict.txt'
# Initialize the Vocab
vocab = Vocab.load_vocabulary(
vocab_file_path,
unk_token='[UNK]',
pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)
tokens = tokenizer.cut('我爱你中国')
print(tokens)
# ['我爱你', '中国']
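# A minimal sketch: the full mode returns all possible in-vocab words,
# typically producing more and shorter tokens.
tokens = tokenizer.cut('我爱你中国', cut_all=True)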
"""
return self.tokenizer.lcut(sentence, cut_all, use_hmm)
def encode(self, sentence, cut_all=False, use_hmm=True):
"""
The method used to convert the text to ids. It first calls the
:meth:`cut` method to split the text into tokens, and then converts
the tokens to ids using `vocab`.
Args:
sentence(str): The text to be tokenized.
cut_all(bool, optional): Whether to use the full mode. If True, the
full mode is used, which gets all the possible words from the
sentence and is fast but not accurate. If False, the accurate mode
is used, which attempts to cut the sentence into the most accurate
segmentation and is suitable for text analysis. Default: False.
use_hmm(bool, optional): Whether to use the HMM model. Default: True.
Returns:
list[int]: A list of ids.
Example:
.. code-block:: python
from paddlenlp.data import Vocab, JiebaTokenizer
# The vocab file. The sample file can be downloaded first:
# wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
vocab_file_path = './senta_word_dict.txt'
# Initialize the Vocab
vocab = Vocab.load_vocabulary(
vocab_file_path,
unk_token='[UNK]',
pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)
ids = tokenizer.encode('我爱你中国')
print(ids)
# [1170578, 575565]
"""
words = self.cut(sentence, cut_all, use_hmm)
return [
get_idx_from_word(word, self.vocab.token_to_idx,
self.vocab.unk_token) for word in words
]