Commit d2d32668 authored by yuguo960516yuguo's avatar yuguo960516yuguo

2.3.0-dtk-22.04.2

parent ad08b8ce
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL))
#include "paddle/fluid/distributed/ps/service/heter_client.h"
#endif
#include "paddle/fluid/distributed/collective/Common.h"
constexpr const char* HETER_BACKEND_NAME = "HETER_BACKEND";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
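// Heterogeneous process group: wraps an inner (intra-node) process group and
// an inter-node Gloo group (inter_pg_), optionally routing traffic through a
// switch endpoint when with_switch is set.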
class ProcessGroupHeter : public ProcessGroup {
public:
class HeterTask : public ProcessGroup::Task,
public std::enable_shared_from_this<HeterTask> {
public:
HeterTask(int rank,
CommType CommType,
const std::vector<phi::DenseTensor>&);
bool IsCompleted();
void SynchronizeStreams() {}
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize() {}
virtual ~HeterTask();
};
ProcessGroupHeter(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid,
int local_rank,
int local_size,
int gloo_rank,
int gloo_size,
bool with_switch,
std::string switch_endpoints,
int src_rank,
int dst_rank);
const std::string GetBackendName() const override {
return std::string(HETER_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>&,
std::vector<phi::DenseTensor>&,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>&,
std::vector<phi::DenseTensor>&,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) override;
protected:
virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
int rank, CommType opType, const std::vector<phi::DenseTensor>& inputs);
private:
std::shared_ptr<Store> store_;
std::shared_ptr<ProcessGroup> inner_pg_;
std::shared_ptr<ProcessGroupGloo> inter_pg_;
int local_rank_;
int local_size_;
int gloo_rank_;
int gloo_size_;
bool with_switch_;
std::string switch_endpoint_;
int src_rank_;
int dst_rank_;
static int send_count;
static int recv_count;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);
constexpr int64_t kWaitBlockTimeout = 10;
namespace paddle {
namespace distributed {
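// Record an event on each place's default (compute) stream and make the
// corresponding communication context wait on it, so work queued on the
// default streams is visible before collectives run on the comm streams.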
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<EventManager>& ncclEvents, // NOLINT
std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
ncclEvents[i].Record(*default_ctx);
ncclEvents[i].Block(*dev_ctx[i]);
}
}
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupNCCL::NCCLTask>(
places, rank, comm_type, inputs);
}
ProcessGroupNCCL::NCCLTask::NCCLTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
ncclComms_.resize(places.size());
}
ProcessGroupNCCL::NCCLTask::~NCCLTask() {}
void ProcessGroupNCCL::NCCLTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
}
}
bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
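// Validate (or derive) the per-rank split sizes used by AllToAll_Single:
// an empty split_sizes means dim[0] is split evenly across the group,
// otherwise the entries must match the group size and sum to dim[0].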
void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>& split_sizes,
std::vector<int64_t> tensor_shape) {
int64_t len_size = split_sizes.size();
if (len_size == 0) {
PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0,
true,
platform::errors::InvalidArgument(
"Tensor's dim[0] must be divisible by group size "
"when split_sizes not given."));
split_sizes.insert(split_sizes.end(),
size_,
static_cast<int64_t>(tensor_shape[0] / size_));
} else {
PADDLE_ENFORCE_EQ(
len_size == size_,
true,
platform::errors::InvalidArgument(
"The length of split_sizes must be equal to group size."));
auto sum_size = std::accumulate(
split_sizes.begin(), split_sizes.end(), static_cast<int64_t>(0));
PADDLE_ENFORCE_EQ(
sum_size == tensor_shape[0],
true,
platform::errors::InvalidArgument(
"The sum of split_sizes must be equal to tensor's dim[0]."));
}
}
// TODO(shenliang03): Add timeout for wait; the timeout argument is currently unused.
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
if (FLAGS_nccl_blocking_wait) {
// NOTE(shenliang03): It will block host for sync
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTimeout));
}
}
if (!barrierTensors_.empty()) {
// If this task is used as a barrier, block the CPU until the device is idle
for (auto& place : places_) {
platform::CUDADeviceGuard gpuGuard(place);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
}
}
return true;
}
// Same as Wait
void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid), store_(store) {
platform::SetDeviceId(place_.device);
}
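// Rank 0 serializes each ncclUniqueId into the store under a per-group key;
// the other ranks read the ids back so that every rank joins the same
// communicator.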
void ProcessGroupNCCL::BroadcastUniqueNCCLID(
std::vector<ncclUniqueId>& nccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < nccl_ids.size(); i++) {
auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
auto nccl_id = std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&nccl_ids[i]),
reinterpret_cast<uint8_t*>(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES);
store_->set(key, nccl_id);
}
} else {
for (size_t i = 0; i < nccl_ids.size(); i++) {
auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
auto ret = store_->get(key);
std::memcpy(&nccl_ids[i], ret.data(), ret.size());
}
}
}
// create NCCLManager cache for places_key
void ProcessGroupNCCL::CreateNCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
platform::errors::PreconditionNotMet(
"Not able to create/get the NCCL Communicator since "
"the GPU place are not known"));
std::vector<std::shared_ptr<NCCLCommManager>> nccl_comms;
nccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(1);
auto& nccl_id = nccl_ids.front();
for (auto& place : places) {
used_place_ids_.insert(place.GetDeviceId());
}
if (rank_ == 0) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
}
BroadcastUniqueNCCLID(nccl_ids);
VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (size_t i = 0; i < places.size(); ++i) {
platform::CUDADeviceGuard guard(places[i]);
nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
dev_ctx[i].reset(new CUDADeviceContext(places[i]));
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
std::vector<EventManager> events;
events.resize(places.size());
// These caches will be useful to process sync/wait/communicate
places_to_events_.emplace(places_key, std::move(events));
places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
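// Generic collective driver: look up (or lazily create) the cached NCCL
// communicators for this combination of places, sync the comm streams with
// the default streams, run `fn` on every device inside one NCCL group, and
// record completion events on the returned task.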
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
task->SetOutputs(outputs);
// construct an uninitialized device guard; the device is set before each use
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
memory::RecordStream(inputs[i].Holder(),
places_to_ctx_[key][i]->stream());
}
}
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
}
}
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
template <typename Fn>
void ProcessGroupNCCL::Collective(const phi::DenseTensor* in,
phi::DenseTensor* out,
Fn fn,
CommType op_type) {
std::vector<Place> places;
places.push_back(in->place());
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
// construct an uninitialized device guard; the device is set before each use
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
cuda_guard.SetDevice(places[0]);
memory::RecordStream(in->Holder(), places_to_ctx_[key][0]->stream());
}
{
platform::NCCLGroupGuard nccl_guard;
cuda_guard.SetDevice(places[0]);
const auto& nccl_stream = places_to_ctx_[key][0]->stream();
fn(in, out, nccl_comms[0]->GetNcclComm(), nccl_stream);
}
cuda_guard.SetDevice(places[0]);
}
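// Point-to-point driver: same caching and stream-sync scheme as Collective(),
// but `fn` additionally receives the peer rank for ncclSend/ncclRecv.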
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
std::vector<phi::DenseTensor>& tensors,
Fn fn,
int dst_rank,
CommType op_type) {
const auto places = GetPlaceList(tensors);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, tensors);
// construct an uninitialized device guard; the device is set before each use
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
memory::RecordStream(tensors[i].Holder(),
places_to_ctx_[key][i]->stream());
}
}
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
}
}
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
return platform::dynload::ncclBroadcast(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
root,
comm,
stream);
},
CommType::BROADCAST);
}
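// Barrier is implemented as an allreduce over a small dummy tensor; the
// tensors are attached to the task so that Wait() also performs a full
// device synchronization.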
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
const BarrierOptions& opts) {
// Only a single card per process is supported
std::vector<phi::GPUPlace> places = {place_};
std::vector<phi::DenseTensor> barrierTensors;
barrierTensors.reserve(places.size());
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
barrierTensors.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
}
auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
nccl_task->barrierTensors_ = std::move(barrierTensors);
return task;
}
void CheckTensorsInDifferentDevices(
const std::vector<phi::DenseTensor>& tensors, const size_t num_devices) {
PADDLE_ENFORCE_EQ(
tensors.size() == 0,
false,
platform::errors::InvalidArgument("Tensor list must be nonempty."));
PADDLE_ENFORCE_LE(
tensors.size(),
num_devices,
platform::errors::InvalidArgument(
"Tensor list mustn't be larger than the number of available GPUs."));
std::set<Place> used_devices;
for (const auto& t : tensors) {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()),
true,
platform::errors::InvalidArgument(
"Tensors must be CUDA and dense tensor."));
const auto inserted = used_devices.insert(t.place()).second;
PADDLE_ENFORCE_EQ(inserted,
true,
platform::errors::InvalidArgument(
"Tensors must be on distinct GPU devices."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV);
return task;
}
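// Send_Partial/Recv_Partial flatten the tensor, slice the range
// [offset, offset + length), and reuse the point-to-point path.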
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
phi::DenseTensor& tensors, int dst_rank, int offset, int length) {
// CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
std::vector<phi::DenseTensor> shared_tensors;
shared_tensors.push_back(shared_input);
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
phi::DenseTensor& tensors, int src_rank, int offset, int length) {
// phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
std::vector<phi::DenseTensor> shared_tensors;
shared_tensors.push_back(shared_input);
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
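// Advance a raw buffer pointer by `offset` elements of the given dtype.
// FLOAT16 is advanced via int16_t since both types are 2 bytes wide.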
void* GetPointerByOffset(void* raw_pointer,
size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in nccl is not supported."));
}
return nullptr;
}
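// AllToAll issues one ncclSend/ncclRecv pair per rank inside a single NCCL
// group, each peer exchanging numel() / size_ elements.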
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll_Single(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
true,
platform::errors::InvalidArgument(
"The dtypes of input and output must be equal."));
std::vector<int64_t> in_dims = phi::vectorize(input.dims());
std::vector<int64_t> out_dims = phi::vectorize(output.dims());
CheckSplitSizes(in_sizes, in_dims);
CheckSplitSizes(out_sizes, out_dims);
size_t in_offset = 0, out_offset = 0;
size_t in_length = 0, out_length = 0;
size_t in_row_size = input.numel() / in_dims[0];
size_t out_row_size = output.numel() / out_dims[0];
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
in_length = in_sizes[i] * in_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), in_offset, input.dtype()),
in_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
in_offset += in_length;
out_length = out_sizes[i] * out_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), out_offset, input.dtype()),
out_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
out_offset += out_length;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL_SINGLE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
opts.root_rank,
comm,
stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
}
},
CommType::SCATTER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::_ReduceScatterBase(
phi::DenseTensor& out_tensor,
phi::DenseTensor& in_tensor,
const ReduceScatterOptions& opts) {
// auto tensor = out_tensors.back();
PADDLE_ENFORCE_EQ(
out_tensor.dtype(),
in_tensor.dtype(),
platform::errors::InvalidArgument(
"Input tensor and output tensor should be same dtype."));
PADDLE_ENFORCE_EQ(
out_tensor.numel() * size_,
in_tensor.numel(),
platform::errors::InvalidArgument("input tensor must be the same size as "
"output tensor size times world_size"));
auto inputs = std::vector<phi::DenseTensor>{in_tensor};
auto outputs = std::vector<phi::DenseTensor>{out_tensor};
return Collective(
inputs,
outputs,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
platform::CUDADeviceGuard cuda_guard;
cuda_guard.SetDevice(output.place());
memory::RecordStream(output.Holder(), stream);
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
input.data(),
output.data(),
output.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
comm,
stream));
},
CommType::REDUCE_SCATTER);
}
void ProcessGroupNCCL::GroupStart() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
}
void ProcessGroupNCCL::GroupEnd() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
constexpr const char* NCCL_BACKEND_NAME = "NCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using CUDAStream = platform::stream::CUDAStream;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
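// NCCL-backed process group. Communicators, comm streams and events are
// cached per unique device-place key (see places_to_ncclcomm_ and the other
// maps below).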
class ProcessGroupNCCL : public ProcessGroup {
public:
class NCCLTask : public ProcessGroup::Task,
public std::enable_shared_from_this<NCCLTask> {
public:
NCCLTask(const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~NCCLTask();
std::vector<EventManager> control_events_;
std::vector<phi::DenseTensor> barrierTensors_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<NCCLCommManager>> ncclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
};
ProcessGroupNCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return std::string(NCCL_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) override;
std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
int dst_rank,
int offset,
int length) override;
std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
int src_rank,
int offset,
int length) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out) override;
std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) override;
static void GroupStart();
static void GroupEnd();
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places,
int rank,
CommType opType,
const std::vector<phi::DenseTensor>& inputs);
protected:
std::shared_ptr<Store> store_;
std::shared_ptr<NCCLCommManager> nccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLCommManager>>>
places_to_ncclcomm_;
std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<CUDADeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, // NOLINT
int root, // NOLINT
int server_fd);
void BroadcastUniqueNCCLID(std::vector<ncclUniqueId>& nccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
template <typename Fn>
void Collective(const phi::DenseTensor*,
phi::DenseTensor*,
Fn fn,
CommType op_type);
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> PointToPoint(
std::vector<phi::DenseTensor>& tensors, // NOLINT
Fn fn,
int dst_rank,
CommType op_type);
void CreateNCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
void CheckSplitSizes(std::vector<int64_t>& split_sizes,
std::vector<int64_t> tensor_shape);
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
namespace paddle {
namespace distributed {
// TODO(shenliang03): To support AVG for reduce
enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT };
struct AllreduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
struct BroadcastOptions {
int source_rank = 0;
int source_root = 0;
};
struct BarrierOptions {
std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
struct ReduceScatterOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
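// Minimal usage sketch (illustrative only, assuming a constructed
// ProcessGroupNCCL `pg` and a std::vector<phi::DenseTensor> `grads` on GPU):
//   AllreduceOptions opts;
//   opts.reduce_op = ReduceOp::MAX;
//   auto task = pg->AllReduce(grads, grads, opts);
//   task->Wait();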
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
namespace paddle {
namespace distributed {
static Backend TransToBackend(platform::Place place) {
static const std::map<phi::AllocationType, Backend> type_backend = {
{phi::AllocationType::GPU, Backend::GPU},
{phi::AllocationType::CPU, Backend::CPU},
};
phi::AllocationType type = place.GetType();
auto it = type_backend.find(type);
PADDLE_ENFORCE_EQ(it != type_backend.end(),
true,
platform::errors::InvalidArgument(
"Place type (%s) is not supported. ", place));
return it->second;
}
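// Partition tensors into fusion groups for allreduce: each sparse gradient
// gets its own group, while dense gradients are grouped by dtype until the
// accumulated byte size reaches the current entry of group_size_limits.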
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor> tensors,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices) {
PADDLE_ENFORCE_EQ(
tensors.size(),
is_sparse_gradient.size(),
platform::errors::PreconditionNotMet(
"tensors len must be equal to is_sparse_gradient len, but "
"[%lu] != [%lu]",
tensors.size(),
is_sparse_gradient.size()));
auto check_perm = [](const std::vector<int64_t> &x) -> bool {
size_t len = x.size();
std::vector<size_t> cnt(len, 0);
for (size_t i = 0; i < len; ++i) {
if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
return false;
}
cnt[x[i]]++;
}
return true;
};
PADDLE_ENFORCE_EQ(true,
check_perm(tensor_indices),
platform::errors::PreconditionNotMet(
"tensor_indices must be a permutation from 0 to %lu",
tensor_indices.size()));
// the return vector
std::vector<std::vector<size_t>> res;
// Key: the var type
// Value: should use which index in group_size_limits for group size limit
std::map<experimental::DataType, size_t> group_limit_index;
// Key: the var type
// Value: <the var index in input tensors, total numel in this group>
std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
next_group;
for (size_t i = 0; i < tensors.size(); ++i) {
const auto &var = tensors[i];
size_t tensor_real_index = i;
if (!tensor_indices.empty()) {
tensor_real_index = tensor_indices[i];
}
if (is_sparse_gradient[tensor_real_index]) {
// keep each sparse var in its own group
res.push_back({tensor_real_index});
continue;
}
const auto &var_dtype = var.dtype();
VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
auto &group_info = next_group[var_dtype];
int64_t var_size = -1;
if (var.is_dense_tensor()) {
var_size =
std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
} else {
VLOG(3) << "var " << var.name()
<< " is not tensor or selected_rows, so skip it";
continue;
}
group_info.first.push_back(tensor_real_index);
group_info.second += experimental::SizeOf(var_dtype) * var_size;
// group_info.second += framework::SizeOfType(var_dtype) * var_size;
if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
// means it is the first var of var_dtype
group_limit_index[var_dtype] = 0;
}
auto &cur_limit_index = group_limit_index[var_dtype];
if (group_info.second >= group_size_limits[cur_limit_index]) {
// exceed group capacity and create a new group
res.emplace_back(std::move(group_info.first));
group_info = std::pair<std::vector<size_t>, size_t>();
cur_limit_index =
(std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
}
}
// add the final groups
for (auto &e : next_group) {
auto &group_info = e.second;
if (!group_info.first.empty()) {
res.emplace_back(std::move(group_info.first));
}
}
for (const auto &group_index : res) {
PADDLE_ENFORCE_NE(
group_index.empty(),
true,
platform::errors::PreconditionNotMet(
"AssignGroupBySize construct empty group, please check."));
}
if (tensor_indices.empty()) {
std::sort(res.begin(),
res.end(),
[](const std::vector<size_t> &x, const std::vector<size_t> &y) {
return x.front() < y.front();
});
}
return res;
}
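// Concatenate the dense gradients of a group into one contiguous buffer
// along dim 0 so they can be allreduced in a single call;
// SplitTensorsForAllReduce below is the inverse operation.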
template <typename DeviceContext, typename T>
static void ConcatTensorsForAllReduce(
const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents) {
operators::math::ConcatFunctor<DeviceContext, T> concat_functor_;
concat_functor_(
context,
dense_tensors_,
0,
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get());
}
template <typename DeviceContext, typename T>
static void SplitTensorsForAllReduce(
const DeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors) {
auto *in =
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get();
std::vector<phi::DenseTensor *> outs;
std::vector<const phi::DenseTensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<DeviceContext, T> split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
// context is used to select the stream for concat
template <typename DeviceContext>
static void ConcatTensorsWithType(
const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents,
phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
ConcatTensorsForAllReduce<DeviceContext, platform::float16>(
context, dense_tensors_, p_dense_contents);
break;
case phi::DataType::FLOAT32:
ConcatTensorsForAllReduce<DeviceContext, float>(
context, dense_tensors_, p_dense_contents);
break;
case phi::DataType::FLOAT64:
ConcatTensorsForAllReduce<DeviceContext, double>(
context, dense_tensors_, p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
type));
}
}
// context is used to select the stream for split
template <typename DeviceContext>
static void SplitTensorsWithType(const DeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors,
phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
SplitTensorsForAllReduce<DeviceContext, platform::float16>(
context, p_dense_contents, p_dense_tensors);
break;
case phi::DataType::FLOAT32:
SplitTensorsForAllReduce<DeviceContext, float>(
context, p_dense_contents, p_dense_tensors);
break;
case phi::DataType::FLOAT64:
SplitTensorsForAllReduce<DeviceContext, double>(
context, p_dense_contents, p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
type));
}
}
void EagerGroup::ConcatTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Concat grad tensor not supported on place (%s)", place));
}
}
void EagerGroup::SplitTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split grad tensor since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", place));
}
}
EagerReducer::EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits,
bool find_unused_parameters)
: tensors_(tensors),
group_indices_(group_indices),
is_sparse_gradient_(is_sparse_gradient),
process_group_(process_group),
group_size_limits_(group_size_limits),
find_unused_vars_each_step_(find_unused_parameters) {
VLOG(3) << "Start construct the Reducer ...";
nranks_ = process_group_->GetSize();
// initialize groups
InitializeGroups(group_indices);
for (size_t global_var_index = 0; global_var_index < tensors_.size();
++global_var_index) {
auto tensor = tensors_[global_var_index];
auto reduce_hook = [=](void) -> void {
this->AddDistHook(global_var_index);
};
const auto &grad_node = GetGradNodeFromTensor(&tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node,"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"));
const auto &accumulation_grad_node =
std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
accumulation_grad_node->RegisterReduceHook(
std::make_shared<egr::CppTensorVoidHook>(reduce_hook));
gradnode_index_map_[grad_node.get()] = global_var_index;
}
vars_marked_ready_.resize(tensors_.size(), false);
local_used_vars_.resize(tensors_.size(), 0);
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
IntArray({static_cast<int32_t>(tensors_.size())}),
DataType::INT32,
inner_place_);
}
}
std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
Tensor *tensor) {
auto *autograd_meta = tensor->get_autograd_meta();
const auto &grad_node =
static_cast<egr::AutogradMeta *>(autograd_meta)->GetMutableGradNode();
return grad_node;
}
void EagerReducer::InitializeGroups(
const std::vector<std::vector<size_t>> &group_indices) {
VLOG(3) << "Start initialize groups ..";
// clear the group
groups_.clear();
groups_.reserve(group_indices.size());
variable_locators_.clear();
variable_locators_.resize(tensors_.size());
auto group_nums = group_indices.size();
for (size_t group_index = 0; group_index < group_nums; ++group_index) {
const auto &tensor_indices_ = group_indices[group_index];
PADDLE_ENFORCE_GT(
tensor_indices_.size(),
0,
platform::errors::PreconditionNotMet(
"The number of group[%d]'s elements is 0.", group_index));
EagerGroup group;
// Just used to check whether the group is sparse or dense
auto first_var = tensors_[tensor_indices_.front()];
if (tensor_indices_.size() == 1 &&
is_sparse_gradient_[tensor_indices_.front()]) {
// process the sparse gradient. one sparse, one group
group.dtype_ = first_var.dtype();
group.is_sparse_ = true;
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
group.dense_contents_ = paddle::experimental::empty(
IntArray({group.all_length_}), group.dtype_, inner_place_);
}
// map tensors to this group by VariableLocator
size_t inside_group_index = 0;
for (const auto var_index : tensor_indices_) {
TensorLocator tensor_locator;
tensor_locator.group_index = group_index;
tensor_locator.inside_group_index = inside_group_index++;
variable_locators_[var_index] = tensor_locator;
}
group.tensor_indices_ = std::move(tensor_indices_);
groups_.emplace_back(std::move(group));
VLOG(3) << "The Group[" << group_index << "]:" << groups_.back();
}
}
void EagerReducer::InitializeDenseGroups(
const std::vector<size_t> &tensor_indices_, EagerGroup *p_group) {
VLOG(3) << "InitializeDenseGroups.";
int64_t all_length = 0;
for (size_t index = 0; index < tensor_indices_.size(); ++index) {
auto tensor_index = tensor_indices_[index];
auto &tensor = tensors_[tensor_index];
auto &tensor_name = tensor.name();
PADDLE_ENFORCE_EQ(is_sparse_gradient_[tensor_index],
false,
platform::errors::PreconditionNotMet(
"Tensor %s's GRAD must be Tensor, but received "
"GRAD is SelectedRows",
tensor_name));
PADDLE_ENFORCE_EQ(tensor.initialized(),
true,
platform::errors::PreconditionNotMet(
"Tensor %s is not initialized.", tensor_name));
const auto size = tensor.numel();
PADDLE_ENFORCE_GT(
size,
0,
platform::errors::PreconditionNotMet(
"The number of tensor %s's elements is 0.", tensor_name));
all_length += size;
p_group->length_.push_back(size);
// for concat operator
p_group->origin_shapes_.push_back(IntArray(tensor.shape()));
p_group->dense_tensors_.push_back(phi::DenseTensor());
const auto &dtype = tensor.dtype();
const auto &inner_place = tensor.impl()->place();
if (index > 0) {
PADDLE_ENFORCE_EQ(dtype,
p_group->dtype_,
platform::errors::PreconditionNotMet(
"Tensor %s has unexpected dtype.", tensor_name));
} else {
p_group->dtype_ = dtype;
inner_place_ = inner_place;
}
}
p_group->all_length_ = all_length;
}
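// Breadth-first traversal of the autograd graph starting from the forward
// outputs; any parameter whose grad node is never visited is recorded in
// unused_vars_.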
void EagerReducer::TraverseBackwardGraph(const std::vector<Tensor> &outputs) {
std::queue<egr::GradNodeBase *> queue;
std::set<egr::GradNodeBase *> visited;
for (const auto &output : outputs) {
auto *auto_grad_meta =
static_cast<egr::AutogradMeta *>(output.get_autograd_meta());
if (!auto_grad_meta) continue;
auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
auto_grad_meta->StopGradient()) {
continue;
}
egr::GradNodeBase *grad_node = shared_grad_node.get();
queue.emplace(grad_node);
}
while (!queue.empty()) {
egr::GradNodeBase *node = queue.front();
queue.pop();
const paddle::small_vector<std::vector<egr::GradSlotMeta>,
egr::kSlotSmallVectorSize> &metas =
node->OutputMeta();
for (size_t i = 0; i < metas.size(); i++) {
for (size_t j = 0; j < metas[i].size(); j++) {
const egr::Edge &edge = metas[i][j].GetEdge();
auto next_node_shared = edge.GetMutableGradNode();
if (!next_node_shared || !next_node_shared.get()) {
continue;
}
auto *next_node = next_node_shared.get();
const bool was_inserted = visited.insert(next_node).second;
if (was_inserted) {
queue.emplace(next_node);
}
}
}
}
for (const auto &it : gradnode_index_map_) {
if (visited.count(it.first) == 0) {
unused_vars_.push_back(it.second);
VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
<< "Tensor " << tensors_[it.second].name() << " at index "
<< it.second << " is marked as unused.";
}
}
}
void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
VLOG(3) << "after forward, then reset count for backward.";
grad_need_hooks_ = true;
next_group_ = 0;
std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) {
group.pending_ = group.tensor_indices_.size();
group.sparse_contents_ = Tensor();
});
// reinitialize vars_marked_ready_ for next iteration
vars_marked_ready_.clear();
vars_marked_ready_.resize(tensors_.size(), false);
PADDLE_ENFORCE_EQ(
groups_need_finalize_,
false,
platform::errors::PreconditionNotMet(
"A serious error has occurred here. Please "
"set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have "
"set, There may be several reasons for this error: "
"1) Please note that all forward outputs derived from the module "
"parameters must participate in the calculation of losses and "
"subsequent gradient calculations. If not, the wrapper will hang, "
"waiting for autograd to generate gradients for these parameters. "
"you can use detach or stop_gradient to make the unused parameters "
"detached from the autograd graph. "
"2) Used multiple forwards and one backward. You may be able to wrap "
"multiple forwards in a model."));
// The first var to trigger the unused parameter
has_marked_unused_vars_ = false;
if (find_unused_vars_once_ || find_unused_vars_each_step_) {
unused_vars_.clear();
TraverseBackwardGraph(outputs);
// only check once in first step
find_unused_vars_once_ = false;
}
if (find_unused_vars_each_step_ && unused_vars_.empty()) {
LOG_FIRST_N(WARNING, 1)
<< "All parameters are involved in the backward pass. "
"It is recommended to set find_unused_parameters to False "
"to improve performance. However, if unused parameters "
"appear in subsequent iterative training, then an error "
"will occur. Please make it clear that in the subsequent "
"training, there will be no parameters that are not used "
"in the backward pass, and then set find_unused_parameters";
}
if (unused_vars_.size() == tensors_.size()) {
LOG_FIRST_N(WARNING, 1)
<< "There is no parameter in the device involved "
"in the backward calculation. If there are "
"parameters on other devices involved in the "
"backward, then a serious error will occur here.";
}
}
void EagerReducer::AddDistHook(size_t var_index) {
PADDLE_ENFORCE_LT(var_index,
variable_locators_.size(),
platform::errors::OutOfRange(
"Out of bounds variable index. it must be less"
"than %d, but it is %d",
variable_locators_.size(),
var_index));
// gradient synchronization is not required when grad_need_hooks_ is false.
if (!grad_need_hooks_) {
return;
}
VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
<< "@Grad] arrived and triggered disthook";
local_used_vars_[var_index] = 1;
if (!has_marked_unused_vars_) {
has_marked_unused_vars_ = true;
for (const auto unused_index : unused_vars_) {
MarkVarReady(unused_index, false);
}
}
MarkVarReady(var_index, true);
}
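// Copy (or zero-fill) the ready gradient into its slot of the group buffer;
// once all tensors of a group have arrived, the group's allreduce is
// scheduled via MarkGroupReady.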
void EagerReducer::MarkVarReady(const size_t var_index,
const bool is_used_var) {
VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name()
<< "] is marked ready.";
// error happened, if the var is ready before.
if (vars_marked_ready_[var_index]) {
auto error_info = string::Sprintf(
"Error happened, when parameter[%d][%s] has been ready before. "
"Please set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have set, "
"there may be several reasons for this error: "
"1) In multiple reentrant backward phase, some parameters are reused."
"2) Using model parameters outside of forward function. Please "
"make sure that model parameters are not shared in concurrent "
"forward-backward passes.",
var_index,
tensors_[var_index].name());
PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
false,
platform::errors::PreconditionNotMet(error_info));
error_info +=
"3) Unused parameters retrieval is incorrect. "
"The return value of forward will be used to retrieve"
" the unused parameters of the entire model. These "
"gradients of unused parameters will not be synchronized "
"between multiple cards. However, if the unused "
"parameters participate in the backward calculation "
"again at a later time (e.g. after the forward function, "
"the loss calculation uses the unused "
"paramters of the forward and trigger backward), "
"its gradient will be wrong.";
PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
true,
platform::errors::PreconditionNotMet(error_info));
} else {
vars_marked_ready_[var_index] = true;
}
groups_need_finalize_ = true;
const auto &var_locator = variable_locators_[var_index];
const auto group_index = var_locator.group_index;
const auto inside_group_index = var_locator.inside_group_index;
auto &group = groups_[group_index];
auto &group_tensor = group.dense_tensors_[inside_group_index];
const auto length = group.length_[inside_group_index];
if (!group.is_sparse_) {
if (is_used_var) {
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor =
static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
group_tensor
.ShareDataWith(*(
std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
.Resize({grad_tensor.numel()});
} else {
// TODO(shenliang03): maybe save the memory by avoiding tensor
// construction
if (!group_tensor.initialized()) {
group_tensor.Resize({static_cast<int64_t>(length)});
group_tensor.mutable_data(inner_place_, group.dtype_);
}
if (HasGrad(var_index)) {
VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad";
auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]);
group_tensor
.ShareDataWith(*(std::dynamic_pointer_cast<phi::DenseTensor>(
grad_tensor->impl())))
.Resize({length});
} else {
VLOG(3) << "Tensor[" << tensors_[var_index].name()
<< "] doesn't have grad";
auto *dev_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
group_tensor.Resize({static_cast<int64_t>(length)});
phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
}
}
} else {
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
// process sparse group
PADDLE_ENFORCE_EQ(
HasGrad(var_index),
true,
platform::errors::PreconditionNotMet(
"The sparse parameter[%d][%s] should have gradient. "
"Currently, DataParallel does not support sparse "
"parameters without generating gradients during training. "
"For example, if is_sparese=True is used in Embedding, "
"the current step of this parameter cannot generate gradient "
"because of stop_gradient/detatch, where error will occur.",
var_index,
tensors_[var_index].name()));
// need to check tensor type
PADDLE_ENFORCE_EQ(
grad_tensor.is_selected_rows(),
true,
platform::errors::PreconditionNotMet(
"The sparse parameter[%d][%s] must have a selectedrows gradient. "
"Before forward pass, the parameter type is inferred to be "
"SelectedRows, but after backward pass, its actual type becomes "
"LodTensor. It is currently not supported by DataParallel. "
"For example, if sparse embedding is used, and the weight of "
"embedding is shared with subsequent dense parameters, then "
"the parameter gradient of the embedding will be converted "
"to dense parameters.",
var_index,
tensors_[var_index].name()));
group.sparse_contents_.set_impl(grad_tensor.impl());
}
if (--group.pending_ == 0) {
// can start allreduce
MarkGroupReady(group_index);
}
if (next_group_ == groups_.size()) {
FinalizeBackward();
}
}
void EagerReducer::MarkGroupReady(size_t group_index) {
VLOG(3) << "Group[" << group_index << "] is ready";
PADDLE_ENFORCE_GE(
group_index,
next_group_,
platform::errors::PreconditionNotMet(
"The index of the incoming group must be greater "
"than or equal to the previously synchronized group index, "
"expect it to greater than or equal to %d, but got %d.",
next_group_,
group_index));
if (group_index > next_group_) {
VLOG(3) << "It will adjust the order of group in next batch automatically";
return;
}
for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
++next_group_) {
UNUSED auto &group = groups_[next_group_];
if (group.is_sparse_) {
AllReduceSparse(&group, next_group_);
} else {
FusedAllReduceSchedule(&group, next_group_);
}
}
}
bool EagerReducer::HasGrad(size_t var_index) {
auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]);
if (grad && grad->initialized()) {
return true;
} else {
return false;
}
}
void EagerReducer::ProcessUnusedDenseVars() {
// The calculation stream must be used here to
// avoid conflicts with communication.
VLOG(3) << "Local used vars : "
<< string::join_strings(local_used_vars_, ',');
const auto *dev_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
auto *global_used_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
.get();
framework::TensorFromVector<int32_t>(
local_used_vars_, *dev_ctx, global_used_tensor);
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {global_used_vars_};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllReduce(in_out, in_out, opts)->Synchronize();
framework::TensorToVector<int>(
*global_used_tensor, *dev_ctx, &local_used_vars_);
dev_ctx->Wait();
// Sync the compute stream to get the global used-var message;
// this may affect performance.
VLOG(3) << "Global used vars : "
<< string::join_strings(local_used_vars_, ',');
for (const auto var_index : unused_vars_) {
const bool global_unused = (local_used_vars_[var_index] == 0);
// global used but local unused, set grad
VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
<< "Var [" << var_index << "] [" << tensors_[var_index].name()
<< "] global_unused: " << global_unused
<< " has grad: " << HasGrad(var_index);
if (!global_unused) {
VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
<< process_group_->GetRank() << "]";
const auto &var_locator = variable_locators_[var_index];
const auto group_index = var_locator.group_index;
const auto &group = groups_[group_index];
const auto inside_group_index = var_locator.inside_group_index;
auto &src_tensor = group.dense_tensors_[inside_group_index];
// Sparse parameters need no check here; find_unused_parameters is not supported for them.
if (group.is_sparse_) {
continue;
}
// NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
// gradient accumulation can continue normally after clear_gradients()
// especially in cases involving complex control flow.
std::static_pointer_cast<egr::GradNodeAccumulation>(
GetGradNodeFromTensor(&tensors_[var_index]))
->SetFakeEmpty(false);
Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));
auto dest_var_base = tensors_[var_index];
auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
grad_tensor->copy_(grad_value, inner_place_, true);
grad_tensor->reshape(dest_var_base.shape());
}
}
}
void EagerReducer::FinalizeBackward() {
groups_need_finalize_ = false;
grad_need_hooks_ = false;
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.task->Synchronize();
}
}
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.SplitTensors(inner_place_);
}
}
if (find_unused_vars_each_step_) {
ProcessUnusedDenseVars();
local_used_vars_.clear();
local_used_vars_.resize(tensors_.size(), 0);
VLOG(3) << "ProcessUnusedDenseVars is finished.";
}
VLOG(3) << "In the batch, Reducer is finished.";
}
void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
const int curr_group_index) {
// The overall timeline: concat > div_nranks > allreduce > split
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";
// concat tensors
group->ConcatTensors(inner_place_);
// div nranks
paddle::experimental::scale_(
group->dense_contents_, 1.0 / nranks_, 0.0, false);
// all_reduce
std::vector<Tensor> reduce_tensors = {group->dense_contents_};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
group->task = process_group_->AllReduce(in_out, in_out, opts);
// split in FinalizeBackward()
}
void EagerReducer::AllReduceSparse(EagerGroup *group,
const int curr_group_index) {
// div nranks
Tensor sparse_tensor(group->sparse_contents_);
paddle::experimental::scale_(sparse_tensor, 1.0 / nranks_, 0.0, false);
VLOG(3) << "sparse_group [" << curr_group_index << "] start allreduce.";
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
if (platform::is_gpu_place(inner_place_)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(inner_place_)) {
dev_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", inner_place_));
}
auto src = std::dynamic_pointer_cast<phi::SelectedRows>(
group->sparse_contents_.impl());
const auto &src_rows = src->rows();
const auto &rank_ = process_group_->GetRank();
const auto &size_ = process_group_->GetSize();
framework::Vector<int64_t> rows_num_vector(size_);
rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());
Tensor rows_num_tensor = paddle::experimental::empty(
IntArray({static_cast<int64_t>(size_)}), DataType::INT64, inner_place_);
auto *rows_num_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(rows_num_tensor.impl()).get();
framework::TensorFromVector<int64_t>(
rows_num_vector, *dev_ctx, rows_num_dense_tensor);
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {rows_num_tensor};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllReduce(in_out, in_out, opts)->Synchronize();
framework::TensorToVector<int64_t>(
*rows_num_dense_tensor, *dev_ctx, &rows_num_vector);
dev_ctx->Wait();
const auto *cpu_rows_num_ptr = rows_num_vector.data();
auto rows_num = std::accumulate(
cpu_rows_num_ptr, cpu_rows_num_ptr + size_, static_cast<int64_t>(0));
VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',')
<< ", total rows number: " << rows_num
<< ", height: " << src->height();
dev_ctx->Wait();
Tensor src_value_tensor(std::make_shared<phi::DenseTensor>(src->value()));
std::vector<int64_t> dst_shape = src_value_tensor.shape();
if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) {
return row == cpu_rows_num_ptr[0];
})) {
// During sparse communication, every card has the same number of rows,
// so allgather replaces broadcast to speed up the allreduce.
VLOG(3) << "allgather replaces broadcast to speed up sparse allreduce";
Tensor dst_rows_tensor =
paddle::experimental::empty(IntArray({static_cast<int64_t>(rows_num)}),
DataType::INT64,
inner_place_);
Tensor src_rows_tensor = paddle::experimental::empty(
IntArray({static_cast<int64_t>((*src).rows().size())}),
DataType::INT64,
inner_place_);
auto *src_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(src_rows_tensor.impl())
.get();
framework::TensorFromVector<int64_t>(
(*src).rows(), *dev_ctx, src_rows_dense_tensor);
std::vector<Tensor> src_rows_tensors = {src_rows_tensor};
std::vector<Tensor> dst_rows_tensors = {dst_rows_tensor};
std::vector<phi::DenseTensor> in;
std::vector<phi::DenseTensor> out;
for (auto &t : src_rows_tensors) {
in.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
for (auto &t : dst_rows_tensors) {
out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllGather(in, out)->Synchronize();
framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
auto *dst_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
.get();
framework::TensorToVector<int64_t>(
*dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
dev_ctx->Wait();
dst_shape[dst_shape.size() - 2] = rows_num;
auto dst_dense_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
paddle::experimental::full(
IntArray(dst_shape), 0, src_value_tensor.dtype(), inner_place_)
.impl());
auto dst =
std::make_shared<phi::SelectedRows>(dst_rows_vector, (*src).height());
*(dst->mutable_value()) = *dst_dense_tensor;
Tensor dst_value_tensor(std::make_shared<phi::DenseTensor>(dst->value()));
std::vector<Tensor> src_value_tensors = {src_value_tensor};
std::vector<Tensor> dst_value_tensors = {dst_value_tensor};
std::vector<phi::DenseTensor> src_dense;
std::vector<phi::DenseTensor> dst_dense;
for (auto &t : src_value_tensors) {
src_dense.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
for (auto &t : dst_value_tensors) {
dst_dense.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllGather(src_dense, dst_dense)->Synchronize();
src->set_rows(dst_rows_vector);
*(src->mutable_value()) =
*(std::dynamic_pointer_cast<phi::DenseTensor>(dst_value_tensor.impl()));
} else {
std::vector<Tensor> rows_tensors;
std::vector<Tensor> values_tensors;
for (int i = 0; i < size_; ++i) {
std::vector<int64_t> value_tensor_shape = {
cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]};
Tensor rows_tensor = paddle::experimental::full(
IntArray({static_cast<int64_t>(cpu_rows_num_ptr[i])}),
0,
DataType::INT64,
inner_place_);
Tensor values_tensor = paddle::experimental::full(
IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_);
std::vector<phi::DenseTensor> rows_dense_vector;
std::vector<phi::DenseTensor> values_dense_vector;
if (i == rank_) {
auto *rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl())
.get();
framework::TensorFromVector<int64_t>(
src_rows, *dev_ctx, rows_dense_tensor);
values_tensor.set_impl(
std::make_shared<phi::DenseTensor>(src->value()));
}
rows_dense_vector.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl()));
values_dense_vector.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(values_tensor.impl()));
auto b_opts = BroadcastOptions();
b_opts.source_rank = i;
process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts);
process_group_
->Broadcast(values_dense_vector, values_dense_vector, b_opts)
->Wait();
rows_tensors.push_back(rows_tensor);
values_tensors.push_back(values_tensor);
}
Tensor dst_rows_tensor =
paddle::experimental::concat(rows_tensors, phi::Scalar(0));
framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
auto *dst_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
.get();
framework::TensorToVector<int64_t>(
*dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
src->set_rows(dst_rows_vector);
Tensor dst_values_tensor =
paddle::experimental::concat(values_tensors, phi::Scalar(0));
*(src->mutable_value()) = *(
std::dynamic_pointer_cast<phi::DenseTensor>(dst_values_tensor.impl()));
}
}
std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
const auto &tensors_ = group.tensor_indices_;
out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size()
<< "\n";
auto begin = tensors_.begin();
auto end = tensors_.end();
out << "[";
for (int i = 0; begin != end && i < 100; ++i, ++begin) {
if (i > 0) out << ' ';
out << *begin;
}
if (begin != end) {
out << " ...";
}
out << "]\n";
return out;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/string/string_helper.h"
namespace paddle {
namespace distributed {
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
using IntArray =
paddle::experimental::IntArrayBase<paddle::experimental::Tensor>;
using Backend = paddle::experimental::Backend;
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor>,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices = {});
class EagerGroup {
public:
Tensor dense_contents_;
Tensor sparse_contents_;
bool is_sparse_ = false;
// for concat kernel
std::vector<phi::DenseTensor> dense_tensors_;
std::vector<int64_t> length_;
int64_t all_length_{0};
std::vector<IntArray> origin_shapes_;
// Global indices of participating tensors in the group
std::vector<size_t> tensor_indices_;
// Number of params that haven't been ready. When it is 0, it means
// the group is ready.
size_t pending_ = -1;
// external message of group
phi::DataType dtype_;
// help to sync
std::shared_ptr<ProcessGroup::Task> task;
// context is used to select the stream for concat
void ConcatTensors(const platform::Place &);
// context is used to select the stream for split
void SplitTensors(const platform::Place &);
friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
struct TensorLocator {
// record the index in groups_
size_t group_index;
size_t inside_group_index;
};
class EagerReducer {
public:
explicit EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits,
bool find_unused_parameters);
virtual ~EagerReducer() {}
std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);
void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
EagerGroup *p_group);
void PrepareForBackward(const std::vector<Tensor> &outputs);
void AddDistHook(size_t var_index);
void MarkVarReady(const size_t var_index, const bool is_used_var);
void MarkGroupReady(const size_t group_index);
void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
void AllReduceSparse(EagerGroup *group, const int curr_group_index);
void FinalizeBackward();
void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
void ProcessUnusedDenseVars();
bool HasGrad(size_t var_index);
private:
std::vector<Tensor> tensors_;
std::vector<std::vector<size_t>> group_indices_;
std::vector<bool> is_sparse_gradient_;
std::shared_ptr<distributed::ProcessGroup> process_group_;
std::vector<size_t> group_size_limits_;
std::vector<EagerGroup> groups_;
std::vector<TensorLocator> variable_locators_;
platform::Place inner_place_;
size_t next_group_ = 0;
int64_t nranks_ = -1;
bool grad_need_hooks_{false};
std::vector<bool> vars_marked_ready_;
std::vector<int32_t> local_used_vars_;
// Following variables are to help unused vars
std::vector<size_t> unused_vars_;
std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
bool has_marked_unused_vars_{false};
bool find_unused_vars_each_step_{false};
bool find_unused_vars_once_{true};
bool groups_need_finalize_{false};
Tensor global_used_vars_;
};
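// Illustrative lifecycle sketch (not part of this header): `params`, `outputs`
// and `process_group` below are assumed to be provided by the caller.
//
//   std::vector<size_t> limits = {25 * 1024 * 1024, 100 * 1024 * 1024};
//   auto group_indices =
//       Eager_AssignGroupBySize(params, is_sparse_gradient, limits);
//   auto reducer = std::make_shared<EagerReducer>(
//       params, group_indices, is_sparse_gradient, process_group, limits,
//       /*find_unused_parameters=*/false);
//   reducer->PrepareForBackward(outputs);  // once per step, before backward
//   // Gradient hooks then call AddDistHook(var_index); when a group's pending_
//   // count drops to zero it is allreduced, and FinalizeBackward() splits the
//   // fused result back into per-parameter gradients.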
} // namespace distributed
} // namespace paddle
cc_library(
afs_wrapper
SRCS afs_warpper.cc
DEPS fs ps_framework_proto)
#set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/common/afs_warpper.h"
#include "paddle/fluid/framework/io/fs.h"
namespace paddle {
namespace distributed {
// AfsClient impl
int AfsClient::initialize(const FsClientParameter& fs_client_param) {
// temporarily implemented with hdfs-client
return initialize(fs_client_param.hadoop_bin(),
fs_client_param.uri(),
fs_client_param.user(),
fs_client_param.passwd(),
fs_client_param.buffer_size());
}
int AfsClient::initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& user,
const std::string& passwd,
int buffer_size_param) {
return initialize(
hadoop_bin,
uri,
paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()),
buffer_size_param);
}
int AfsClient::initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& ugi,
int buffer_size_param) {
// temporarily implemented with hdfs-client
size_t buffer_size = 1L << 25; // 32MB
if (buffer_size_param > static_cast<int>(buffer_size)) {
buffer_size = buffer_size_param;
}
paddle::framework::hdfs_set_buffer_size(buffer_size);
paddle::framework::hdfs_set_command(paddle::string::format_string(
"2>>./hdfs_err.log %s fs -Dfs.default.name=%s -Dhadoop.job.ugi=%s "
"-Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=300000",
hadoop_bin.c_str(),
uri.c_str(),
ugi.c_str()));
return 0;
}
// open file in 'w' or 'r'
std::shared_ptr<FsReadChannel> AfsClient::open_r(const FsChannelConfig& config,
uint32_t buffer_size,
int* err_no) {
std::shared_ptr<FsReadChannel> channel =
std::make_shared<FsReadChannel>(buffer_size);
std::shared_ptr<FILE> fp =
paddle::framework::fs_open_read(config.path, err_no, config.deconverter);
channel->open(fp, config);
return channel;
}
std::shared_ptr<FsWriteChannel> AfsClient::open_w(const FsChannelConfig& config,
uint32_t buffer_size,
int* err_no) {
std::shared_ptr<FsWriteChannel> channel =
std::make_shared<FsWriteChannel>(buffer_size);
std::shared_ptr<FILE> fp =
paddle::framework::fs_open_write(config.path, err_no, config.converter);
channel->open(fp, config);
return channel;
}
// remove file(s) at path; path may be a glob pattern, such as 'part-000-*'
void AfsClient::remove(const std::string& path) {
return paddle::framework::fs_remove(path);
}
void AfsClient::remove_dir(const std::string& dir) {
return paddle::framework::fs_remove(dir);
}
// list files under path; path may be a directory with a glob pattern
std::vector<std::string> AfsClient::list(const std::string& path) {
return paddle::framework::fs_list(path);
}
// exist or not
bool AfsClient::exist(const std::string& dir) {
return paddle::framework::fs_exists(dir);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
struct FsDataConverter {
std::string converter;
std::string deconverter;
};
struct FsChannelConfig {
std::string path; // path of file
std::string converter; // data converter
std::string deconverter;
};
class FsReadChannel {
public:
FsReadChannel() : _buffer_size(0) {}
explicit FsReadChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
virtual ~FsReadChannel() {}
FsReadChannel(FsReadChannel&&) = delete;
FsReadChannel(const FsReadChannel&) = delete;
int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
_file = fp;
return 0;
}
inline int close() {
_file.reset();
return 0;
}
inline uint32_t read_line(std::string& line_data) { // NOLINT
line_data.clear();
char buffer = '\0';
size_t read_count = 0;
while (1 == fread(&buffer, 1, 1, _file.get()) && buffer != '\n') {
++read_count;
line_data.append(&buffer, 1);
}
if (read_count == 0 && buffer != '\n') {
return -1;
}
return 0;
}
private:
uint32_t _buffer_size;
FsChannelConfig _config;
std::shared_ptr<FILE> _file;
};
class FsWriteChannel {
public:
FsWriteChannel() : _buffer_size(0) {}
explicit FsWriteChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
virtual ~FsWriteChannel() {}
FsWriteChannel(FsWriteChannel&&) = delete;
FsWriteChannel(const FsWriteChannel&) = delete;
int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
_file = fp;
// the buffer has already been set in fs.cc
// if (_buffer_size != 0) {
// _buffer = std::shared_ptr<char>(new char[_buffer_size]);
// CHECK(0 == setvbuf(&*_file, _buffer.get(), _IOFBF, _buffer_size));
//}
return 0;
}
inline void flush() { return; }
inline int close() {
flush();
_file.reset();
return 0;
}
inline uint32_t write_line(const char* data, uint32_t size) {
size_t write_count = fwrite_unlocked(data, 1, size, _file.get());
if (write_count != size) {
return -1;
}
write_count = fwrite_unlocked("\n", 1, 1, _file.get());
if (write_count != 1) {
return -1;
}
return 0;
}
inline uint32_t write_line(const std::string& data) {
return write_line(data.c_str(), data.size());
}
private:
uint32_t _buffer_size;
FsChannelConfig _config;
std::shared_ptr<FILE> _file;
std::shared_ptr<char> _buffer;
};
class AfsClient {
public:
AfsClient() {}
virtual ~AfsClient() {}
AfsClient(AfsClient&&) = delete;
AfsClient(const AfsClient&) = delete;
int initialize(const FsClientParameter& fs_client_param);
int initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& user,
const std::string& passwd,
int buffer_size_param = (1L << 25));
int initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& ugi,
int buffer_size_param = (1L << 25));
// open file in 'w' or 'r'
std::shared_ptr<FsReadChannel> open_r(const FsChannelConfig& config,
uint32_t buffer_size = 0,
int* err_no = nullptr);
std::shared_ptr<FsWriteChannel> open_w(const FsChannelConfig& config,
uint32_t buffer_size = 0,
int* err_no = nullptr);
// remove file(s) at path; path may be a glob pattern, such as 'part-000-*'
void remove(const std::string& path);
void remove_dir(const std::string& dir);
// list files under path; path may be a directory with a glob pattern
std::vector<std::string> list(const std::string& path);
// exist or not
bool exist(const std::string& dir);
};
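// Minimal usage sketch (illustrative only; the hadoop binary, uri and ugi
// values below are placeholders):
//
//   AfsClient client;
//   client.initialize("./hadoop/bin/hadoop", "hdfs://example:9000", "user,passwd");
//   FsChannelConfig config;
//   config.path = "part-00000";
//   int err_no = 0;
//   auto reader = client.open_r(config, /*buffer_size=*/0, &err_no);
//   std::string line;
//   while (reader->read_line(line) == 0) { /* consume line */ }
//   reader->close();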
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <algorithm>
#include <cstdlib>
#include <utility>
namespace paddle {
namespace distributed {
// Fast allocation and deallocation of objects by allocating them in chunks.
template <class T>
class ChunkAllocator {
public:
explicit ChunkAllocator(size_t chunk_size = 64) {
CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T)));
_chunk_size = chunk_size;
_chunks = NULL;
_free_nodes = NULL;
_counter = 0;
}
ChunkAllocator(const ChunkAllocator&) = delete;
~ChunkAllocator() {
while (_chunks != NULL) {
Chunk* x = _chunks;
_chunks = _chunks->next;
free(x);
}
}
template <class... ARGS>
T* acquire(ARGS&&... args) {
if (_free_nodes == NULL) {
create_new_chunk();
}
T* x = (T*)(void*)_free_nodes; // NOLINT
_free_nodes = _free_nodes->next;
new (x) T(std::forward<ARGS>(args)...);
_counter++;
return x;
}
void release(T* x) {
x->~T();
Node* node = (Node*)(void*)x; // NOLINT
node->next = _free_nodes;
_free_nodes = node;
_counter--;
}
size_t size() const { return _counter; }
private:
struct alignas(T) Node {
union {
Node* next;
char data[sizeof(T)];
};
};
struct Chunk {
Chunk* next;
Node nodes[];
};
size_t _chunk_size; // how many elements in one chunk
Chunk* _chunks; // a list
Node* _free_nodes; // a list
size_t _counter; // how many elements are acquired
void create_new_chunk() {
Chunk* chunk;
posix_memalign(reinterpret_cast<void**>(&chunk),
std::max<size_t>(sizeof(void*), alignof(Chunk)),
sizeof(Chunk) + sizeof(Node) * _chunk_size);
chunk->next = _chunks;
_chunks = chunk;
for (size_t i = 0; i < _chunk_size; i++) {
Node* node = &chunk->nodes[i];
node->next = _free_nodes;
_free_nodes = node;
}
}
};
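// Usage sketch (illustrative only): acquire() placement-news a T inside pooled
// chunk memory and release() destroys it and recycles the node.
//
//   ChunkAllocator<std::pair<int, float>> alloc(/*chunk_size=*/128);
//   auto* p = alloc.acquire(7, 0.5f);  // constructed in-place from a free node
//   // ... use *p ...
//   alloc.release(p);  // runs ~T() and returns the node to the free list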
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_map>
#include "butil/time.h"
#include "bvar/latency_recorder.h"
#include "glog/logging.h"
namespace paddle {
namespace distributed {
struct CostProfilerNode {
std::shared_ptr<bvar::LatencyRecorder> recorder;
};
class CostProfiler {
public:
~CostProfiler() {}
static CostProfiler& instance() {
static CostProfiler profiler;
return profiler;
}
void register_profiler(const std::string& label) {
if (_cost_profiler_map.find(label) != _cost_profiler_map.end()) {
return;
}
auto profiler_node = std::make_shared<CostProfilerNode>();
profiler_node->recorder.reset(
new bvar::LatencyRecorder("cost_profiler", label));
_cost_profiler_map[label] = profiler_node;
}
CostProfilerNode* profiler(const std::string& label) {
auto itr = _cost_profiler_map.find(label);
if (itr != _cost_profiler_map.end()) {
return itr->second.get();
}
return NULL;
}
private:
CostProfiler() {}
std::unordered_map<std::string, std::shared_ptr<CostProfilerNode>>
_cost_profiler_map;
};
class CostTimer {
public:
explicit CostTimer(const std::string& label) {
_label = label;
auto& profiler = CostProfiler::instance();
_profiler_node = profiler.profiler(label);
// If the label is not registered in the profiler, log the cost via VLOG instead.
_is_print_cost = _profiler_node == NULL;
_start_time_ms = butil::gettimeofday_ms();
}
explicit CostTimer(CostProfilerNode& profiler_node) { // NOLINT
_is_print_cost = false;
_profiler_node = &profiler_node;
_start_time_ms = butil::gettimeofday_ms();
}
~CostTimer() {
if (_is_print_cost) {
VLOG(3) << "CostTimer label:" << _label
<< ", cost:" << butil::gettimeofday_ms() - _start_time_ms << "ms";
} else {
*(_profiler_node->recorder) << butil::gettimeofday_ms() - _start_time_ms;
}
}
private:
std::string _label;
bool _is_print_cost;
uint64_t _start_time_ms;
CostProfilerNode* _profiler_node;
};
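// Usage sketch (illustrative only; "pull_sparse" is a placeholder label):
//
//   CostProfiler::instance().register_profiler("pull_sparse");
//   {
//     CostTimer timer("pull_sparse");  // starts timing on construction
//     // ... timed work ...
//   }  // destructor records the elapsed milliseconds into the bvar recorder
//   // Labels that were never registered fall back to a VLOG(3) cost message.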
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <time.h>
#include <atomic>
#include <random>
namespace paddle {
namespace distributed {
// Get time in seconds.
inline double current_realtime() {
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return tp.tv_sec + tp.tv_nsec * 1e-9;
}
inline std::default_random_engine& local_random_engine() {
struct engine_wrapper_t {
std::default_random_engine engine;
engine_wrapper_t() {
static std::atomic<unsigned long> x(0); // NOLINT
std::seed_seq sseq = {
x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; // NOLINT
engine.seed(sseq);
}
};
thread_local engine_wrapper_t r;
return r.engine;
}
template <class T = double>
std::uniform_real_distribution<T>& local_uniform_real_distribution() {
thread_local std::uniform_real_distribution<T> distr;
assert(distr.a() == 0.0 && distr.b() == 1.0);
return distr;
}
template <class T = double>
T uniform_real() {
return local_uniform_real_distribution<T>()(local_random_engine());
}
template <class T = double>
T uniform_real(T a, T b) {
if (a == b) {
return a;
}
return (T)(a + uniform_real<T>() * (b - a));
}
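// Usage sketch (illustrative only): the engine is thread_local, so concurrent
// callers neither contend nor share a random sequence.
//
//   double p = uniform_real<double>();           // uniform in [0, 1)
//   double x = uniform_real<double>(-1.0, 1.0);  // uniform in [-1, 1)
//   auto& eng = local_random_engine();           // reusable with other distributions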
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>
namespace paddle {
namespace distributed {
class Any {
public:
Any() : content_(NULL) {}
template <typename ValueType>
Any(const ValueType &value) : content_(new Holder<ValueType>(value)) {}
Any(const Any &other)
: content_(other.content_ ? other.content_->clone() : NULL) {}
~Any() { delete content_; }
template <typename ValueType>
ValueType *any_cast() {
return content_ ? &static_cast<Holder<ValueType> *>(content_)->held_ : NULL;
}
private:
class PlaceHolder {
public:
virtual ~PlaceHolder() {}
virtual PlaceHolder *clone() const = 0;
};
template <typename ValueType>
class Holder : public PlaceHolder {
public:
explicit Holder(const ValueType &value) : held_(value) {}
virtual PlaceHolder *clone() const { return new Holder(held_); }
ValueType held_;
};
PlaceHolder *content_;
};
class ObjectFactory {
public:
ObjectFactory() {}
virtual ~ObjectFactory() {}
virtual Any NewInstance() { return Any(); }
private:
};
typedef std::map<std::string, ObjectFactory *> FactoryMap;
typedef std::map<std::string, FactoryMap> PsCoreClassMap;
#ifdef __cplusplus
extern "C" {
#endif
inline PsCoreClassMap &global_factory_map() {
static PsCoreClassMap *base_class = new PsCoreClassMap();
return *base_class;
}
#ifdef __cplusplus
}
#endif
inline PsCoreClassMap &global_factory_map_cpp() { return global_factory_map(); }
// typedef pa::Any Any;
// typedef ::FactoryMap FactoryMap;
#define REGISTER_PSCORE_REGISTERER(base_class) \
class base_class##Registerer { \
public: \
static base_class *CreateInstanceByName(const ::std::string &name) { \
if (global_factory_map_cpp().find(#base_class) == \
global_factory_map_cpp().end()) { \
LOG(ERROR) << "Can't Find BaseClass For CreateClass with:" \
<< #base_class; \
return NULL; \
} \
FactoryMap &map = global_factory_map_cpp()[#base_class]; \
FactoryMap::iterator iter = map.find(name); \
if (iter == map.end()) { \
LOG(ERROR) << "Can't Find Class For Create with:" << name; \
return NULL; \
} \
Any object = iter->second->NewInstance(); \
return *(object.any_cast<base_class *>()); \
} \
};
#define REGISTER_PSCORE_CLASS(clazz, name) \
class ObjectFactory##name : public ObjectFactory { \
public: \
Any NewInstance() { return Any(new name()); } \
}; \
void register_factory_##name() { \
FactoryMap &map = global_factory_map_cpp()[#clazz]; \
if (map.find(#name) == map.end()) { \
map[#name] = new ObjectFactory##name(); \
} \
} \
void register_factory_##name() __attribute__((constructor));
#define CREATE_PSCORE_CLASS(base_class, name) \
base_class##Registerer::CreateInstanceByName(name);
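// Usage sketch (illustrative only; `Table` and `MemoryTable` are placeholder
// class names, not types defined in this file):
//
//   REGISTER_PSCORE_REGISTERER(Table);          // declares TableRegisterer
//   REGISTER_PSCORE_CLASS(Table, MemoryTable);  // registers a factory at load time
//   Table* table = CREATE_PSCORE_CLASS(Table, "MemoryTable");
//   // CreateInstanceByName looks up the factory map and returns NULL with an
//   // error log if either the base class or the concrete name is unknown.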
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <queue>
#include <unordered_map>
namespace paddle {
namespace distributed {
class TopkCalculator {
public:
TopkCalculator(int shard_num, size_t k)
: _shard_num(shard_num), _total_max_size(k) {
_shard_max_size = _total_max_size / shard_num;
_shard_max_size = _shard_max_size > 1 ? _shard_max_size : 1;
for (int i = 0; i < shard_num; ++i) {
_mpq.emplace(i,
std::priority_queue<double,
std::vector<double>,
std::greater<double>>());
}
}
~TopkCalculator() {}
bool push(int shard_id, double value) {
if (_mpq.find(shard_id) == _mpq.end()) {
return false;
}
auto &pq = _mpq[shard_id];
if (pq.size() < _shard_max_size) {
pq.push(value);
} else {
if (pq.top() < value) {
pq.pop();
pq.push(value);
}
}
return true;
}
// TODO: merge the per-shard results with another round of heap sorting.
int top() {
double total = 0;
for (const auto &item : _mpq) {
auto &pq = item.second;
if (!pq.empty()) {
total += pq.top();
}
}
return total / _shard_num;
}
private:
std::unordered_map<
int,
std::priority_queue<double, std::vector<double>, std::greater<double>>>
_mpq;
int _shard_num;
size_t _total_max_size;
size_t _shard_max_size;
};
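// Usage sketch (illustrative only): each shard keeps its largest values in a
// min-heap of size k / shard_num; top() averages the per-shard minima, which
// approximates the global k-th largest value.
//
//   TopkCalculator topk(/*shard_num=*/4, /*k=*/100);
//   topk.push(/*shard_id=*/0, /*value=*/0.37);
//   int threshold = topk.top();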
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/time.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
namespace distributed {
template <typename T>
inline phi::funcs::BlasT<phi::CPUContext, T> GetBlas() {
phi::CPUContext cpu_ctx;
return phi::funcs::GetBlas<phi::CPUContext, T>(cpu_ctx);
}
template <typename T>
inline void SQRT(int n, const T* x, T* z) {
for (int i = 0; i < n; ++i) {
z[i] = sqrt(x[i]);
}
}
template <typename T>
inline void ADD(int n, const T* x, const T y, T* z) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y;
}
}
template <typename T>
inline void DIV(int n, const T x, const T* y, T* z) {
for (int i = 0; i < n; ++i) {
z[i] = x / y[i];
}
}
template <typename T>
inline void ELE_MUL(int n, const T* x, const T* y, T* z) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
static bool StartWith(const std::string& str, const std::string& substr) {
return str.find(substr) == 0;
}
static bool EndWith(const std::string& str, const std::string& substr) {
return str.rfind(substr) == (str.length() - substr.length());
}
inline std::vector<int> bucket(const int v_size, const int b_size) {
int remainder = v_size % b_size;
int bucket = v_size / b_size;
std::vector<int> ret_vec(b_size, bucket);
for (int i = 0; i < remainder; ++i) {
ret_vec[i] = ret_vec[i] + 1;
}
int cur_bucket = 0;
for (int& j : ret_vec) {
int tmp = j;
j = cur_bucket;
cur_bucket += tmp;
}
ret_vec.push_back(cur_bucket);
return ret_vec;
}
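// Worked example (illustrative only): bucket() returns the begin offsets of
// b_size roughly equal ranges plus the total size as a final sentinel, e.g.
// bucket(10, 3) == {0, 4, 7, 10}, i.e. ranges [0, 4), [4, 7) and [7, 10).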
template <typename T>
std::string to_string(const std::vector<T>& vec) {
std::stringstream ss;
for (const auto& c : vec) {
ss << c << " ";
}
return ss.str();
}
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
} // namespace distributed
} // namespace paddle
# Directory notes
> Removes the original index_dataset directory
dataset sampling utility classes
user-defined data-processing .so libraries
streaming dataserver related classes
proto_library(fleet_executor_desc_proto SRCS fleet_executor_desc.proto)
if(WITH_PYTHON)
py_proto_compile(fleet_executor_desc_py_proto SRCS fleet_executor_desc.proto)
endif()
proto_library(interceptor_message_proto SRCS interceptor_message.proto)
if(WITH_ARM_BRPC)
set(BRPC_DEPS arm_brpc snappy gflags glog)
elseif(WITH_DISTRIBUTE AND WITH_PSCORE)
set(BRPC_DEPS
brpc
ssl
crypto
protobuf
zlib
leveldb
snappy
gflags
glog)
else()
set(BRPC_DEPS "")
endif()
cc_library(
task_loop_thread_pool
SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc
DEPS enforce glog)
cc_library(
fleet_executor
SRCS fleet_executor.cc
carrier.cc
task_node.cc
runtime_graph.cc
dist_model.cc
interceptor.cc
compute_interceptor.cc
amplifier_interceptor.cc
source_interceptor.cc
sink_interceptor.cc
message_service.cc
message_bus.cc
dist_model_tensor_wrapper.cc
DEPS proto_desc
fleet_executor_desc_proto
interceptor_message_proto
task_loop_thread_pool
collective_helper
op_registry
executor_gc_helper
gflags
glog
${BRPC_DEPS})
if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
set_source_files_properties(
interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(
message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
add_subdirectory(test)
endif()
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace distributed {
AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id,
TaskNode* node)
: ComputeInterceptor(interceptor_id, node) {
run_per_steps_ = node->run_per_steps();
run_at_offset_ = node->run_at_offset();
reply_up_per_steps_ = node->reply_up_per_steps();
send_down_per_steps_ = node->send_down_per_steps();
}
void AmplifierInterceptor::RunOps() {
// run_per_steps_, run_at_offset_
// 4, 0 --> run at step 0, 4, 8, 12
// 4, 3 --> run at step 3, 7, 11, 15
if ((step_ % run_per_steps_) == run_at_offset_) {
ComputeInterceptor::RunOps();
}
}
void AmplifierInterceptor::SendDataReadyToDownStream() {
// Runs multiple times but notifies downstream only once every
// send_down_per_steps_ runs, i.e. multiple inputs produce one output.
if (step_ % send_down_per_steps_ == 0) {
ComputeInterceptor::SendDataReadyToDownStream();
}
}
void AmplifierInterceptor::ReplyCompletedToUpStream() {
// Runs multiple times but replies to upstream only once every
// reply_up_per_steps_ runs, i.e. one input produces multiple outputs.
if (step_ % reply_up_per_steps_ == 0) {
ComputeInterceptor::ReplyCompletedToUpStream();
}
}
REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
namespace paddle {
namespace distributed {
class AmplifierInterceptor : public ComputeInterceptor {
public:
AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);
private:
void RunOps() override;
void SendDataReadyToDownStream() override;
void ReplyCompletedToUpStream() override;
int64_t run_per_steps_{1};
int64_t run_at_offset_{0};
// one input produces output multiple times
int64_t reply_up_per_steps_{1};
// one output needs input multiple times
int64_t send_down_per_steps_{1};
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include <algorithm>
#include "paddle/fluid/distributed/fleet_executor/global.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
namespace paddle {
namespace distributed {
USE_INTERCEPTOR(Source);
USE_INTERCEPTOR(Compute);
USE_INTERCEPTOR(Amplifier);
USE_INTERCEPTOR(Sink);
void Carrier::Init(
int64_t rank,
const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank) {
rank_ = rank;
interceptor_id_to_rank_ = interceptor_id_to_rank;
// TODO(fleet_exe dev): thread pool
thread_num_ = 1;
thread_pool_.SetThreadNum(thread_num_);
thread_pool_.Start();
}
void Carrier::Init(
int64_t rank,
const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node,
const framework::ProgramDesc& program,
framework::Scope* scope,
int64_t num_micro_batches,
const platform::Place& place,
const std::vector<std::string>& inference_root_scope_vars) {
rank_ = rank;
interceptor_id_to_rank_ = interceptor_id_to_rank;
interceptor_id_to_node_ = interceptor_id_to_node;
place_ = place;
root_scope_ = scope;
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
PADDLE_ENFORCE_NOT_NULL(
root_scope_,
platform::errors::InvalidArgument("root_scope can not be nullptr"));
minibatch_scope_ = &root_scope_->NewScope();
microbatch_scopes_.resize(num_micro_batches);
for (int i = 0; i < num_micro_batches; ++i) {
microbatch_scopes_[i] = &minibatch_scope_->NewScope();
CopyParameters(i, program, inference_root_scope_vars);
}
// TODO(fleet_exe dev): thread pool
thread_num_ = 1;
thread_pool_.SetThreadNum(thread_num_);
thread_pool_.Start();
CreateInterceptors();
is_init_ = true;
}
void Carrier::Release() {
if (root_scope_) {
root_scope_->DropKids();
}
}
Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; }
void Carrier::CopyParameters(
int microbatch_id,
const framework::ProgramDesc& program,
const std::vector<std::string>& inference_root_scope_vars) {
auto& global_block = program.Block(0);
std::map<std::string, int> inference_root_scope_var_map;
for (auto var_name : inference_root_scope_vars) {
inference_root_scope_var_map.insert({var_name, 1});
}
for (auto& var : global_block.AllVars()) {
std::string var_name = var->Name();
bool force_root = inference_root_scope_var_map.find(var_name) !=
inference_root_scope_var_map.end();
if (force_root) {
VLOG(4) << var_name << " will be forced to be created in the root scope.";
}
if ((var->Persistable() || force_root) && microbatch_id == 0) {
auto* ptr = root_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(5) << "Create persistable var: " << var->Name()
<< ", which pointer is " << ptr;
} else if (!var->Persistable()) {
auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
VLOG(5) << "Create variable " << var->Name() << " for microbatch "
<< microbatch_id << ", which pointer is " << ptr << ".";
InitializeVariable(ptr, var->GetType());
}
}
}
bool Carrier::EnqueueInterceptorMessage(
const InterceptorMessage& interceptor_message) {
PADDLE_ENFORCE_EQ(
interceptor_message.ctrl_message(),
false,
platform::errors::Fatal(
"Control message should be only send inter rank using message bus."));
int64_t dst_id = interceptor_message.dst_id();
Interceptor* dst_interceptor = GetInterceptor(dst_id);
dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message);
return true;
}
Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
PADDLE_ENFORCE_NE(iter,
interceptor_idx_to_interceptor_.end(),
platform::errors::InvalidArgument(
"Cannot find interceptor instance for interceptor "
"id %lld. Wrong dst? Call before init?",
interceptor_id));
return iter->second.get();
}
void Carrier::Wait() {
std::unique_lock<std::mutex> lock(running_mutex_);
cond_var_.wait(lock);
}
void Carrier::WakeUp() {
// May notify more than once, which is fine for unit tests.
cond_var_.notify_all();
}
void Carrier::Start() {
PADDLE_ENFORCE_EQ(is_init_,
true,
platform::errors::PreconditionNotMet(
"Using carrier before initialized."));
for (int64_t id : source_interceptor_ids_) {
VLOG(3) << "Carrier Start is sending start to source interceptor " << id
<< ".";
InterceptorMessage start_msg;
// The source node's DATA_IS_READY is sent by the carrier, so set src_id=-1.
start_msg.set_src_id(-1);
start_msg.set_dst_id(id);
start_msg.set_message_type(DATA_IS_READY);
Send(start_msg);
}
// TODO(wangxi): async step
Wait();
dev_ctx_->Wait();
for (auto* micro_scope : microbatch_scopes_) {
// By default, we should delete all kid scopes after running the executor,
// because some operators, such as while_op, may create local scopes when
// running. But when while_op also creates a local executor to run its sub
// block, the sub scopes it created should not be dropped immediately,
// because while_grad_op will use some variables created during the while_op
// run, so we keep the kids and wait for the outer executor to drop them.
micro_scope->DropKids();
}
}
bool Carrier::IsInit() const { return is_init_; }
int64_t Carrier::GetRank(int64_t interceptor_id) const {
PADDLE_ENFORCE_NE(
interceptor_id_to_rank_.find(interceptor_id),
interceptor_id_to_rank_.end(),
platform::errors::NotFound("Cannot find rank for interceptor id %lld.",
interceptor_id));
return interceptor_id_to_rank_.at(interceptor_id);
}
bool Carrier::Send(const InterceptorMessage& msg) {
int64_t src_id = msg.src_id();
// TODO(liyurui): compatible solution, will be removed completely in the
// future
if (interceptor_id_to_rank_.find(src_id) == interceptor_id_to_rank_.end() &&
src_id == SOURCE_ID) {
src_id = msg.dst_id();
}
int64_t dst_id = msg.dst_id();
int64_t src_rank = GetRank(src_id);
int64_t dst_rank = GetRank(dst_id);
PADDLE_ENFORCE_EQ(
src_rank,
rank_,
platform::errors::Fatal("The source rank id %lld, which is not equal to "
"the carrier rank id %lld.",
src_rank,
rank_));
if (src_rank == dst_rank) {
VLOG(3) << "Send a message from interceptor " << src_id
<< " to interceptor " << dst_id << ", which are in the same ranks.";
return EnqueueInterceptorMessage(msg);
} else {
VLOG(3) << "Send a message from interceptor " << src_id
<< " to interceptor " << dst_id
<< ", which are in different ranks.";
return GlobalVal<MessageBus>::Get()->Send(dst_rank, msg);
}
}
Interceptor* Carrier::SetInterceptor(int64_t interceptor_id,
std::unique_ptr<Interceptor> interceptor) {
auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
PADDLE_ENFORCE_EQ(iter,
interceptor_idx_to_interceptor_.end(),
platform::errors::AlreadyExists(
"The interceptor id %lld has already been created! "
"The interceptor id should be unique.",
interceptor_id));
interceptor->RegisterCarrier(this);
// TODO(fleet_exe dev): get loop
auto* loop = thread_pool_.GetLoop(interceptor_id % thread_num_);
PADDLE_ENFORCE_NOT_NULL(
loop, platform::errors::Fatal("thread task loop must not null"));
interceptor->RegisterTaskLoop(loop);
auto* ptr = interceptor.get();
interceptor_idx_to_interceptor_.insert(
std::make_pair(interceptor_id, std::move(interceptor)));
return ptr;
}
static std::shared_ptr<framework::GarbageCollector> GetGC(
const platform::Place& place) {
int64_t max_memory_size = framework::GetEagerDeletionThreshold();
std::shared_ptr<framework::GarbageCollector> gc;
if (max_memory_size >= 0) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
if (framework::IsFastEagerDeletionModeEnabled()) {
gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
max_memory_size));
}
}
#endif
} // max_memory_size >= 0
return gc;
}
void Carrier::CreateInterceptors() {
if (interceptor_id_to_node_.empty()) return;
auto gc = GetGC(place_);
// create each Interceptor
// no auto init since there is no config
for (const auto& item : interceptor_id_to_node_) {
int64_t interceptor_id = item.first;
TaskNode* task_node = item.second;
PADDLE_ENFORCE_LT(
task_node->run_at_offset(),
task_node->run_per_steps(),
platform::errors::InvalidArgument(
"Interceptor's run_at_offset must < run_per_steps, must now "
"run_at_offset=%ld run_per_steps=%ld",
task_node->run_at_offset(),
task_node->run_per_steps()));
std::unique_ptr<Interceptor> interceptor;
PADDLE_ENFORCE_NE(task_node->type().empty(),
true,
platform::errors::NotFound(
"Cannot found type for task node with id %lld",
task_node->task_id()));
interceptor = InterceptorFactory::Create(
task_node->type(), interceptor_id, task_node);
interceptor->SetPlace(place_);
interceptor->SetMiniBatchScope(minibatch_scope_);
interceptor->SetMicroBatchScope(microbatch_scopes_);
interceptor->SetRootScope(root_scope_);
interceptor->SetGC(gc);
SetInterceptor(interceptor_id, std::move(interceptor));
VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
<< " with type: " << task_node->type() << ".";
if (task_node->upstream().empty()) {
source_interceptor_ids_.emplace_back(interceptor_id);
}
}
}
} // namespace distributed
} // namespace paddle