2.3.0-dtk-22.04.2

d2d32668 · yuguo960516yuguo · ad08b8ce · d2d32668 · d2d32668 · d2d32668
Commit d2d32668 authored Apr 26, 2023 by yuguo960516yuguo
20 changed files
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A simple test driver for cmake. 
+# set PYTHONPATH before run command.
+# Usage:
+#    ./.set_python_pash.sh -p YOUR_PYTHON_PATH {exec...}
+# 
+# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
+#
+PYPATH=""
+set -x
+while getopts "d:" opt; do
+  case $opt in
+    d)
+      PYPATH=$OPTARG
+      ;;
+  esac
+done
+shift $(($OPTIND - 1))
+export PYTHONPATH=$PYPATH:$PYTHONPATH
+$@
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
+add_subdirectory(utils)
+add_subdirectory(scripts)
+add_subdirectory(testing)
+set(PYTHON_TESTS_DIR
+    ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests
+    CACHE INTERNAL "python tests directory")
+add_subdirectory(phi)
+add_subdirectory(infrt)
+add_subdirectory(fluid)
--- a/paddle/extension.h
+++ b/paddle/extension.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+// All paddle apis in C++ frontend
+#include "paddle/phi/api/all.h"
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
+paddle.fluid.optimizer.PipelineOptimizer (paddle.fluid.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c'))
+paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
+add_subdirectory(memory)
+add_subdirectory(platform)
+add_subdirectory(distributed)
+add_subdirectory(framework)
+add_subdirectory(imperative)
+add_subdirectory(operators)
+add_subdirectory(pybind)
+add_subdirectory(eager)
+add_subdirectory(jit)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
+add_subdirectory(collective)
+add_subdirectory(store)
+if(WITH_PYTHON)
+  py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
+  add_custom_target(
+    ps_py_proto_init ALL
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
+  add_dependencies(ps_py_proto ps_py_proto_init)
+  if(NOT WIN32)
+    add_custom_command(
+      TARGET ps_py_proto
+      POST_BUILD
+      COMMAND mv the_one_ps_pb2.py
+              ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/)
+  else()
+    string(
+      REPLACE "/" "\\" fleet_proto_dstpath
+              "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
+    add_custom_command(
+      TARGET ps_py_proto
+      POST_BUILD
+      COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath}
+      COMMENT
+        "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}."
+    )
+  endif()
+endif()
+if(NOT WITH_PSCORE)
+  add_subdirectory(fleet_executor)
+  return()
+endif()
+proto_library(ps_framework_proto SRCS the_one_ps.proto)
+add_custom_command(
+  TARGET ps_framework_proto
+  POST_BUILD
+  COMMAND mv the_one_ps.pb.h ps.pb.h
+  COMMAND mv the_one_ps.pb.cc ps.pb.cc)
+set(DISTRIBUTE_COMPILE_FLAGS
+    "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result"
+)
+if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+endif()
+add_subdirectory(common)
+add_subdirectory(ps)
+add_subdirectory(test)
+add_subdirectory(index_dataset)
+add_subdirectory(fleet_executor)
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
+cc_library(
+  processgroup
+  SRCS ProcessGroup.cc
+  DEPS phi_api eager_api)
+cc_library(
+  eager_reducer
+  SRCS reducer.cc
+  DEPS eager_api processgroup phi_api string_helper)
+if(WITH_DISTRIBUTE)
+  cc_library(
+    processgroup_gloo
+    SRCS ProcessGroupGloo.cc
+    DEPS phi_api eager_api gloo_wrapper)
+endif()
+if(WITH_NCCL OR WITH_RCCL)
+  cc_library(
+    processgroup_nccl
+    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
+    DEPS place
+         cuda_stream
+         enforce
+         collective_helper
+         device_context
+         phi_api
+         eager_api)
+  if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    cc_library(
+      processgroup_heter
+      SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
+      DEPS place
+           cuda_stream
+           enforce
+           collective_helper
+           device_context
+           phi_api
+           eager_api)
+  endif()
+endif()
+if(WITH_ASCEND_CL)
+  cc_library(
+    processgroup_hccl
+    SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
+    DEPS place
+         npu_stream
+         enforce
+         collective_helper
+         device_context
+         phi_api
+         eager_api)
+  if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    cc_library(
+      processgroup_heter
+      SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
+      DEPS place
+           npu_stream
+           enforce
+           collective_helper
+           device_context
+           phi_api
+           eager_api)
+  endif()
+endif()
--- a/paddle/fluid/distributed/collective/Common.cc
+++ b/paddle/fluid/distributed/collective/Common.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/Common.h"
+namespace paddle {
+namespace distributed {
+std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors) {
+  std::vector<Place> places;
+  places.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    places.push_back(tensor.place());
+  }
+  return places;
+}
+std::string GetKeyFromPlaces(const std::vector<Place>& places) {
+  std::string placeList;
+  for (auto& place : places) {
+    std::stringstream tmp;
+    tmp << place;
+    if (placeList.empty()) {
+      placeList += tmp.str();
+    } else {
+      placeList += "," + tmp.str();
+    }
+  }
+  return placeList;
+}
+bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors) {
+  return std::all_of(
+      tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
+        return platform::is_gpu_place(t.place());
+      });
+}
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+namespace paddle {
+namespace distributed {
+using Place = paddle::platform::Place;
+// Get the list of devices from list of tensors
+std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);
+// Get the deviceList String from the list of devices
+std::string GetKeyFromPlaces(const std::vector<Place>& places);
+bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/HCCLTools.cc
+++ b/paddle/fluid/distributed/collective/HCCLTools.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
+#include "paddle/fluid/distributed/collective/Types.h"
+namespace paddle {
+namespace distributed {
+HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, HcclReduceOp> red_type = {
+      {ReduceOp::MIN, HCCL_REDUCE_MIN},
+      {ReduceOp::MAX, HCCL_REDUCE_MAX},
+      {ReduceOp::SUM, HCCL_REDUCE_SUM},
+      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(
+      it != red_type.end(),
+      true,
+      platform::errors::InvalidArgument("Invalid hccl reduction. "
+                                        "Must be Min | Max | Prod | Sum"));
+  return it->second;
+}
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
+  std::ostringstream oss;
+  for (size_t i = 0; i < sizeof(hcclID); ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/HCCLTools.h
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <error.h>
+#include <string>
+#include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/utils/variant.h"
+namespace paddle {
+namespace distributed {
+class NPUEventManager {
+ public:
+  NPUEventManager() = default;
+  ~NPUEventManager() {
+    if (is_created_) {
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUEventDestroy(event_);
+    }
+  }
+  NPUEventManager(const NPUEventManager&) = delete;
+  NPUEventManager& operator=(const NPUEventManager&) = delete;
+  NPUEventManager(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+  NPUEventManager& operator=(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+  bool IsCreated() const { return is_created_; }
+  bool DeviceId() const { return device_index_; }
+  aclrtEvent GetRawNPUEvent() const { return event_; }
+  void Record(const paddle::platform::NPUDeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index,
+                      device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "NPUDeviceContext's device %d does not match"
+                          "Event's device %d",
+                          device_index,
+                          device_index_));
+    platform::NPUDeviceGuard guard(device_index_);
+    platform::NPUEventRecord(event_, ctx.stream());
+  }
+  bool Query() const {
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    platform::NPUEventQuery(event_, &status);
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      return true;
+    }
+    return false;
+  }
+  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index,
+                        device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "CUDADeviceContext's device %d does not match"
+                            "Event's device %d",
+                            device_index,
+                            device_index_));
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUStreamWaitEvent(ctx.stream(), event_);
+    }
+  }
+ private:
+  bool is_created_{false};
+  aclrtEvent event_{};
+  int8_t device_index_{0};
+ private:
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::NPUDeviceGuard guard(device_index);
+    platform::NPUEventCreate(&event_);
+    is_created_ = true;
+  }
+};
+class HCCLCommManager {
+ public:
+  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
+  HCCLCommManager() : HCCLCommManager(nullptr) {}
+  ~HCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks,
+                                                 int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(
+        num_ranks, comm_id, rank, &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+  }
+ protected:
+  HcclComm hccl_comm_;
+  HcclRootInfo* hccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+HcclReduceOp ToHCCLRedType(ReduceOp reduction);
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID);
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/collective/NCCLTools.cc
+++ b/paddle/fluid/distributed/collective/NCCLTools.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/Types.h"
+namespace paddle {
+namespace distributed {
+ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, ncclRedOp_t> red_type = {
+      {ReduceOp::MIN, ncclMin},
+      {ReduceOp::MAX, ncclMax},
+      {ReduceOp::SUM, ncclSum},
+      {ReduceOp::PRODUCT, ncclProd},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != red_type.end(),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
+                        "ncclProd | ncclSum"));
+  return it->second;
+}
+std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
+  std::ostringstream oss;
+  for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/NCCLTools.h
+++ b/paddle/fluid/distributed/collective/NCCLTools.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+#include <error.h>
+#include <string>
+#include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+#include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/utils/variant.h"
+namespace paddle {
+namespace distributed {
+#define NCCLCHECK(cmd)                                  \
+  do {                                                  \
+    ncclResult_t r = cmd;                               \
+    if (r != ncclSuccess) {                             \
+      printf("Failed, NCCL error %s:%d '%s'\n",         \
+             __FILE__,                                  \
+             __LINE__,                                  \
+             platform::dynload::ncclGetErrorString(r)); \
+      exit(EXIT_FAILURE);                               \
+    }                                                   \
+  } while (0)
+// NOTE(shenliang03): EventManager are movable not copyable CudaEvent wrapper.
+// EventManage is different from paddle::platform::CudaEvent.
+// It uses lazy initialization and is only created when the
+// Record() method is called for the first time; it also monitors
+// device information to ensure that recorded stream and event
+// are on the same device.
+class EventManager {
+ public:
+  EventManager() {}
+  explicit EventManager(unsigned int flags) : flags_{flags} {}
+  ~EventManager() {
+    if (is_created_) {
+      platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      hipEventDestroy(event_);
+#else
+      cudaEventDestroy(event_);
+#endif
+    }
+  }
+  EventManager(const EventManager&) = delete;
+  EventManager& operator=(const EventManager&) = delete;
+  EventManager(EventManager&& other) {
+    std::swap(flags_, other.flags_);
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+  EventManager& operator=(EventManager&& other) {
+    std::swap(flags_, other.flags_);
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+  bool IsCreated() const { return is_created_; }
+  bool DeviceId() const { return device_index_; }
+  gpuEvent_t GetRawCudaEvent() const { return event_; }
+  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index,
+                      device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "CUDADeviceContext's device %d does not match"
+                          "Event's device %d",
+                          device_index,
+                          device_index_));
+    platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
+#endif
+  }
+  bool Query() const {
+#ifdef PADDLE_WITH_HIP
+    gpuError_t err = hipEventQuery(event_);
+    if (err == hipSuccess) {
+      return true;
+    }
+    if (err == hipErrorNotReady) {
+      return false;
+    }
+#else
+    gpuError_t err = cudaEventQuery(event_);
+    if (err == cudaSuccess) {
+      return true;
+    }
+    if (err == cudaErrorNotReady) {
+      return false;
+    }
+#endif
+    PADDLE_ENFORCE_GPU_SUCCESS(err);
+    return false;
+  }
+  void Synchronize() const {
+    if (is_created_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
+#endif
+    }
+  }
+  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index,
+                        device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "CUDADeviceContext's device %d does not match"
+                            "Event's device %d",
+                            device_index,
+                            device_index_));
+      platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
+#endif
+    }
+  }
+ private:
+#ifdef PADDLE_WITH_HIP
+  unsigned int flags_ = hipEventDefault;
+#else
+  unsigned int flags_ = cudaEventDefault;
+#endif
+  bool is_created_{false};
+  gpuEvent_t event_{};
+  int8_t device_index_{0};
+ private:
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::CUDADeviceGuard guard(device_index);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
+#endif
+    is_created_ = true;
+  }
+};
+// NOTE(shenliang03): NCCLCommManager is more lightweight than
+// platform::NCCLComm
+class NCCLCommManager {
+ public:
+  explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}
+  NCCLCommManager() : NCCLCommManager(nullptr) {}
+  ~NCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (nccl_comm_) {
+      platform::dynload::ncclCommDestroy(nccl_comm_);
+    }
+  }
+  static std::shared_ptr<NCCLCommManager> Create(int num_ranks,
+                                                 int rank,
+                                                 ncclUniqueId comm_id) {
+    auto nccl_manager = std::make_shared<NCCLCommManager>();
+    NCCLCHECK(platform::dynload::ncclCommInitRank(
+        &(nccl_manager->nccl_comm_), num_ranks, comm_id, rank));
+    nccl_manager->nccl_id_ = comm_id;
+    nccl_manager->rank_ = rank;
+    return nccl_manager;
+  }
+  ncclUniqueId GetNcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return nccl_id_;
+  }
+  ncclComm_t GetNcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return nccl_comm_;
+  }
+  NCCLCommManager(const NCCLCommManager&) = delete;
+  NCCLCommManager& operator=(const NCCLCommManager&) = delete;
+  NCCLCommManager& operator=(NCCLCommManager&& other) = delete;
+  NCCLCommManager(NCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(nccl_comm_, other.nccl_comm_);
+  }
+ protected:
+  ncclComm_t nccl_comm_;
+  ncclUniqueId nccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
+std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroup.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroup.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+namespace paddle {
+namespace distributed {
+ProcessGroup::Task::Task(int rank,
+                         const std::vector<phi::DenseTensor>& inputTensors,
+                         CommType comm_type)
+    : rank_(rank), comm_type_(comm_type) {}
+ProcessGroup::Task::~Task() = default;
+bool ProcessGroup::Task::IsCompleted() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return is_completed_;
+}
+bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
+  return false;
+}
+void ProcessGroup::Task::Synchronize() {}
+ProcessGroup::ProcessGroup(int rank,
+                           int size,
+                           const platform::Place& place,
+                           int gid)
+    : rank_(rank), size_(size), place_(place), gid_(gid) {
+  if (gid != IGNORE_ID) {
+    auto map = ProcessGroupMapFromGid::getInstance();
+    map->insert(gid_, this);
+  }
+}
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <chrono>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/enforce.h"
+constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
+namespace paddle {
+namespace distributed {
+constexpr int IGNORE_ID = -1;
+using Tensor = paddle::experimental::Tensor;
+enum class CommType : std::uint8_t {
+  BROADCAST = 0,
+  ALLREDUCE = 1,
+  ALLREDUCE_SPARSE = 2,  // TODO(shenliang03): to support sparse in allreduce
+  REDUCE = 3,
+  ALLGATHER = 4,
+  GATHER = 5,
+  SCATTER = 6,
+  REDUCE_SCATTER = 7,
+  ALLTOALL = 8,
+  SEND = 9,
+  RECV = 10,
+  BARRIER = 11,
+  ALLTOALL_SINGLE = 12,
+  UNKNOWN = 100,
+};
+class ProcessGroup {
+ public:
+  class Task {
+   public:
+    Task(int rank,
+         const std::vector<phi::DenseTensor>& inputTensors,
+         CommType opType = CommType::UNKNOWN);
+    virtual ~Task();
+    virtual bool IsCompleted();
+    virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
+    virtual void Synchronize();
+   protected:
+    const int rank_;
+    CommType comm_type_;
+    std::mutex mutex_;
+    bool is_completed_ = false;
+  };
+  explicit ProcessGroup(int rank,
+                        int size,
+                        const platform::Place& place,
+                        int gid);
+  virtual ~ProcessGroup() {}
+  int GetRank() const { return rank_; }
+  int GetSize() const { return size_; }
+  virtual const std::string GetBackendName() const = 0;
+  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
+      const AllreduceOptions& = AllreduceOptions()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support allreduce", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
+      const BroadcastOptions& = BroadcastOptions()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support broadcast", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Barrier(
+      const BarrierOptions& = BarrierOptions()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support barrier", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>&, int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& tensors, int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support receive", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor&,
+                                                           int,
+                                                           int,
+                                                           int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor& tensors, int, int, int) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support receive", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<phi::DenseTensor>&,    // NOLINT
+      std::vector<phi::DenseTensor>&) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<phi::DenseTensor>&,    // NOLINT
+      std::vector<phi::DenseTensor>&) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllToAll", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<int64_t>&,
+      std::vector<int64_t>&) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      const ReduceOptions& opts) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Reduce", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      const ScatterOptions&) {         // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Scatter", GetBackendName()));
+  }
+  virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
+      phi::DenseTensor&,              // NOLINT
+      phi::DenseTensor&,              // NOLINT
+      const ReduceScatterOptions&) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support ReduceScatter", GetBackendName()));
+  }
+ protected:
+  const int rank_;
+  const int size_;
+  const platform::Place place_;
+  const int gid_;
+};
+class ProcessGroupMapFromGid {
+ public:
+  bool has(int gid) {
+    auto it = map_.find(gid);
+    return it != map_.end();
+  }
+  void insert(int gid, ProcessGroup* pg) {
+    // TODO(sandyhouse): address ut and uncomment the following codes
+    // PADDLE_ENFORCE_EQ(has(gid), false,
+    //                   platform::errors::PreconditionNotMet(
+    //                       "The process group with id %d doesnot exist.",
+    //                       gid));
+    map_[gid] = pg;
+  }
+  ProcessGroup* get(int gid) {
+    // TODO(sandyhouse): address ut and uncomment the following codes
+    // PADDLE_ENFORCE_EQ(has(gid), true,
+    //                   platform::errors::PreconditionNotMet(
+    //                       "The process group with id %d doesnot exist.",
+    //                       gid));
+    return map_.find(gid)->second;
+  }
+  static std::shared_ptr<ProcessGroupMapFromGid> getInstance() {
+    static auto s_instance = std::make_shared<ProcessGroupMapFromGid>();
+    return s_instance;
+  }
+  ProcessGroupMapFromGid() = default;
+  ~ProcessGroupMapFromGid() = default;
+ private:
+  std::unordered_map<int, ProcessGroup*> map_;
+};
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <iostream>
+#ifdef _WIN32
+#include <gloo/common/win.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#else
+#include <netdb.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#endif
+#include <gloo/broadcast.h>
+#include <gloo/reduce.h>
+#include <gloo/scatter.h>
+#include "paddle/fluid/distributed/collective/Common.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace distributed {
+#ifdef _WIN32
+#define GENERATE_FUNC(type, func, ...)       \
+  switch (type) {                            \
+    case experimental::DataType::FLOAT32:    \
+      func<float>(__VA_ARGS__);              \
+      break;                                 \
+    case experimental::DataType::FLOAT64:    \
+      func<double>(__VA_ARGS__);             \
+      break;                                 \
+    case experimental::DataType::FLOAT16:    \
+      func<gloo::float16>(__VA_ARGS__);      \
+      break;                                 \
+    case experimental::DataType::INT32:      \
+      func<int32_t>(__VA_ARGS__);            \
+      break;                                 \
+    case experimental::DataType::INT64:      \
+      func<int64_t>(__VA_ARGS__);            \
+      break;                                 \
+    default:                                 \
+      VLOG(0) << "Error: Unknown DataType."; \
+      exit(-1);                              \
+  }
+#define HOST_NAME_MAX 256
+#else
+#define GENERATE_FUNC(type, func, args...)   \
+  switch (type) {                            \
+    case experimental::DataType::FLOAT32:    \
+      func<float>(args);                     \
+      break;                                 \
+    case experimental::DataType::FLOAT64:    \
+      func<double>(args);                    \
+      break;                                 \
+    case experimental::DataType::FLOAT16:    \
+      func<gloo::float16>(args);             \
+      break;                                 \
+    case experimental::DataType::INT32:      \
+      func<int32_t>(args);                   \
+      break;                                 \
+    case experimental::DataType::INT64:      \
+      func<int64_t>(args);                   \
+      break;                                 \
+    default:                                 \
+      VLOG(0) << "Error: Unknown DataType."; \
+      exit(-1);                              \
+  }
+#endif
+typedef void (*reduce_func)(void*, const void*, const void*, size_t);
+template <typename T>
+reduce_func get_function(const ReduceOp& r) {
+  switch (r) {
+    case ReduceOp::SUM:
+      return reduce_func(&::gloo::sum<T>);
+    case ReduceOp::PRODUCT:
+      return reduce_func(&::gloo::product<T>);
+    case ReduceOp::MIN:
+      return reduce_func(&::gloo::min<T>);
+    case ReduceOp::MAX:
+      return reduce_func(&::gloo::max<T>);
+    case ReduceOp::AVG:
+      VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
+      exit(-1);
+  }
+  VLOG(0) << "Error: Unknown ReduceOp.";
+  exit(-1);
+}
+template <typename T>
+T* get_data(phi::DenseTensor& tensor) {  // NOLINT
+  return reinterpret_cast<T*>(tensor.data());
+}
+template <typename T>
+std::vector<T*> get_multi_data(
+    std::vector<phi::DenseTensor>& tensors) {  // NOLINT
+  std::vector<T*> ret;
+  ret.reserve(tensors.size());
+  for (size_t i = 0; i < tensors.size(); i++) {
+    ret.push_back(get_data<T>(tensors[i]));
+  }
+  return ret;
+}
+template <typename T, typename P>
+void set_output(P& opts, phi::DenseTensor& tensor) {  // NOLINT
+  opts.setOutput(get_data<T>(tensor), tensor.numel());
+}
+template <typename T, typename P>
+void set_input(P& opts, phi::DenseTensor& tensor) {  // NOLINT
+  opts.setInput(get_data<T>(tensor), tensor.numel());
+}
+template <typename T, typename P>
+void set_outputs(P& opts,                                   // NOLINT
+                 std::vector<phi::DenseTensor>& tensors) {  // NOLINT
+  opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
+}
+template <typename T, typename P>
+void set_inputs(P& opts,                                   // NOLINT
+                std::vector<phi::DenseTensor>& tensors) {  // NOLINT
+  opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
+}
+template <typename T, typename P>
+void set_inputs_for_scatter(P& opts,                   // NOLINT
+                            phi::DenseTensor& tensor,  // NOLINT
+                            int nranks) {
+  std::vector<T*> ret;
+  ret.reserve(nranks);
+  T* raw_pointer = reinterpret_cast<T*>(tensor.data());
+  size_t offset = 0;
+  for (int i = 0; i < nranks; i++) {
+    ret.push_back(raw_pointer + offset);
+    offset += tensor.numel() / nranks;
+  }
+  opts.setInputs(ret, tensor.numel() / nranks);
+}
+ProcessGroupGloo::GlooTask::GlooTask(
+    int rank, const std::vector<phi::DenseTensor>& inputs, CommType comm_type)
+    : ProcessGroup::Task(rank, inputs, comm_type) {}
+ProcessGroupGloo::ProcessGroupGloo(
+    const std::shared_ptr<distributed::Store>& store,
+    int rank,
+    int world_size,
+    const platform::Place& place,
+    int gid,
+    const std::shared_ptr<GlooOptions> options)
+    : ProcessGroup(rank, world_size, place, gid),
+      _tag(0),
+      _store(new GlooStore(store)) {
+  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
+  auto prefix_store =
+      ::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store);
+  _context->connectFullMesh(prefix_store, options->device);
+}
+class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
+                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                    int rank,
+                    int root,
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
+        _context(context),
+        _root(root),
+        _inputs(inputs),
+        _outputs(outputs),
+        _tag(tag) {}
+  void Run() override { _do_broadcast(_inputs[0], _outputs[0]); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  const int _root;
+  std::vector<phi::DenseTensor> _inputs{};
+  std::vector<phi::DenseTensor> _outputs{};
+  const uint32_t _tag;
+  void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) {  // NOLINT
+    gloo::BroadcastOptions opts(_context);
+    const auto& dtype = in.dtype();
+    if (rank_ == _root) {
+      GENERATE_FUNC(dtype, set_input, opts, in);
+    }
+    GENERATE_FUNC(dtype, set_output, opts, out);
+    opts.setRoot(_root);
+    opts.setTag(_tag);
+    gloo::broadcast(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    const BroadcastOptions& opts) {
+  auto root = opts.source_rank;
+  std::unique_ptr<BroadcastGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_unique<BroadcastGlooTask>(
+      context, inputs, outputs, rank_, root, tag);
+  task->Run();
+  return task;
+}
+class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  AllreduceGlooTask(int rank,
+                    const std::shared_ptr<gloo::Context>& context,
+                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                    ReduceOp reduce_op,
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _reduce_op(reduce_op),
+        _tag(tag) {}
+  void Run() override { _do_allreduce(_inputs, _outputs); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<phi::DenseTensor> _inputs;
+  std::vector<phi::DenseTensor> _outputs;
+  const ReduceOp _reduce_op;
+  uint32_t _tag;
+  gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
+                                             const ReduceOp op) {
+    gloo::AllreduceOptions::Func fn;
+    GENERATE_FUNC(type, _get_function_impl, fn, op);
+    return fn;
+  }
+  template <typename T>
+  void _get_function_impl(gloo::AllreduceOptions::Func& fn,  // NOLINT
+                          const ReduceOp op) {
+    fn = get_function<T>(op);
+  }
+  void _do_allreduce(std::vector<phi::DenseTensor>& ins,     // NOLINT
+                     std::vector<phi::DenseTensor>& outs) {  // NOLINT
+    const auto& dtype = ins[0].dtype();
+    gloo::AllreduceOptions opts(_context);
+    GENERATE_FUNC(dtype, set_inputs, opts, ins);
+    GENERATE_FUNC(dtype, set_outputs, opts, outs);
+    opts.setReduceFunction(_get_function(dtype, _reduce_op));
+    opts.setTag(_tag);
+    gloo::allreduce(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    const AllreduceOptions& opts) {
+  auto tag = next_tag();
+  std::shared_ptr<GlooTask> task;
+  auto context = get_context();
+  task = std::make_shared<AllreduceGlooTask>(
+      rank_, context, inputs, outputs, opts.reduce_op, tag);
+  task->Run();
+  return task;
+}
+class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
+      : ProcessGroupGloo::GlooTask(
+            rank, std::vector<phi::DenseTensor>{}, CommType::BARRIER),
+        _context(context) {}
+  void Run() override { _do_barrier(); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  void _do_barrier() {
+    gloo::BarrierOptions opts(_context);
+    gloo::barrier(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
+    const BarrierOptions& opts) {
+  std::shared_ptr<BarrierGlooTask> task;
+  auto context = get_context();
+  task = std::make_shared<BarrierGlooTask>(rank_, context);
+  task->Run();
+  return task;
+}
+class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  AllgatherGlooTask(int rank,
+                    const std::shared_ptr<gloo::Context>& context,
+                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                    uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _tag(tag) {}
+  void Run() override { _do_allgather(_inputs, _outputs); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<phi::DenseTensor> _inputs;
+  std::vector<phi::DenseTensor> _outputs;
+  uint32_t _tag;
+  void _do_allgather(std::vector<phi::DenseTensor>& in,     // NOLINT
+                     std::vector<phi::DenseTensor>& out) {  // NOLINT
+    const auto& dtype = in[0].dtype();
+    gloo::AllgatherOptions opts(_context);
+    GENERATE_FUNC(dtype, set_input, opts, in[0]);
+    GENERATE_FUNC(dtype, set_output, opts, out[0]);
+    opts.setTag(_tag);
+    gloo::allgather(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors) {
+  std::shared_ptr<AllgatherGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<AllgatherGlooTask>(
+      rank_, context, in_tensors, out_tensors, tag);
+  task->Run();
+  return task;
+}
+class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  ReduceGlooTask(int rank,
+                 const std::shared_ptr<gloo::Context>& context,
+                 std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                 std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                 ReduceOp reduce_op,
+                 int dst,
+                 uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _reduce_op(reduce_op),
+        _dst(dst),
+        _tag(tag) {}
+  void Run() override { _do_reduce(_inputs, _outputs, _dst); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<phi::DenseTensor> _inputs;
+  std::vector<phi::DenseTensor> _outputs;
+  const ReduceOp _reduce_op;
+  int _dst;
+  uint32_t _tag;
+  gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
+                                          const ReduceOp op) {
+    gloo::ReduceOptions::Func fn;
+    GENERATE_FUNC(type, _get_function_impl, fn, op);
+    return fn;
+  }
+  template <typename T>
+  void _get_function_impl(gloo::ReduceOptions::Func& fn,  // NOLINT
+                          const ReduceOp op) {
+    fn = get_function<T>(op);
+  }
+  void _do_reduce(std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                  int dst) {
+    const auto& dtype = inputs[0].dtype();
+    gloo::ReduceOptions opts(_context);
+    GENERATE_FUNC(dtype, set_input, opts, inputs[0]);
+    GENERATE_FUNC(dtype, set_output, opts, outputs[0]);
+    opts.setReduceFunction(_get_function(dtype, _reduce_op));
+    opts.setTag(_tag);
+    opts.setRoot(dst);
+    gloo::reduce(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    const ReduceOptions& opts) {
+  std::shared_ptr<ReduceGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<ReduceGlooTask>(
+      rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
+  task->Run();
+  return task;
+}
+class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
+ public:
+  ScatterGlooTask(int rank,
+                  const std::shared_ptr<gloo::Context>& context,
+                  std::vector<phi::DenseTensor>& inputs,   // NOLINT
+                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
+                  int src,
+                  int size,
+                  uint32_t tag)
+      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
+        _context(context),
+        _inputs(inputs),
+        _outputs(outputs),
+        _src(src),
+        _size(size),
+        _tag(tag) {}
+  void Run() override { _do_scatter(_inputs, _outputs, _src); }
+ private:
+  std::shared_ptr<gloo::Context> _context;
+  std::vector<phi::DenseTensor> _inputs;
+  std::vector<phi::DenseTensor> _outputs;
+  int _src;
+  int _size;
+  uint32_t _tag;
+  void _do_scatter(std::vector<phi::DenseTensor>& in,   // NOLINT
+                   std::vector<phi::DenseTensor>& out,  // NOLINT
+                   int src) {
+    const auto& dtype = in[0].dtype();
+    gloo::ScatterOptions opts(_context);
+    if (rank_ == src) {
+      GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size);
+    }
+    GENERATE_FUNC(dtype, set_output, opts, out[0]);
+    opts.setRoot(src);
+    opts.setTag(_tag);
+    gloo::scatter(opts);
+  }
+};
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const ScatterOptions& opts) {
+  std::shared_ptr<ScatterGlooTask> task;
+  auto tag = next_tag();
+  auto context = get_context();
+  task = std::make_shared<ScatterGlooTask>(
+      rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
+  task->Run();
+  return task;
+}
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
+  ::gloo::transport::tcp::attr attr;
+  attr.iface = ifname;
+  return ::gloo::transport::tcp::CreateDevice(attr);
+}
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
+  ::gloo::transport::tcp::attr attr;
+  attr.hostname = hostname;
+  return ::gloo::transport::tcp::CreateDevice(attr);
+}
+std::shared_ptr<::gloo::transport::Device>
+ProcessGroupGloo::createDefaultDevice() {
+  std::array<char, HOST_NAME_MAX> hostname{};
+  auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
+  PADDLE_ENFORCE_EQ(
+      ret,
+      0,
+      platform::errors::Fatal("Get hostname error for createDefaultDevice."));
+  ::addrinfo* result;
+  result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
+  ::addrinfo* cur;
+  for (cur = result; cur != nullptr; cur = cur->ai_next) {
+    SocketType socket =
+        ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
+    if (socket == -1) {
+      continue;
+    }
+    ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
+#ifdef _WIN32
+    closesocket(socket);
+#else
+    close(socket);
+#endif
+    if (ret == -1) {
+      continue;
+    }
+    break;
+  }
+  freeaddrinfo(result);
+  if (cur != nullptr) {
+    return createDeviceForHostname(hostname.data());
+  }
+  return createDeviceForHostname("127.0.0.1");
+}
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <future>
+#include <mutex>
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#ifdef PADDLE_WITH_GLOO
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+#include "paddle/fluid/distributed/store/store.h"
+#include "paddle/fluid/distributed/store/tcp_store.h"
+constexpr const char* GLOO_BACKEND_NAME = "GLOO";
+namespace paddle {
+namespace distributed {
+class ProcessGroupGloo : public ProcessGroup {
+ public:
+  class GlooTask : public ProcessGroup::Task,
+                   public std::enable_shared_from_this<GlooTask> {
+   public:
+    explicit GlooTask(int rank,
+                      const std::vector<phi::DenseTensor>& input_tensors,
+                      CommType comm_type);
+    ~GlooTask() = default;
+    virtual void Run() = 0;
+    bool Wait(std::chrono::milliseconds timeout) override { return true; }
+    bool IsCompleted() override { return true; }
+    void Synchronize() override {}
+   protected:
+    friend class ProcessGroupGloo;
+  };
+  class GlooStore : public ::gloo::rendezvous::Store {
+   public:
+    explicit GlooStore(const std::shared_ptr<paddle::distributed::Store>& store)
+        : _store(store) {}
+    ~GlooStore() = default;
+    std::vector<char> get(const std::string& key) override {
+      VLOG(3) << "GlooStore::get";
+      auto value = _store->get(key);
+      return std::vector<char>(value.begin(), value.end());
+    }
+    void wait(const std::vector<std::string>& keys) override {
+      VLOG(3) << "GlooStore::wait";
+      for (auto& key : keys) {
+        _store->wait(key);
+      }
+    }
+    void set(const std::string& key, const std::vector<char>& value) override {
+      VLOG(3) << "GlooStore::set";
+      std::vector<uint8_t> tmp(value.begin(), value.end());
+      _store->set(key, tmp);
+    }
+    void wait(const std::vector<std::string>& keys,
+              const std::chrono::milliseconds& timeout) override {
+      VLOG(3) << "GlooStore::wait";
+      for (auto& key : keys) {
+        _store->wait(key);
+      }
+      // wait(keys);
+    }
+   protected:
+    std::shared_ptr<paddle::distributed::Store> _store;
+  };
+  class GlooOptions {
+   public:
+    GlooOptions() = default;
+    ~GlooOptions() = default;
+    static std::shared_ptr<GlooOptions> create() {
+      return std::make_shared<GlooOptions>();
+    }
+    std::shared_ptr<::gloo::transport::Device> device;
+  };
+  explicit ProcessGroupGloo(
+      const std::shared_ptr<paddle::distributed::Store>& store,
+      int rank,
+      int world_size,
+      const platform::Place& place,
+      int gid,
+      std::shared_ptr<GlooOptions> options);
+  ~ProcessGroupGloo() = default;
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<phi::DenseTensor>& inputs,
+      std::vector<phi::DenseTensor>& outputs,
+      const BroadcastOptions& = BroadcastOptions()) override;
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& inputs,
+      std::vector<phi::DenseTensor>& outputs,
+      const AllreduceOptions& opts = AllreduceOptions()) override;
+  std::shared_ptr<ProcessGroup::Task> Barrier(
+      const BarrierOptions& = BarrierOptions()) override;
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors) override;
+  std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const ReduceOptions& opts) override;
+  std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const ScatterOptions&) override;
+  std::shared_ptr<::gloo::Context> get_context() { return _context; }
+  uint64_t next_tag() { return _tag++; }
+  const std::string GetBackendName() const override {
+    return GLOO_BACKEND_NAME;
+  }
+  // Helper functions for Gloo.
+  static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
+      const std::string& hostname);
+  static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
+      const std::string& ifname);
+  static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
+ protected:
+  uint32_t _tag;
+  std::shared_ptr<gloo::rendezvous::Context> _context;
+  std::shared_ptr<::gloo::rendezvous::Store> _store;
+};
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#include "paddle/fluid/distributed/collective/Common.h"
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+DECLARE_bool(hccl_blocking_wait);
+// DECLARE_bool(use_stream_safe_npu_allocator);
+constexpr int64_t kWaitBlockTImeout = 10;
+namespace paddle {
+namespace distributed {
+void SyncDefaultStream(
+    const std::vector<Place>& places,
+    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
+    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
+  for (size_t i = 0; i < places.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places[i]));
+    hcclEvents[i].Record(*dev_ctx[i]);
+    hcclEvents[i].Block(*default_ctx);
+  }
+}
+std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
+    std::vector<Place> places,
+    int rank,
+    CommType comm_type,
+    const std::vector<phi::DenseTensor>& inputs) {
+  return std::make_shared<ProcessGroupHCCL::HCCLTask>(
+      places, rank, comm_type, inputs);
+}
+ProcessGroupHCCL::HCCLTask::HCCLTask(
+    const std::vector<Place>& places,
+    int rank,
+    CommType CommType,
+    const std::vector<phi::DenseTensor>& inputs)
+    : Task(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  hcclComms_.resize(places.size());
+}
+ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
+void ProcessGroupHCCL::HCCLTask::SetOutputs(
+    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
+  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
+}
+void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places_[i]));
+    platform::NPUStreamWaitEvent(default_ctx->stream(),
+                                 control_events_[i].GetRawNPUEvent());
+  }
+}
+bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    if (!control_events_[i].Query()) {
+      return false;
+    }
+  }
+  return true;
+}
+// TODO(sandyhouse): Add timeout for wait, now timeout unused
+bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
+  SynchronizeStreams();
+  // NOTE(sandyhouse): It will block host for sync
+  while (!IsCompleted()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
+  }
+  return true;
+}
+// Same as Wait
+void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                                   int rank,
+                                   int size,
+                                   const platform::Place& place,
+                                   int gid)
+    : ProcessGroup(rank, size, place, gid), store_(store) {
+  platform::SetNPUDeviceId(place_.device);
+}
+void ProcessGroupHCCL::BroadcastUniqueHCCLID(
+    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
+  if (rank_ == 0) {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto hccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
+      store_->set(key, hccl_id);
+    }
+  } else {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto ret = store_->get(key);
+      std::memcpy(&hccl_ids[i], ret.data(), ret.size());
+    }
+  }
+}
+// create HCCLManager cache for places_key
+void ProcessGroupHCCL::CreateHCCLManagerCache(
+    const std::string& places_key, const std::vector<Place>& places) {
+  PADDLE_ENFORCE_EQ(places_key.empty(),
+                    false,
+                    platform::errors::PreconditionNotMet(
+                        "Not able to create/get the HCCL Communicator since "
+                        "the NPU place are not known"));
+  std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
+  hccl_comms.resize(places.size());
+  // using vector just for broadcast
+  std::vector<HcclRootInfo> hccl_ids;
+  hccl_ids.resize(1);
+  auto& hccl_id = hccl_ids.front();
+  if (rank_ == 0) {
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
+  }
+  BroadcastUniqueHCCLID(hccl_ids);
+  VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
+          << ", place: " << places_key
+          << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
+  std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
+  dev_ctx.resize(places.size());
+  std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
+  for (size_t i = 0; i < places.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    hccl_comms[i] = HCCLCommManager::Create(
+        GetSize(), GetRank(), &hccl_id, comms.get() + i);
+    dev_ctx[i].reset(new NPUDeviceContext(places[i]));
+  }
+  std::vector<NPUEventManager> events;
+  events.resize(places.size());
+  // These caches will be useful to process sync/wait/communicate
+  places_to_events_.emplace(places_key, std::move(events));
+  places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
+  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
+}
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    Fn fn,
+    CommType op_type) {
+  const auto places = GetPlaceList(inputs);
+  const auto key = GetKeyFromPlaces(places);
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
+      CreateHCCLManagerCache(key, places);
+    }
+  }
+  auto& hccl_comms = places_to_hcclcomm_[key];
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  auto task = CreateTask(places, rank_, op_type, inputs);
+  task->SetOutputs(outputs);
+  // if (FLAGS_use_stream_safe_npu_allocator) {
+  //   for (size_t i = 0; i < inputs.size(); ++i) {
+  //     platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+  //     auto dense_tensor =
+  //         std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
+  //     memory::RecordStream(dense_tensor->Holder(),
+  //                          places_to_ctx_[key][i]->stream());
+  //   }
+  // }
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
+    fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
+  }
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    task->control_events_[i].Record(*places_to_ctx_[key][i]);
+  }
+  return task;
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
+    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+    const AllreduceOptions& opts) {
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          HcclComm comm,
+          const aclrtStream& stream) {
+        return platform::dynload::HcclAllReduce(
+            input.data(),
+            output.data(),
+            input.numel(),
+            platform::ToHCCLDataType(input.dtype()),
+            ToHCCLRedType(opts.reduce_op),
+            comm,
+            stream);
+      },
+      CommType::ALLREDUCE);
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
+    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+    const BroadcastOptions& opts) {
+  // PADDLE_ENFORCE_EQ(
+  //     CheckTensorsInNPUPlace(tensors), true,
+  //     platform::errors::InvalidArgument("All inputs should be in
+  //     CudaPlace."));
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          HcclComm comm,
+          const aclrtStream& stream) {
+        int root = opts.source_rank * in_tensors.size() + opts.source_root;
+        if (rank_ == root) {
+          return platform::dynload::HcclBroadcast(
+              input.data(),
+              input.numel(),
+              platform::ToHCCLDataType(input.dtype()),
+              root,
+              comm,
+              stream);
+        } else {
+          return platform::dynload::HcclBroadcast(
+              output.data(),
+              output.numel(),
+              platform::ToHCCLDataType(output.dtype()),
+              root,
+              comm,
+              stream);
+        }
+      },
+      CommType::BROADCAST);
+}
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <chrono>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/store/store.h"
+#include "paddle/fluid/platform/device/npu/npu_stream.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
+constexpr const char* HCCL_BACKEND_NAME = "HCCL";
+namespace paddle {
+namespace distributed {
+using Place = paddle::platform::Place;
+using NPUStream = platform::stream::NPUStream;
+using NPUDeviceContext = paddle::platform::NPUDeviceContext;
+class ProcessGroupHCCL : public ProcessGroup {
+ public:
+  class HCCLTask : public ProcessGroup::Task,
+                   public std::enable_shared_from_this<HCCLTask> {
+   public:
+    HCCLTask(const std::vector<Place>& places,
+             int rank,
+             CommType CommType,
+             const std::vector<phi::DenseTensor>& inputs);
+    bool IsCompleted();
+    void SynchronizeStreams();
+    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
+    void Synchronize();
+    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
+    virtual ~HCCLTask();
+    std::vector<NPUEventManager> control_events_;
+   protected:
+    std::vector<Place> places_;
+    std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
+    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
+   private:
+  };
+  ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                   int rank,
+                   int size,
+                   const platform::Place& place,
+                   int gid);
+  const std::string GetBackendName() const override {
+    return std::string(HCCL_BACKEND_NAME);
+  }
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const AllreduceOptions& = AllreduceOptions()) override;
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      const BroadcastOptions& = BroadcastOptions()) override;
+ protected:
+  virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
+      std::vector<Place> places,
+      int rank,
+      CommType opType,
+      const std::vector<phi::DenseTensor>& inputs);
+  std::shared_ptr<Store> store_;
+  std::shared_ptr<HCCLCommManager> hccl_comm_;
+  std::mutex mutex_;
+  std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
+      places_to_hcclcomm_;
+  std::unordered_map<std::string, std::vector<NPUEventManager>>
+      places_to_events_;
+  std::unordered_map<std::string,
+                     std::vector<std::unique_ptr<NPUDeviceContext>>>
+      places_to_ctx_;
+  std::set<int> used_place_ids_;
+ private:
+  void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids,
+                   int root,  // NOLINT
+                   int server_fd);
+  void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids);  // NOLINT
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> Collective(
+      std::vector<phi::DenseTensor>& inputs,   // NOLINT
+      std::vector<phi::DenseTensor>& outputs,  // NOLINT
+      Fn fn,
+      CommType op_type);
+  void CreateHCCLManagerCache(const std::string& places_key,
+                              const std::vector<Place>& places);
+};
+}  //  namespace distributed
+}  //  namespace paddle
--- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
+#include <chrono>
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+constexpr int64_t kWaitBlockTImeout = 10;
+namespace paddle {
+namespace distributed {
+using Place = paddle::platform::Place;
+int ProcessGroupHeter::send_count = 0;
+int ProcessGroupHeter::recv_count = 0;
+std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
+    int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {
+  return std::make_shared<ProcessGroupHeter::HeterTask>(
+      rank, comm_type, inputs);
+}
+ProcessGroupHeter::HeterTask::HeterTask(
+    int rank, CommType CommType, const std::vector<phi::DenseTensor>& inputs)
+    : Task(rank, inputs, CommType) {}
+ProcessGroupHeter::HeterTask::~HeterTask() {}
+bool ProcessGroupHeter::HeterTask::IsCompleted() { return true; }
+// TODO(sheniang03): Add timeout for wait, now timeout unused
+bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
+  return true;
+}
+ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
+                                     int rank,
+                                     int size,
+                                     const platform::Place& place,
+                                     int gid,
+                                     int local_rank,
+                                     int local_size,
+                                     int gloo_rank,
+                                     int gloo_size,
+                                     bool with_switch,
+                                     std::string switch_endpoint,
+                                     int src_rank,
+                                     int dst_rank)
+    : ProcessGroup(rank, size, place, gid),
+      store_(store),
+      local_rank_(local_rank),
+      local_size_(local_size),
+      gloo_rank_(gloo_rank),
+      gloo_size_(gloo_size),
+      with_switch_(with_switch),
+      switch_endpoint_(switch_endpoint),
+      src_rank_(src_rank),
+      dst_rank_(dst_rank) {
+  return;
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  inner_pg_ = std::make_shared<ProcessGroupNCCL>(
+      store, local_rank, local_size, place_, IGNORE_ID);
+#elif defined(PADDLE_WITH_ASCEND_CL)
+  inner_pg_ = std::make_shared<ProcessGroupHCCL>(
+      store, local_rank, local_size, place_, IGNORE_ID);
+#else
+  PADDLE_THROW(platform::errors::Unavailable(
+      "ProcessGroupHeter only supports NCCL, RCCL and HCCL now."));
+#endif
+  if (local_rank_ == 0 && !with_switch_) {
+    auto opts = ProcessGroupGloo::GlooOptions::create();
+    opts->device = ProcessGroupGloo::createDefaultDevice();
+    inter_pg_ = std::make_shared<ProcessGroupGloo>(
+        store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts);
+  }
+}
+template <typename T>
+static void _do_add(T* dst, T* src, size_t size) {
+  for (size_t i = 0; i < size; i++) {
+    *dst += *src;
+    dst++;
+    src++;
+  }
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const AllreduceOptions& opts) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors),
+      true,
+      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
+#endif
+  // Step1: do allreduce in inner cluster
+  auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts);
+  task->Wait();
+  // Step2: copy tensors to CPU
+  if (local_rank_ == 0) {
+    std::vector<phi::DenseTensor> cpu_tensors;
+    cpu_tensors.reserve(in_tensors.size());
+    phi::DenseTensor cpu_tensor;
+    for (size_t i = 0; i < in_tensors.size(); i++) {
+      auto gpu_tensor = in_tensors[i];
+      cpu_tensor.Resize(gpu_tensor.dims());
+      framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
+      cpu_tensors.push_back(cpu_tensor);
+    }
+    // Step3: do inter cluster allreduce
+    if (with_switch_) {
+      if (local_rank_ == 0) {
+        HeterClient* client_ =
+            HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+        auto dense_cpu_tensor = cpu_tensors[0];
+        std::vector<int64_t> send_size;
+        send_size.push_back(dense_cpu_tensor.numel());
+        int ret = client_->Send(
+            gid_,
+            {dense_cpu_tensor.name()},
+            send_size,
+            dense_cpu_tensor.data(),
+            dense_cpu_tensor.numel() *
+                framework::DataTypeSize(dense_cpu_tensor.dtype()));
+        PADDLE_ENFORCE_EQ(ret,
+                          0,
+                          platform::errors::PreconditionNotMet(
+                              "Send to the switch module error."));
+        phi::DenseTensor cpu_tensor2;
+        cpu_tensor2.AllocateFrom(
+            std::make_unique<paddle::experimental::DefaultAllocator>(
+                paddle::platform::CPUPlace())
+                .get(),
+            dense_cpu_tensor.dtype(),
+            dense_cpu_tensor.numel());
+        ret = client_->Recv(
+            gid_,
+            {dense_cpu_tensor.name()},
+            cpu_tensor2.data(),
+            cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
+        PADDLE_ENFORCE_EQ(ret,
+                          0,
+                          platform::errors::PreconditionNotMet(
+                              "Recv from the switch module error."));
+        switch (dense_cpu_tensor.dtype()) {
+          case DataType::FLOAT32:
+            _do_add<float>(reinterpret_cast<float*>(dense_cpu_tensor.data()),
+                           reinterpret_cast<float*>(cpu_tensor2.data()),
+                           dense_cpu_tensor.numel());
+            break;
+          case DataType::FLOAT64:
+            _do_add<double>(reinterpret_cast<double*>(dense_cpu_tensor.data()),
+                            reinterpret_cast<double*>(cpu_tensor2.data()),
+                            dense_cpu_tensor.numel());
+            break;
+          case DataType::INT32:
+            _do_add<int>(reinterpret_cast<int*>(dense_cpu_tensor.data()),
+                         reinterpret_cast<int*>(cpu_tensor2.data()),
+                         dense_cpu_tensor.numel());
+            break;
+          default:
+            PADDLE_THROW(platform::errors::PreconditionNotMet(
+                "Unsupported data type (%s) to do add.",
+                framework::DataType2String(dense_cpu_tensor.dtype())));
+        }
+      }
+    } else {
+      auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts);
+      gloo_task->Wait();
+    }
+    // Step4: copy cpu tensors to gpu
+    // copy cpu tensors to gpu
+    for (size_t i = 0; i < in_tensors.size(); i++) {
+      auto gpu_tensor = out_tensors[i];
+      auto cpu_tensor = cpu_tensors[i];
+      framework::TensorCopySync(cpu_tensor, cpu_tensor.place(), &gpu_tensor);
+    }
+  }
+  // Step5: broadcast among inner cluster
+  auto b_opts = BroadcastOptions();
+  b_opts.source_rank = 0;
+  auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
+  broadcast_task->Wait();
+  return CreateTask(rank_, CommType::ALLREDUCE, in_tensors);
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const BroadcastOptions& opts) {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors),
+      true,
+      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
+#endif
+  // Step1: do broadcast in inner cluster
+  auto b_opts = BroadcastOptions();
+  b_opts.source_rank = 0;
+  inner_pg_->Broadcast(in_tensors, out_tensors, b_opts);
+  if (local_rank_ == 0) {
+    std::vector<phi::DenseTensor> cpu_tensors;
+    cpu_tensors.reserve(in_tensors.size());
+    for (size_t i = 0; i < in_tensors.size(); i++) {
+      auto gpu_tensor = in_tensors[i];
+      phi::DenseTensor cpu_tensor;
+      cpu_tensor.Resize(gpu_tensor.dims());
+      framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
+      cpu_tensors.push_back(cpu_tensor);
+    }
+    if (with_switch_) {
+      if (local_rank_ == 0) {
+        HeterClient* client_ =
+            HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+        auto dense_cpu_tensor = cpu_tensors[0];
+        if (gloo_rank_ == 0) {
+          std::vector<int64_t> send_size;
+          send_size.push_back(dense_cpu_tensor.numel());
+          int ret = client_->Send(
+              gid_,
+              {dense_cpu_tensor.name()},
+              send_size,
+              dense_cpu_tensor.data(),
+              dense_cpu_tensor.numel() *
+                  framework::DataTypeSize(dense_cpu_tensor.dtype()));
+          PADDLE_ENFORCE_EQ(ret,
+                            0,
+                            platform::errors::PreconditionNotMet(
+                                "Send to the switch module error."));
+        } else {
+          int ret = client_->Recv(
+              gid_,
+              {dense_cpu_tensor.name()},
+              dense_cpu_tensor.data(),
+              dense_cpu_tensor.numel() *
+                  framework::DataTypeSize(dense_cpu_tensor.dtype()));
+          PADDLE_ENFORCE_EQ(ret,
+                            0,
+                            platform::errors::PreconditionNotMet(
+                                "Receive from the switch module error."));
+        }
+      }
+    } else {
+      auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts);
+      gloo_task->Wait();
+    }
+    for (size_t i = 0; i < in_tensors.size(); i++) {
+      auto gpu_tensor = out_tensors[i];
+      auto cpu_tensor = cpu_tensors[i];
+      framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
+    }
+  }
+  auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
+  broadcast_task->Wait();
+  return CreateTask(rank_, CommType::BROADCAST, in_tensors);
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
+    std::vector<phi::DenseTensor>& in_tensors, int peer) {
+  PADDLE_ENFORCE_EQ(
+      in_tensors.size(),
+      1,
+      platform::errors::PreconditionNotMet(
+          "For each send operation, there can only be one tensor to send."));
+  // Copy Tensor to cpu
+  auto start = std::chrono::high_resolution_clock::now();
+  phi::DenseTensor cpu_tensor;
+  auto& gpu_tensor = in_tensors[0];
+  framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
+  PADDLE_ENFORCE_EQ(with_switch_,
+                    true,
+                    platform::errors::PreconditionNotMet(
+                        "Gloo does not support the send operation."));
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = end - start;
+  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
+          << ") from gpu to cpu for send " << std::setw(9)
+          << " is: " << diff.count() << " s" << std::endl;
+  // Send to switch
+  HeterClient* client_ =
+      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+  int64_t tensor_size =
+      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
+  std::vector<int64_t> send_size;
+  send_size.push_back(tensor_size);
+  auto id = src_rank_ * 10000 + dst_rank_;
+  std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
+                            std::string("_") + std::to_string(send_count++);
+  VLOG(2) << "tensor_name:" << tensor_name;
+  int ret = client_->Send(
+      gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size);
+  PADDLE_ENFORCE_EQ(
+      ret,
+      0,
+      platform::errors::PreconditionNotMet("Send to the switch module error."));
+  return CreateTask(rank_, CommType::SEND, in_tensors);
+}
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
+    std::vector<phi::DenseTensor>& out_tensors, int peer) {
+  PADDLE_ENFORCE_EQ(
+      out_tensors.size(),
+      1,
+      platform::errors::PreconditionNotMet(
+          "For each rece operation, there can only be one tensor to receive."));
+  // Copy Tensor to cpu
+  phi::DenseTensor cpu_tensor;
+  auto& gpu_tensor = out_tensors[0];
+  cpu_tensor.Resize(gpu_tensor.dims());
+  cpu_tensor.set_layout(gpu_tensor.layout());
+  cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());
+  PADDLE_ENFORCE_EQ(with_switch_,
+                    true,
+                    platform::errors::PreconditionNotMet(
+                        "Gloo does not support the send operation."));
+  // recv from switch
+  HeterClient* client_ =
+      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+  auto id = src_rank_ * 10000 + dst_rank_;
+  std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
+                            std::string("_") + std::to_string(recv_count++);
+  VLOG(2) << "tensor_name: " << tensor_name;
+  auto start = std::chrono::high_resolution_clock::now();
+  int ret = client_->Recv(
+      gid_,
+      {tensor_name},
+      cpu_tensor.data(),
+      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
+  PADDLE_ENFORCE_EQ(ret,
+                    0,
+                    platform::errors::PreconditionNotMet(
+                        "receive to the switch module error."));
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = end - start;
+  double goodput = cpu_tensor.numel() *
+                   framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
+  VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl;
+  start = std::chrono::high_resolution_clock::now();
+  framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
+  end = std::chrono::high_resolution_clock::now();
+  diff = end - start;
+  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
+          << ") from cpu to gpu for recv " << std::setw(9)
+          << " is: " << diff.count() << " s" << std::endl;
+  return CreateTask(rank_, CommType::RECV, out_tensors);
+}
+}  // namespace distributed
+}  // namespace paddle