Commit de2e6515 authored by yuguo960516yuguo

2.4.1-dtk-23.04

parent ad08b8ce
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
#include <iostream>
#include <sstream>
#include "gtest/gtest.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
TEST(ProcessMesh, Ctor) {
std::vector<int64_t> shape = {2, 3};
std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
std::vector<std::string> dim_names = {"x", "y"};
int64_t size = shape[0] * shape[1];
ProcessMesh process_mesh(shape, process_ids, dim_names);
EXPECT_EQ(process_mesh.shape(), shape);
EXPECT_EQ(process_mesh.process_ids(), process_ids);
EXPECT_EQ(process_mesh.dim_names()[0], "x");
EXPECT_EQ(process_mesh.dim_names()[1], "y");
EXPECT_EQ(process_mesh.size(), size);
EXPECT_EQ(process_mesh.ndim(), static_cast<int64_t>(shape.size()));
EXPECT_EQ(process_mesh.dim_size(0), shape[0]);
EXPECT_EQ(process_mesh.dim_size(-1), shape[1]);
EXPECT_EQ(process_mesh.dim_size("x"), shape[0]);
EXPECT_EQ(process_mesh.dim_size("y"), shape[1]);
EXPECT_EQ(process_mesh.empty(), false);
EXPECT_EQ(process_mesh.contains(0), true);
EXPECT_EQ(process_mesh.contains(6), false);
std::stringstream sstream;
sstream << process_mesh;
EXPECT_EQ(sstream.str(), process_mesh.to_string());
auto proto = process_mesh.to_proto();
ProcessMesh new_process_mesh = ProcessMesh::from_proto(proto);
EXPECT_EQ(process_mesh, new_process_mesh);
}
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
// struct Indent {
// Indent(int &level) : level(level) { ++level; }
// ~Indent() { --level; }
// int &level;
// };
// inline std::string str_indent(std::string& str, cur_indent) {
// string spaces(cur_indent, " ");
// return str + std::string(cur_indent, " ");
// }
template <class T>
bool has_duplicates(const std::vector<T>& vec) {
std::unordered_map<T, int> map;
for (const auto& i : vec) {
++map[i];
if (map[i] > 1) return true;
}
return false;
}
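// Illustrative examples of the helper above:
//   has_duplicates(std::vector<int>{1, 2, 3})  -> false
//   has_duplicates(std::vector<int>{1, 2, 2})  -> true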
inline int64_t canonical_dim(int dim, int ndim) {
PADDLE_ENFORCE_EQ(
dim >= -ndim && dim < ndim,
true,
platform::errors::InvalidArgument(
"Dimension %d is outside of [-%d, %d).", dim, ndim, ndim));
if (dim < 0) {
return dim + ndim;
}
return dim;
}
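// Illustrative examples: negative dims are counted from the end, so for a
// 2-D mesh canonical_dim(-1, 2) == 1 and canonical_dim(0, 2) == 0; a dim
// outside [-ndim, ndim) trips the enforce above.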
// Refer to https://stackoverflow.com/a/5289170
template <typename Range, typename Value = typename Range::value_type>
std::string str_join(Range const& elements,
const std::string& delimiter = ",") {
std::ostringstream os;
auto b = std::begin(elements), e = std::end(elements);
if (b != e) {
std::copy(b, std::prev(e), std::ostream_iterator<Value>(os, delimiter.c_str()));
b = std::prev(e);
}
if (b != e) {
os << *b;
}
return os.str();
}
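// Illustrative examples: str_join(std::vector<int>{1, 2, 3}) yields "1,2,3",
// and str_join(std::vector<int>{1, 2, 3}, "/") yields "1/2/3".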
inline std::string str_join(std::map<std::string, bool> const& elements,
const std::string& delimiter = ",") {
std::string str;
for (const auto& item : elements) {
str += item.first + ": " + std::to_string(item.second) + ", ";
}
// Drop the trailing ", " appended after the last element, if any.
return str.empty() ? str : str.substr(0, str.size() - 2);
}
// Refer to https://stackoverflow.com/a/46931770
inline std::vector<std::string> str_split(std::string const& input,
const std::string& delimiter = ",") {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
std::string token;
std::vector<std::string> output;
while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) {
token = input.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
output.push_back(token);
}
output.push_back(input.substr(pos_start));
return output;
}
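// Illustrative examples: str_split("2,3") yields {"2", "3"}; a trailing
// delimiter produces a trailing empty string, e.g. str_split("2,3,") yields
// {"2", "3", ""}.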
// Refer to https://stackoverflow.com/a/29200671/2358969
template <typename T>
std::string to_string_with_precision(const T a_value, const int n = 2) {
std::ostringstream out;
out.precision(n);
out << std::fixed << a_value;
return out.str();
}
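// Illustrative examples: to_string_with_precision(3.14159) yields "3.14" and
// to_string_with_precision(3.14159, 4) yields "3.1416" (std::fixed rounding).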
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
cc_library(
processgroup
SRCS ProcessGroup.cc
DEPS dense_tensor)
cc_library(
processgroup_stream
SRCS ProcessGroupStream.cc
DEPS dense_tensor)
cc_library(
eager_reducer
SRCS reducer.cc
DEPS eager_api processgroup processgroup_stream phi_api string_helper)
if(WITH_DISTRIBUTE)
cc_library(
processgroup_gloo
SRCS ProcessGroupGloo.cc
DEPS phi_api eager_api gloo_wrapper)
endif()
if(WITH_NCCL OR WITH_RCCL)
cc_library(
processgroup_nccl
SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
DEPS processgroup
processgroup_stream
place
enforce
collective_helper
device_context
dense_tensor)
if(WITH_DISTRIBUTE AND WITH_PSCORE)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
set_source_files_properties(
ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
endif()
cc_library(
processgroup_heter
SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
DEPS place enforce collective_helper device_context phi_api eager_api)
endif()
endif()
if(WITH_ASCEND_CL)
cc_library(
processgroup_hccl
SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
DEPS place
npu_stream
enforce
collective_helper
device_context
phi_api
eager_api)
if(WITH_DISTRIBUTE AND WITH_PSCORE)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
set_source_files_properties(
ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
endif()
cc_library(
processgroup_heter
SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
DEPS place
npu_stream
enforce
collective_helper
device_context
phi_api
eager_api)
endif()
endif()
if(WITH_CUSTOM_DEVICE)
cc_library(
processgroup_custom
SRCS ProcessGroupCustom.cc CustomCCLTools.cc Common.cc
DEPS phi_backends
place
enforce
collective_helper
device_context
phi_api
eager_api)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/Common.h"
namespace paddle {
namespace distributed {
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors) {
std::vector<Place> places;
places.reserve(tensors.size());
for (auto& tensor : tensors) {
places.push_back(tensor.place());
}
return places;
}
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
std::string placeList;
for (auto& place : places) {
std::stringstream tmp;
tmp << place;
if (placeList.empty()) {
placeList += tmp.str();
} else {
placeList += "," + tmp.str();
}
}
return placeList;
}
bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors) {
return std::all_of(
tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
return platform::is_gpu_place(t.place());
});
}
bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
const std::string& dev_type) {
return std::all_of(
tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
return platform::places_are_same_class(
t.place(), paddle::platform::CustomPlace(dev_type));
});
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
// Get the list of devices from list of tensors
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);
// Get the deviceList String from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places);
bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);
bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
const std::string& dev_type);
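// Illustrative example (assuming two GPU places): GetKeyFromPlaces joins the
// printed form of each place with ',', producing a key along the lines of
// "Place(gpu:0),Place(gpu:1)" that the process group implementations use to
// cache per-place communicators and events.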
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, phi::ccl::CCLReduceOp> red_type = {
{ReduceOp::MIN, phi::ccl::CCLReduceOp::MIN},
{ReduceOp::MAX, phi::ccl::CCLReduceOp::MAX},
{ReduceOp::SUM, phi::ccl::CCLReduceOp::SUM},
{ReduceOp::PRODUCT, phi::ccl::CCLReduceOp::PRODUCT},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(
it != red_type.end(),
true,
platform::errors::InvalidArgument("Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"));
return it->second;
}
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id) {
const uint8_t* bytes = ccl_id.data();
std::ostringstream oss;
for (size_t i = 0; i < ccl_id.size(); ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
namespace paddle {
namespace distributed {
class CustomEventManager {
public:
CustomEventManager() = default;
~CustomEventManager() {
if (is_created_) {
event_->Destroy();
}
}
CustomEventManager(const CustomEventManager&) = delete;
CustomEventManager& operator=(const CustomEventManager&) = delete;
CustomEventManager(CustomEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(device_type_, other.device_type_);
std::swap(event_, other.event_);
}
CustomEventManager& operator=(CustomEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(device_type_, other.device_type_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
std::string DeviceType() const { return device_type_; }
phi::event::event_t GetRawCustomEvent() const { return event_->raw_event(); }
phi::event::Event* GetCustomEvent() const { return event_.get(); }
void Record(const paddle::platform::CustomDeviceContext& ctx) {
auto place = ctx.GetPlace();
auto device_type = place.GetDeviceType();
auto device_index = place.GetDeviceId();
if (!is_created_) {
CreateEvent(place);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
PADDLE_ENFORCE_EQ(device_type,
device_type_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device type %d",
device_type,
device_type_));
phi::DeviceGuard guard(place);
phi::stream::Stream stream(place, ctx.stream());
event_->Record(&stream);
}
bool Query() const { return event_->Query(); }
void Block(const paddle::platform::CustomDeviceContext& ctx) const {
if (is_created_) {
auto place = ctx.GetPlace();
auto device_type = place.GetDeviceType();
auto device_index = place.GetDeviceId();
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
PADDLE_ENFORCE_EQ(device_type,
device_type_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device type %d",
device_type,
device_type_));
phi::DeviceGuard guard(place);
phi::stream::Stream stream(place, ctx.stream());
stream.WaitEvent(event_.get());
}
}
private:
bool is_created_{false};
std::shared_ptr<phi::event::Event> event_{nullptr};
int8_t device_index_{0};
std::string device_type_;
private:
void CreateEvent(const platform::Place& place) {
device_index_ = place.GetDeviceId();
device_type_ = place.GetDeviceType();
event_.reset(new phi::event::Event);
event_->Init(place);
is_created_ = true;
}
};
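// A minimal usage sketch of the wrapper above (illustrative only; `ctx` stands
// for a platform::CustomDeviceContext on the target custom device):
//   CustomEventManager ev;
//   ev.Record(ctx);        // lazily creates the event on the first Record()
//   ev.Block(ctx);         // makes ctx's stream wait on the recorded event
//   bool done = ev.Query();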
class CustomCCLCommManager {
public:
CustomCCLCommManager(const std::string& device_type,
phi::ccl::CCLComm ccl_comm)
: device_type_(device_type), ccl_comm_(ccl_comm) {}
CustomCCLCommManager() : CustomCCLCommManager("", nullptr) {}
~CustomCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (ccl_comm_) {
phi::DeviceManager::CCLDestroyComm(device_type_, ccl_comm_);
}
}
static std::shared_ptr<CustomCCLCommManager> Create(
const std::string& device_type,
int num_ranks,
int rank,
phi::ccl::CCLRootId* comm_id,
phi::ccl::CCLComm* ccl_comm) {
auto custom_ccl_manager = std::make_shared<CustomCCLCommManager>();
phi::DeviceManager::CCLCommInitRank(
device_type, num_ranks, comm_id, rank, ccl_comm);
custom_ccl_manager->device_type_ = device_type;
custom_ccl_manager->ccl_id_ = comm_id;
custom_ccl_manager->rank_ = rank;
custom_ccl_manager->ccl_comm_ = *ccl_comm;
return custom_ccl_manager;
}
phi::ccl::CCLRootId* GetCustomCCLId() const {
std::unique_lock<std::mutex> lock(mutex_);
return ccl_id_;
}
phi::ccl::CCLComm GetCustomCCLComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return ccl_comm_;
}
CustomCCLCommManager(const CustomCCLCommManager&) = delete;
CustomCCLCommManager& operator=(const CustomCCLCommManager&) = delete;
CustomCCLCommManager& operator=(CustomCCLCommManager&& other) = delete;
CustomCCLCommManager(CustomCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(ccl_comm_, other.ccl_comm_);
}
protected:
std::string device_type_;
phi::ccl::CCLComm ccl_comm_;
phi::ccl::CCLRootId* ccl_id_;
int rank_;
mutable std::mutex mutex_;
};
phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction);
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, HcclReduceOp> red_type = {
{ReduceOp::MIN, HCCL_REDUCE_MIN},
{ReduceOp::MAX, HCCL_REDUCE_MAX},
{ReduceOp::SUM, HCCL_REDUCE_SUM},
{ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(
it != red_type.end(),
true,
platform::errors::InvalidArgument("Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"));
return it->second;
}
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
std::ostringstream oss;
for (size_t i = 0; i < sizeof(hcclID); ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
namespace paddle {
namespace distributed {
class NPUEventManager {
public:
NPUEventManager() = default;
~NPUEventManager() {
if (is_created_) {
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventDestroy(event_);
}
}
NPUEventManager(const NPUEventManager&) = delete;
NPUEventManager& operator=(const NPUEventManager&) = delete;
NPUEventManager(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
NPUEventManager& operator=(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
aclrtEvent GetRawNPUEvent() const { return event_; }
void Record(const paddle::platform::NPUDeviceContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"NPUDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventRecord(event_, ctx.stream());
}
bool Query() const {
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event_, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
return true;
}
return false;
}
void Block(const paddle::platform::NPUDeviceContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUStreamWaitEvent(ctx.stream(), event_);
}
}
private:
bool is_created_{false};
aclrtEvent event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::NPUDeviceGuard guard(device_index);
platform::NPUEventCreate(&event_);
is_created_ = true;
}
};
class HCCLCommManager {
public:
explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
HCCLCommManager() : HCCLCommManager(nullptr) {}
~HCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (hccl_comm_) {
platform::dynload::HcclCommDestroy(hccl_comm_);
}
}
static std::shared_ptr<HCCLCommManager> Create(int num_ranks,
int rank,
HcclRootInfo* comm_id,
HcclComm hccl_comm) {
auto hccl_manager = std::make_shared<HCCLCommManager>();
auto ret = platform::dynload::HcclCommInitRootInfo(
num_ranks, comm_id, rank, &hccl_comm);
using __NPU_STATUS_TYPE__ = decltype(ret);
constexpr auto __success_type__ =
platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
if (UNLIKELY(ret != __success_type__)) {
VLOG(0) << "Error: create hccl_id error.";
exit(-1);
}
hccl_manager->hccl_id_ = comm_id;
hccl_manager->rank_ = rank;
hccl_manager->hccl_comm_ = hccl_comm;
return hccl_manager;
}
HcclRootInfo* GetHcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_id_;
}
HcclComm GetHcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_comm_;
}
HCCLCommManager(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
HCCLCommManager(HCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(hccl_comm_, other.hccl_comm_);
}
protected:
HcclComm hccl_comm_;
HcclRootInfo* hccl_id_;
int rank_;
mutable std::mutex mutex_;
};
HcclReduceOp ToHCCLRedType(ReduceOp reduction);
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, ncclRedOp_t> red_type = {
{ReduceOp::MIN, ncclMin},
{ReduceOp::MAX, ncclMax},
{ReduceOp::SUM, ncclSum},
{ReduceOp::PRODUCT, ncclProd},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(it != red_type.end(),
true,
platform::errors::InvalidArgument(
"Invalid nccl reduction. Must be ncclMin | ncclMax | "
"ncclProd | ncclSum"));
return it->second;
}
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
std::ostringstream oss;
for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
namespace paddle {
namespace distributed {
#define NCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
platform::dynload::ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// NOTE(shenliang03): EventManager is a movable, non-copyable wrapper around a
// CUDA event. It differs from paddle::platform::CudaEvent in that it uses
// lazy initialization: the event is only created the first time Record() is
// called. It also tracks device information to ensure that the recorded
// stream and the event are on the same device.
class EventManager {
public:
EventManager() {}
explicit EventManager(unsigned int flags) : flags_{flags} {}
~EventManager() {
if (is_created_) {
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
}
EventManager(const EventManager&) = delete;
EventManager& operator=(const EventManager&) = delete;
EventManager(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
EventManager& operator=(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
gpuEvent_t GetRawCudaEvent() const { return event_; }
void Record(const phi::GPUContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
#endif
}
bool Query() const {
#ifdef PADDLE_WITH_HIP
gpuError_t err = hipEventQuery(event_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif
PADDLE_ENFORCE_GPU_SUCCESS(err);
return false;
}
void Synchronize() const {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
}
}
void Block(const phi::GPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
#endif
}
}
private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDefault;
#else
unsigned int flags_ = cudaEventDefault;
#endif
bool is_created_{false};
gpuEvent_t event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::CUDADeviceGuard guard(device_index);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
#endif
is_created_ = true;
}
};
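// A minimal usage sketch of the wrapper above (illustrative only; `comm_ctx`
// and `compute_ctx` stand for phi::GPUContext instances on the same device):
//   EventManager ev;
//   ev.Record(comm_ctx);     // lazily creates the event on the first Record()
//   ev.Block(compute_ctx);   // compute_ctx's stream waits on the event
//   ev.Synchronize();        // or block the host until the event completes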
// NOTE(shenliang03): NCCLCommManager is more lightweight than
// platform::NCCLComm
class NCCLCommManager {
public:
explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}
NCCLCommManager() : NCCLCommManager(nullptr) {}
~NCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (nccl_comm_) {
platform::dynload::ncclCommDestroy(nccl_comm_);
}
}
static std::shared_ptr<NCCLCommManager> Create(int num_ranks,
int rank,
ncclUniqueId comm_id) {
auto nccl_manager = std::make_shared<NCCLCommManager>();
NCCLCHECK(platform::dynload::ncclCommInitRank(
&(nccl_manager->nccl_comm_), num_ranks, comm_id, rank));
nccl_manager->nccl_id_ = comm_id;
nccl_manager->rank_ = rank;
return nccl_manager;
}
ncclUniqueId GetNcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_id_;
}
ncclComm_t GetNcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_comm_;
}
NCCLCommManager(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(NCCLCommManager&& other) = delete;
NCCLCommManager(NCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(nccl_comm_, other.nccl_comm_);
}
protected:
ncclComm_t nccl_comm_;
ncclUniqueId nccl_id_;
int rank_;
mutable std::mutex mutex_;
};
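// A minimal creation sketch (illustrative only; in practice rank 0 obtains the
// ncclUniqueId via platform::dynload::ncclGetUniqueId and shares it with the
// other ranks out of band, e.g. through a Store):
//   ncclUniqueId id;  // filled on rank 0 and broadcast to every rank
//   auto comm = NCCLCommManager::Create(/*num_ranks=*/world_size, rank, id);
//   ncclComm_t raw_comm = comm->GetNcclComm();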
ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace paddle {
namespace distributed {
ProcessGroup::Task::Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type)
: rank_(rank), comm_type_(comm_type) {}
ProcessGroup::Task::Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type,
bool sync_op)
: rank_(rank), comm_type_(comm_type), sync_op_(sync_op) {}
ProcessGroup::Task::~Task() = default;
bool ProcessGroup::Task::IsCompleted() {
std::lock_guard<std::mutex> lock(mutex_);
return is_completed_;
}
bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
return false;
}
void ProcessGroup::Task::Synchronize() {}
ProcessGroup::ProcessGroup(int rank,
int size,
const platform::Place& place,
int gid)
: rank_(rank), size_(size), place_(place), gid_(gid) {
if (gid != IGNORE_ID) {
auto map = ProcessGroupMapFromGid::getInstance();
map->insert(gid_, this);
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
namespace paddle {
namespace distributed {
constexpr int IGNORE_ID = -1;
using Tensor = paddle::experimental::Tensor;
enum class CommType : std::uint8_t {
BROADCAST = 0,
ALLREDUCE = 1,
ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce
REDUCE = 3,
ALLGATHER = 4,
GATHER = 5,
SCATTER = 6,
REDUCE_SCATTER = 7,
ALLTOALL = 8,
SEND = 9,
RECV = 10,
BARRIER = 11,
ALLTOALL_SINGLE = 12,
UNKNOWN = 100,
};
class ProcessGroup {
public:
class Task {
public:
Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type);
Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type,
bool sync_op);
virtual ~Task();
virtual bool IsCompleted();
virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
virtual void Synchronize();
bool IsSync() const { return sync_op_; }
protected:
const int rank_;
CommType comm_type_{CommType::UNKNOWN};
std::mutex mutex_;
bool is_completed_{false};
private:
bool sync_op_{true};
};
explicit ProcessGroup(int rank,
int size,
const platform::Place& place,
int gid);
virtual ~ProcessGroup() {}
int GetRank() const { return rank_; }
int GetSize() const { return size_; }
virtual const std::string GetBackendName() const = 0;
virtual phi::DeviceContext* GetDeviceContext(const Place& place) const {
PADDLE_THROW(platform::errors::InvalidArgument(
"Does not support to get device_context from ProcessGroup%s.",
GetBackendName()));
}
// TODO(liyurui): This API will be moved later
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const AllreduceOptions& = AllreduceOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const AllreduceOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const BroadcastOptions& = BroadcastOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support broadcast", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const BroadcastOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support broadcast with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support barrier", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>&, int) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>&, int, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>&, int) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>&, int, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor&, // NOLINT
int,
int64_t,
int64_t) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send_partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send_partial with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor&, // NOLINT
int,
int64_t,
int64_t) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv_partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv_partial with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support all_gather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support all_gather with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
int64_t offset,
int64_t length) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
int64_t offset,
int64_t length,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support alltoall", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<int64_t>&,
std::vector<int64_t>&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<int64_t>&,
std::vector<int64_t>&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support alltoall_single", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ReduceOptions& opts) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const ReduceOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ScatterOptions&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support scatter", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ScatterOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support scatter with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ReduceScatterOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce_scatter with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support ReduceScatter", GetBackendName()));
}
protected:
const int rank_;
const int size_;
const platform::Place place_;
const int gid_;
};
class ProcessGroupMapFromGid {
public:
bool has(int gid) {
auto it = map_.find(gid);
return it != map_.end();
}
void insert(int gid, ProcessGroup* pg) {
// TODO(sandyhouse): address ut and uncomment the following codes
// PADDLE_ENFORCE_EQ(has(gid), false,
// platform::errors::PreconditionNotMet(
// "The process group with id %d doesnot exist.",
// gid));
map_[gid] = pg;
}
ProcessGroup* get(int gid) {
// TODO(sandyhouse): address ut and uncomment the following codes
// PADDLE_ENFORCE_EQ(has(gid), true,
// platform::errors::PreconditionNotMet(
// "The process group with id %d doesnot exist.",
// gid));
return map_.find(gid)->second;
}
static std::shared_ptr<ProcessGroupMapFromGid> getInstance() {
static auto s_instance = std::make_shared<ProcessGroupMapFromGid>();
return s_instance;
}
ProcessGroupMapFromGid() = default;
~ProcessGroupMapFromGid() = default;
private:
std::unordered_map<int, ProcessGroup*> map_;
};
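// Typical lookup sketch (illustrative): the singleton maps a group id to its
// ProcessGroup, so callers can do
//   auto map = ProcessGroupMapFromGid::getInstance();
//   if (map->has(gid)) { ProcessGroup* pg = map->get(gid); /* ... */ }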
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(xccl_blocking_wait);
constexpr int64_t kWaitBlockTimeout = 10;
namespace paddle {
namespace distributed {
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<CustomEventManager>& cclEvents, // NOLINT
std::vector<std::unique_ptr<CustomDeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
cclEvents[i].Record(*dev_ctx[i]);
cclEvents[i].Block(*default_ctx);
}
}
std::shared_ptr<ProcessGroupCustom::CustomTask> ProcessGroupCustom::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupCustom::CustomTask>(
places, rank, comm_type, inputs);
}
ProcessGroupCustom::CustomTask::CustomTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
cclComms_.resize(places.size());
}
ProcessGroupCustom::CustomTask::~CustomTask() {}
void ProcessGroupCustom::CustomTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupCustom::CustomTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
phi::DeviceGuard guard(default_ctx->GetPlace());
phi::stream::Stream stream(default_ctx->GetPlace(), default_ctx->stream());
stream.WaitEvent(control_events_[i].GetCustomEvent());
}
}
bool ProcessGroupCustom::CustomTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTimeout));
}
return true;
}
// Same as Wait
void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupCustom::ProcessGroupCustom(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid),
store_(store),
device_type_(place.GetDeviceType()) {
phi::DeviceManager::SetDevice(place_);
}
void ProcessGroupCustom::BroadcastUniqueCustomID(
std::vector<phi::ccl::CCLRootId>& ccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
store_->set(key, ccl_ids[i]);
}
} else {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
ccl_ids[i] = store_->get(key);
}
}
}
// create CustomCCLManager cache for places_key
void ProcessGroupCustom::CreateCustomManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
platform::errors::PreconditionNotMet(
"Not able to create/get the HCCL Communicator since "
"the NPU place are not known"));
const std::string device_type = places.back().GetDeviceType();
std::vector<std::shared_ptr<CustomCCLCommManager>> ccl_comms;
ccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<phi::ccl::CCLRootId> ccl_ids;
ccl_ids.resize(1);
auto& ccl_id = ccl_ids.front();
if (rank_ == 0) {
phi::DeviceManager::CCLGetUniqueId(device_type, &ccl_id);
}
BroadcastUniqueCustomID(ccl_ids);
VLOG(3) << "init custom ccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", custom ccl uniqueid: " << SerializeCustomCCLUniqueId(ccl_id);
std::vector<std::unique_ptr<CustomDeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
std::unique_ptr<phi::ccl::CCLComm[]> comms(
new phi::ccl::CCLComm[places.size()]);
for (size_t i = 0; i < places.size(); ++i) {
phi::DeviceGuard guard(places[i]);
ccl_comms[i] = CustomCCLCommManager::Create(
device_type, GetSize(), GetRank(), &ccl_id, comms.get() + i);
dev_ctx[i].reset(new CustomDeviceContext(places[i]));
}
std::vector<CustomEventManager> events;
events.resize(places.size());
// These caches will be useful to process sync/wait/communicate
places_to_events_.emplace(places_key, std::move(events));
places_to_customcomm_.emplace(places_key, std::move(ccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
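// In short (illustrative summary): rank 0 generates a single CCL root id,
// every other rank fetches it through the Store, and each place in `places`
// gets its own communicator, device context and event, all cached under
// `places_key` for later collectives on the same set of places.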
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_customcomm_.find(key) == places_to_customcomm_.end()) {
CreateCustomManagerCache(key, places);
}
}
auto& ccl_comms = places_to_customcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
task->SetOutputs(outputs);
for (size_t i = 0; i < inputs.size(); ++i) {
phi::DeviceGuard guard(places[i]);
const auto& ccl_stream = places_to_ctx_[key][i]->stream();
phi::stream::Stream stream(places[i], ccl_stream);
fn(inputs[i], outputs[i], ccl_comms[i]->GetCustomCCLComm(), stream);
}
for (size_t i = 0; i < inputs.size(); ++i) {
phi::DeviceGuard guard(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllGather(
device_type_,
input.data(),
output.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
void* XcclGetPointerByOffset(void* raw_pointer,
size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in xccl is not supported."));
}
return nullptr;
}
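// Note: the offset above is measured in elements of the given dtype, not in
// bytes; e.g. for FLOAT32 an offset of 8 advances the pointer by 32 bytes.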
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllGather(
device_type_,
XcclGetPointerByOffset(input.data(), offset, input.dtype()),
output.data(),
length,
phi::ccl::ToCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllReduce(
device_type_,
input.data(),
output.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
ToCustomCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
int root = opts.source_rank * in_tensors.size() + opts.source_root;
if (rank_ == root) {
return phi::DeviceManager::CCLBroadcast(
device_type_,
input.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
root,
comm,
stream);
} else {
return phi::DeviceManager::CCLBroadcast(
device_type_,
output.data(),
output.numel(),
phi::ccl::ToCCLDataType(output.dtype()),
root,
comm,
stream);
}
},
CommType::BROADCAST);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
const BarrierOptions& opts) {
// Only support single card single process
std::vector<phi::CustomPlace> places = {place_};
std::vector<phi::DenseTensor> barrierTensors;
barrierTensors.reserve(places.size());
for (auto& place : places) {
phi::DeviceGuard guard(place);
auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
barrierTensors.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
}
auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors);
auto xccl_task = dynamic_cast<ProcessGroupCustom::CustomTask*>(task.get());
xccl_task->barrierTensors_ = std::move(barrierTensors);
return task;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using CustomDeviceContext = paddle::platform::CustomDeviceContext;
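// ProcessGroupCustom: collective communication on plugin ("custom") devices.
// Collectives are dispatched through phi::DeviceManager's XCCL entry points,
// with communicators, events and device contexts cached per place-key.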
class ProcessGroupCustom : public ProcessGroup {
public:
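  // CustomTask tracks one collective issued on the cached device streams;
  // completion is observed through the recorded CustomEventManager events
  // (see IsCompleted/Wait below).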
class CustomTask : public ProcessGroup::Task,
public std::enable_shared_from_this<CustomTask> {
public:
CustomTask(const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~CustomTask();
std::vector<CustomEventManager> control_events_;
std::vector<phi::DenseTensor> barrierTensors_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<CustomCCLCommManager>> cclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
const std::string device_type_;
};
ProcessGroupCustom(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return "XCCL_" + device_type_;
}
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
protected:
virtual std::shared_ptr<ProcessGroupCustom::CustomTask> CreateTask(
std::vector<Place> places,
int rank,
CommType opType,
const std::vector<phi::DenseTensor>& inputs);
std::shared_ptr<Store> store_;
std::shared_ptr<CustomCCLCommManager> custom_comm_;
std::mutex mutex_;
std::unordered_map<std::string,
std::vector<std::shared_ptr<CustomCCLCommManager>>>
places_to_customcomm_;
std::unordered_map<std::string, std::vector<CustomEventManager>>
places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<CustomDeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastCustomId(std::vector<phi::ccl::CCLRootId>& ccl_ids, // NOLINT
int root,
int server_fd);
void BroadcastUniqueCustomID(
std::vector<phi::ccl::CCLRootId>& custom_ccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
void CreateCustomManagerCache(const std::string& places_key,
const std::vector<Place>& places);
const std::string device_type_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
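// GENERATE_FUNC dispatches a templated callable on a runtime DataType.
// The _WIN32 variant expands through __VA_ARGS__ and covers fewer dtypes;
// the non-Windows variant uses the GCC named variadic-macro extension
// (args...) and additionally handles INT8/UINT8/BOOL/BFLOAT16.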
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
case experimental::DataType::INT8: \
func<int8_t>(args); \
break; \
case experimental::DataType::UINT8: \
func<uint8_t>(args); \
break; \
case experimental::DataType::BOOL: \
func<bool>(args); \
break; \
case experimental::DataType::BFLOAT16: \
func<bfloat16>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
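// gloo reductions are supplied as raw function pointers; get_function() maps a
// ReduceOp onto the matching gloo elementwise kernel for element type T.
// ReduceOp::AVG has no gloo counterpart and aborts.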
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
template <typename T>
T* get_data(phi::DenseTensor& tensor) { // NOLINT
return reinterpret_cast<T*>(tensor.data());
}
template <typename T>
std::vector<T*> get_multi_data(
std::vector<phi::DenseTensor>& tensors) { // NOLINT
std::vector<T*> ret;
ret.reserve(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret.push_back(get_data<T>(tensors[i]));
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, phi::DenseTensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, phi::DenseTensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, // NOLINT
std::vector<phi::DenseTensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, // NOLINT
std::vector<phi::DenseTensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
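// For scatter, the root's input tensor is treated as `nranks` equally sized
// contiguous chunks; each chunk is registered as one input pointer for gloo.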
template <typename T, typename P>
void set_inputs_for_scatter(P& opts, // NOLINT
phi::DenseTensor& tensor, // NOLINT
int nranks) {
std::vector<T*> ret;
ret.reserve(nranks);
T* raw_pointer = reinterpret_cast<T*>(tensor.data());
size_t offset = 0;
for (int i = 0; i < nranks; i++) {
ret.push_back(raw_pointer + offset);
offset += tensor.numel() / nranks;
}
opts.setInputs(ret, tensor.numel() / nranks);
}
ProcessGroupGloo::GlooTask::GlooTask(
int rank, const std::vector<phi::DenseTensor>& inputs, CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {}
ProcessGroupGloo::ProcessGroupGloo(
const std::shared_ptr<distributed::Store>& store,
int rank,
int world_size,
const platform::Place& place,
int gid,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size, place, gid),
_tag(0),
_store(new GlooStore(store)) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
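// Each collective below is wrapped in a GlooTask subclass: Run() builds the
// gloo options object, binds inputs/outputs for the runtime dtype via
// GENERATE_FUNC, and executes the gloo collective synchronously.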
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int rank,
int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_outputs(outputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0], _outputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<phi::DenseTensor> _inputs{};
std::vector<phi::DenseTensor> _outputs{};
const uint32_t _tag;
void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) { // NOLINT
gloo::BroadcastOptions opts(_context);
const auto& dtype = in.dtype();
if (rank_ == _root) {
GENERATE_FUNC(dtype, set_input, opts, in);
}
GENERATE_FUNC(dtype, set_output, opts, out);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(
context, inputs, outputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
ReduceOp reduce_op,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_outputs(outputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs, _outputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<phi::DenseTensor>& ins, // NOLINT
std::vector<phi::DenseTensor>& outs) { // NOLINT
const auto& dtype = ins[0].dtype();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, ins);
GENERATE_FUNC(dtype, set_outputs, opts, outs);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts) {
return AllReduce(inputs, outputs, opts, true);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts,
bool sync_op) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(
rank_, context, inputs, outputs, opts.reduce_op, tag);
task->Run();
return task;
}
class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
public:
BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
: ProcessGroupGloo::GlooTask(
rank, std::vector<phi::DenseTensor>{}, CommType::BARRIER),
_context(context) {}
void Run() override { _do_barrier(); }
private:
std::shared_ptr<gloo::Context> _context;
void _do_barrier() {
gloo::BarrierOptions opts(_context);
gloo::barrier(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
const BarrierOptions& opts) {
std::shared_ptr<BarrierGlooTask> task;
auto context = get_context();
task = std::make_shared<BarrierGlooTask>(rank_, context);
task->Run();
return task;
}
class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllgatherGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
_context(context),
_inputs(inputs),
_outputs(outputs),
_tag(tag) {}
void Run() override { _do_allgather(_inputs, _outputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
uint32_t _tag;
void _do_allgather(std::vector<phi::DenseTensor>& in, // NOLINT
std::vector<phi::DenseTensor>& out) { // NOLINT
const auto& dtype = in[0].dtype();
gloo::AllgatherOptions opts(_context);
GENERATE_FUNC(dtype, set_input, opts, in[0]);
GENERATE_FUNC(dtype, set_output, opts, out[0]);
opts.setTag(_tag);
gloo::allgather(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
std::shared_ptr<AllgatherGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<AllgatherGlooTask>(
rank_, context, in_tensors, out_tensors, tag);
task->Run();
return task;
}
class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
ReduceGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
ReduceOp reduce_op,
int dst,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE),
_context(context),
_inputs(inputs),
_outputs(outputs),
_reduce_op(reduce_op),
_dst(dst),
_tag(tag) {}
void Run() override { _do_reduce(_inputs, _outputs, _dst); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
const ReduceOp _reduce_op;
int _dst;
uint32_t _tag;
gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::ReduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_reduce(std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int dst) {
const auto& dtype = inputs[0].dtype();
gloo::ReduceOptions opts(_context);
GENERATE_FUNC(dtype, set_input, opts, inputs[0]);
GENERATE_FUNC(dtype, set_output, opts, outputs[0]);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
opts.setRoot(dst);
gloo::reduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const ReduceOptions& opts) {
std::shared_ptr<ReduceGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<ReduceGlooTask>(
rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
task->Run();
return task;
}
class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
public:
ScatterGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int src,
int size,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
_context(context),
_inputs(inputs),
_outputs(outputs),
_src(src),
_size(size),
_tag(tag) {}
void Run() override { _do_scatter(_inputs, _outputs, _src); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
int _src;
int _size;
uint32_t _tag;
void _do_scatter(std::vector<phi::DenseTensor>& in, // NOLINT
std::vector<phi::DenseTensor>& out, // NOLINT
int src) {
const auto& dtype = in[0].dtype();
gloo::ScatterOptions opts(_context);
if (rank_ == src) {
GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size);
}
GENERATE_FUNC(dtype, set_output, opts, out[0]);
opts.setRoot(src);
opts.setTag(_tag);
gloo::scatter(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
std::shared_ptr<ScatterGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<ScatterGlooTask>(
rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
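// Best-effort default transport: resolve the local hostname, probe each
// returned address with bind(), and fall back to the loopback address if none
// of them is usable.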
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(
ret,
0,
platform::errors::Fatal("Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
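// CPU-side process group backed by gloo. Collectives run synchronously on the
// calling thread, so GlooTask::Wait()/IsCompleted() trivially report success.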
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank,
const std::vector<phi::DenseTensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(const std::shared_ptr<paddle::distributed::Store>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// wait(keys);
}
protected:
std::shared_ptr<paddle::distributed::Store> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
explicit ProcessGroupGloo(
const std::shared_ptr<paddle::distributed::Store>& store,
int rank,
int world_size,
const platform::Place& place,
int gid,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts,
bool sync_op) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
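  // A minimal construction sketch (hypothetical variable names; assumes a
  // paddle::distributed::Store such as a TCPStore already exists), mirroring
  // how ProcessGroupHeter builds its inter-cluster group:
  //   auto opts = ProcessGroupGloo::GlooOptions::create();
  //   opts->device = ProcessGroupGloo::createDefaultDevice();
  //   auto pg = std::make_shared<ProcessGroupGloo>(
  //       store, rank, world_size, platform::CPUPlace(), gid, opts);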
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<::gloo::rendezvous::Store> _store;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(hccl_blocking_wait);
// DECLARE_bool(use_stream_safe_npu_allocator);
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
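// For every place, record an event on the cached communication context and
// make the framework's default device context wait on it, ordering the two
// streams before a collective is launched.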
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<NPUEventManager>& hcclEvents, // NOLINT
std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
hcclEvents[i].Record(*dev_ctx[i]);
hcclEvents[i].Block(*default_ctx);
}
}
std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupHCCL::HCCLTask>(
places, rank, comm_type, inputs);
}
ProcessGroupHCCL::HCCLTask::HCCLTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
hcclComms_.resize(places.size());
}
ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
void ProcessGroupHCCL::HCCLTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
platform::NPUStreamWaitEvent(default_ctx->stream(),
control_events_[i].GetRawNPUEvent());
}
}
bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
// TODO(sandyhouse): Add timeout for wait, now timeout unused
bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
// NOTE(sandyhouse): It will block host for sync
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
}
return true;
}
// Same as Wait
void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid), store_(store) {
platform::SetNPUDeviceId(place_.device);
}
void ProcessGroupHCCL::BroadcastUniqueHCCLID(
std::vector<HcclRootInfo>& hccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto hccl_id = std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&hccl_ids[i]),
reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
store_->set(key, hccl_id);
}
} else {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto ret = store_->get(key);
std::memcpy(&hccl_ids[i], ret.data(), ret.size());
}
}
}
// create HCCLManager cache for places_key
void ProcessGroupHCCL::CreateHCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
platform::errors::PreconditionNotMet(
"Not able to create/get the HCCL Communicator since "
"the NPU place are not known"));
std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
hccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
auto& hccl_id = hccl_ids.front();
if (rank_ == 0) {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
}
BroadcastUniqueHCCLID(hccl_ids);
VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
for (size_t i = 0; i < places.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
hccl_comms[i] = HCCLCommManager::Create(
GetSize(), GetRank(), &hccl_id, comms.get() + i);
dev_ctx[i].reset(new NPUDeviceContext(places[i]));
}
std::vector<NPUEventManager> events;
events.resize(places.size());
// These caches will be useful to process sync/wait/communicate
places_to_events_.emplace(places_key, std::move(events));
places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
CreateHCCLManagerCache(key, places);
}
}
auto& hccl_comms = places_to_hcclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
const auto& hccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
}
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& opts) {
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
HcclComm comm,
const aclrtStream& stream) {
return platform::dynload::HcclAllReduce(
input.data(),
output.data(),
input.numel(),
platform::ToHCCLDataType(input.dtype()),
ToHCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts) {
// PADDLE_ENFORCE_EQ(
// CheckTensorsInNPUPlace(tensors), true,
// platform::errors::InvalidArgument("All inputs should be in
// CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
HcclComm comm,
const aclrtStream& stream) {
int root = opts.source_rank * in_tensors.size() + opts.source_root;
if (rank_ == root) {
return platform::dynload::HcclBroadcast(
input.data(),
input.numel(),
platform::ToHCCLDataType(input.dtype()),
root,
comm,
stream);
} else {
return platform::dynload::HcclBroadcast(
output.data(),
output.numel(),
platform::ToHCCLDataType(output.dtype()),
root,
comm,
stream);
}
},
CommType::BROADCAST);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
constexpr const char* HCCL_BACKEND_NAME = "HCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using NPUStream = platform::stream::NPUStream;
using NPUDeviceContext = paddle::platform::NPUDeviceContext;
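// Process group for Ascend NPU devices built on HCCL. Communicators, events
// and device contexts are cached per place-key and reused across collectives.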
class ProcessGroupHCCL : public ProcessGroup {
public:
class HCCLTask : public ProcessGroup::Task,
public std::enable_shared_from_this<HCCLTask> {
public:
HCCLTask(const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~HCCLTask();
std::vector<NPUEventManager> control_events_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
};
ProcessGroupHCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return std::string(HCCL_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
protected:
virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
std::vector<Place> places,
int rank,
CommType opType,
const std::vector<phi::DenseTensor>& inputs);
std::shared_ptr<Store> store_;
std::shared_ptr<HCCLCommManager> hccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
places_to_hcclcomm_;
std::unordered_map<std::string, std::vector<NPUEventManager>>
places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<NPUDeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
void CreateHCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
#include <chrono>
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
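// Monotonic counters used to build unique tensor names for point-to-point
// transfers routed through the switch endpoint (see Send/Recv below).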
int ProcessGroupHeter::send_count = 0;
int ProcessGroupHeter::recv_count = 0;
std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupHeter::HeterTask>(
rank, comm_type, inputs);
}
ProcessGroupHeter::HeterTask::HeterTask(
int rank, CommType CommType, const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType) {}
ProcessGroupHeter::HeterTask::~HeterTask() {}
bool ProcessGroupHeter::HeterTask::IsCompleted() { return true; }
// TODO(sheniang03): Add timeout for wait, now timeout unused
bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
return true;
}
ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid,
int local_rank,
int local_size,
int gloo_rank,
int gloo_size,
bool with_switch,
std::string switch_endpoint,
int src_rank,
int dst_rank)
: ProcessGroup(rank, size, place, gid),
store_(store),
local_rank_(local_rank),
local_size_(local_size),
gloo_rank_(gloo_rank),
gloo_size_(gloo_size),
with_switch_(with_switch),
switch_endpoint_(switch_endpoint),
src_rank_(src_rank),
dst_rank_(dst_rank) {
return;
#ifdef PADDLE_WITH_CUSTOM
if (paddle::platform::is_custom_place(place_)) {
inner_pg_ = std::make_shared<ProcessGroupCustom>(
store, local_rank, local_size, place_, IGNORE_ID);
} else {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
inner_pg_ = std::make_shared<ProcessGroupNCCL>(
store, local_rank, local_size, place_, IGNORE_ID);
#elif defined(PADDLE_WITH_ASCEND_CL)
inner_pg_ = std::make_shared<ProcessGroupHCCL>(
store, local_rank, local_size, place_, IGNORE_ID);
#else
PADDLE_THROW(platform::errors::Unavailable(
"ProcessGroupHeter only supports NCCL, RCCL and HCCL now."));
#endif
#ifdef PADDLE_WITH_CUSTOM
}
#endif
if (local_rank_ == 0 && !with_switch_) {
auto opts = ProcessGroupGloo::GlooOptions::create();
opts->device = ProcessGroupGloo::createDefaultDevice();
inter_pg_ = std::make_shared<ProcessGroupGloo>(
store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts);
}
}
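// Element-wise in-place accumulation: adds `size` values from src into dst.
// Used to merge the locally reduced tensor with the tensor received from the
// remote cluster via the switch.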
template <typename T>
static void _do_add(T* dst, T* src, size_t size) {
for (size_t i = 0; i < size; i++) {
*dst += *src;
dst++;
src++;
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif
// Step1: do allreduce in inner cluster
auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts);
task->Wait();
// Step2: copy tensors to CPU
if (local_rank_ == 0) {
std::vector<phi::DenseTensor> cpu_tensors;
cpu_tensors.reserve(in_tensors.size());
phi::DenseTensor cpu_tensor;
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = in_tensors[i];
cpu_tensor.Resize(gpu_tensor.dims());
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensors.push_back(cpu_tensor);
}
// Step3: do inter cluster allreduce
if (with_switch_) {
if (local_rank_ == 0) {
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto dense_cpu_tensor = cpu_tensors[0];
std::vector<int64_t> send_size;
send_size.push_back(dense_cpu_tensor.numel());
int ret = client_->Send(
gid_,
{dense_cpu_tensor.name()},
send_size,
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
phi::DenseTensor cpu_tensor2;
cpu_tensor2.AllocateFrom(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
dense_cpu_tensor.dtype(),
dense_cpu_tensor.numel());
ret = client_->Recv(
gid_,
{dense_cpu_tensor.name()},
cpu_tensor2.data(),
cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Recv from the switch module error."));
switch (dense_cpu_tensor.dtype()) {
case DataType::FLOAT32:
_do_add<float>(reinterpret_cast<float*>(dense_cpu_tensor.data()),
reinterpret_cast<float*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
case DataType::FLOAT64:
_do_add<double>(reinterpret_cast<double*>(dense_cpu_tensor.data()),
reinterpret_cast<double*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
case DataType::INT32:
_do_add<int>(reinterpret_cast<int*>(dense_cpu_tensor.data()),
reinterpret_cast<int*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
default:
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Unsupported data type (%s) to do add.",
framework::DataType2String(dense_cpu_tensor.dtype())));
}
}
} else {
auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts);
gloo_task->Wait();
}
  // Step4: copy the reduced cpu tensors back to gpu
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = out_tensors[i];
auto cpu_tensor = cpu_tensors[i];
    // Copy onto the device place so the result actually lands in out_tensors[i].
    framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
}
}
// Step5: broadcast among inner cluster
auto b_opts = BroadcastOptions();
b_opts.source_rank = 0;
auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
broadcast_task->Wait();
return CreateTask(rank_, CommType::ALLREDUCE, in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif
// Step1: do broadcast in inner cluster
auto b_opts = BroadcastOptions();
b_opts.source_rank = 0;
inner_pg_->Broadcast(in_tensors, out_tensors, b_opts);
if (local_rank_ == 0) {
std::vector<phi::DenseTensor> cpu_tensors;
cpu_tensors.reserve(in_tensors.size());
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = in_tensors[i];
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(gpu_tensor.dims());
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensors.push_back(cpu_tensor);
}
if (with_switch_) {
if (local_rank_ == 0) {
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto dense_cpu_tensor = cpu_tensors[0];
if (gloo_rank_ == 0) {
std::vector<int64_t> send_size;
send_size.push_back(dense_cpu_tensor.numel());
int ret = client_->Send(
gid_,
{dense_cpu_tensor.name()},
send_size,
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
} else {
int ret = client_->Recv(
gid_,
{dense_cpu_tensor.name()},
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Receive from the switch module error."));
}
}
} else {
auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts);
gloo_task->Wait();
}
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = out_tensors[i];
auto cpu_tensor = cpu_tensors[i];
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
}
}
auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
broadcast_task->Wait();
return CreateTask(rank_, CommType::BROADCAST, in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) {
PADDLE_ENFORCE_EQ(
in_tensors.size(),
1,
platform::errors::PreconditionNotMet(
"For each send operation, there can only be one tensor to send."));
// Copy Tensor to cpu
auto start = std::chrono::high_resolution_clock::now();
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = in_tensors[0];
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
PADDLE_ENFORCE_EQ(with_switch_,
true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from gpu to cpu for send " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;
// Send to switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
int64_t tensor_size =
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
std::vector<int64_t> send_size;
send_size.push_back(tensor_size);
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(send_count++);
VLOG(2) << "tensor_name:" << tensor_name;
int ret = client_->Send(
gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size);
PADDLE_ENFORCE_EQ(
ret,
0,
platform::errors::PreconditionNotMet("Send to the switch module error."));
return CreateTask(rank_, CommType::SEND, in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) {
PADDLE_ENFORCE_EQ(
out_tensors.size(),
1,
platform::errors::PreconditionNotMet(
"For each rece operation, there can only be one tensor to receive."));
// Copy Tensor to cpu
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = out_tensors[0];
cpu_tensor.Resize(gpu_tensor.dims());
cpu_tensor.set_layout(gpu_tensor.layout());
cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());
PADDLE_ENFORCE_EQ(with_switch_,
true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
// recv from switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(recv_count++);
VLOG(2) << "tensor_name: " << tensor_name;
auto start = std::chrono::high_resolution_clock::now();
int ret = client_->Recv(
gid_,
{tensor_name},
cpu_tensor.data(),
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"receive to the switch module error."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
double goodput = cpu_tensor.numel() *
framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl;
start = std::chrono::high_resolution_clock::now();
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
end = std::chrono::high_resolution_clock::now();
diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from cpu to gpu for recv " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;
return CreateTask(rank_, CommType::RECV, out_tensors);
}
} // namespace distributed
} // namespace paddle