OpenDAS / Paddle · Commits

Commit f0ef3442, authored Apr 26, 2023 by yuguo960516yuguo

    2.3.2-dtk-22.10.1

Parent: ad08b8ce
Pipeline #227 failed in 0 seconds
Changes: 274 · Pipelines: 1

Showing 20 changed files with 3010 additions and 0 deletions (+3010 -0)
paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc     +150  -0
paddle/fluid/distributed/auto_parallel/test/dist_mapper_test.cc   +72   -0
paddle/fluid/distributed/auto_parallel/test/process_mesh_test.cc  +53   -0
paddle/fluid/distributed/auto_parallel/utils.h                    +114  -0
paddle/fluid/distributed/collective/CMakeLists.txt                +96   -0
paddle/fluid/distributed/collective/Common.cc                     +60   -0
paddle/fluid/distributed/collective/Common.h                      +35   -0
paddle/fluid/distributed/collective/CustomCCLTools.cc             +47   -0
paddle/fluid/distributed/collective/CustomCCLTools.h              +198  -0
paddle/fluid/distributed/collective/HCCLTools.cc                  +48   -0
paddle/fluid/distributed/collective/HCCLTools.h                   +184  -0
paddle/fluid/distributed/collective/MPITools.cc                   +56   -0
paddle/fluid/distributed/collective/MPITools.h                    +53   -0
paddle/fluid/distributed/collective/NCCLTools.cc                  +48   -0
paddle/fluid/distributed/collective/NCCLTools.h                   +264  -0
paddle/fluid/distributed/collective/ProcessGroup.cc               +64   -0
paddle/fluid/distributed/collective/ProcessGroup.h                +370  -0
paddle/fluid/distributed/collective/ProcessGroupCustom.cc         +400  -0
paddle/fluid/distributed/collective/ProcessGroupCustom.h          +139  -0
paddle/fluid/distributed/collective/ProcessGroupGloo.cc           +559  -0
Too many changes to show. To preserve performance only 274 of 274+ files are displayed.
paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <sstream>

#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/auto_parallel/dist_attr.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

TEST(DistAttr, ctor) {
  ProgramDesc program;
  auto* global_block = program.MutableBlock(0);

  auto* x = global_block->Var("X");
  x->SetType(framework::proto::VarType::LOD_TENSOR);
  x->SetLoDLevel(0);
  x->SetDataType(framework::proto::VarType::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
  y->SetType(framework::proto::VarType::LOD_TENSOR);
  y->SetLoDLevel(0);
  y->SetDataType(framework::proto::VarType::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
  op->SetType("mul");
  op->SetInput("X", {x->Name()});
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
  out->SetType(framework::proto::VarType::LOD_TENSOR);
  out->SetShape({1000, 100});
  op->SetOutput("Out", {out->Name()});

  std::vector<int64_t> shape = {2, 4};
  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<std::string> dim_names = {"x", "y"};
  ProcessMesh process_mesh(shape, process_ids, dim_names);

  std::vector<int64_t> shape2 = {2, 2};
  std::vector<int64_t> process_ids2 = {0, 1, 2, 3};
  std::vector<std::string> dim_names2 = {"a", "b"};
  ProcessMesh process_mesh2(shape2, process_ids2, dim_names2);

  TensorDistAttr x_dist_attr(*x), y_dist_attr(*y), out_dist_attr(*out);

  x_dist_attr.set_process_mesh(process_mesh);
  x_dist_attr.set_dims_mapping(std::vector<int64_t>({0, -1}));
  x_dist_attr.set_batch_dim(0);
  x_dist_attr.set_dynamic_dims(std::vector<bool>({true, false}));
  x_dist_attr.annotate("process_mesh");
  x_dist_attr.annotate("dims_mapping");
  EXPECT_EQ(x_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(x_dist_attr.dims_mapping(), std::vector<int64_t>({0, -1}));
  EXPECT_EQ(x_dist_attr.batch_dim(), 0);
  EXPECT_EQ(x_dist_attr.dynamic_dims(), std::vector<bool>({true, false}));
  EXPECT_EQ(x_dist_attr.is_annotated("process_mesh"), true);
  EXPECT_EQ(x_dist_attr.is_annotated("dims_mapping"), true);
  EXPECT_EQ(x_dist_attr.verify(), true);

  std::stringstream x_sstream;
  x_sstream << x_dist_attr;
  EXPECT_EQ(x_sstream.str(), x_dist_attr.to_string());
  auto x_proto = x_dist_attr.to_proto();
  TensorDistAttr new_x_dist_attr = TensorDistAttr::from_proto(x_proto);
  EXPECT_EQ(x_dist_attr, new_x_dist_attr);
  // new_x_dist_attr is not valid since it does not bind to a var_desc
  EXPECT_EQ(new_x_dist_attr.verify(), false);

  y_dist_attr.set_process_mesh(process_mesh);
  y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, 0}));
  y_dist_attr.set_batch_dim(-1);
  y_dist_attr.set_dynamic_dims(std::vector<bool>({false, true}));
  x_dist_attr.annotate("batch_dim");
  x_dist_attr.annotate("dynamic_dims");
  EXPECT_EQ(y_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(y_dist_attr.dims_mapping(), std::vector<int64_t>({-1, 0}));
  EXPECT_EQ(y_dist_attr.batch_dim(), 1);
  EXPECT_EQ(y_dist_attr.dynamic_dims(), std::vector<bool>({false, true}));
  EXPECT_EQ(x_dist_attr.is_annotated("batch_dim"), true);
  EXPECT_EQ(x_dist_attr.is_annotated("dynamic_dims"), true);
  EXPECT_EQ(x_dist_attr.verify(), true);

  out_dist_attr.set_process_mesh(process_mesh);
  out_dist_attr.set_dims_mapping(std::vector<int64_t>({0, 1}));
  out_dist_attr.set_batch_dim(1);
  out_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
  EXPECT_EQ(out_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(out_dist_attr.dims_mapping(), std::vector<int64_t>({0, 1}));
  EXPECT_EQ(out_dist_attr.batch_dim(), 1);
  EXPECT_EQ(out_dist_attr.dynamic_dims(), std::vector<bool>({false, false}));
  EXPECT_EQ(out_dist_attr.verify(), true);

  OperatorDistAttr mul_dist_attr(*op);
  mul_dist_attr.set_input_dist_attr(x->Name(), x_dist_attr);
  mul_dist_attr.set_input_dist_attr(y->Name(), y_dist_attr);
  mul_dist_attr.set_output_dist_attr(out->Name(), out_dist_attr);
  mul_dist_attr.set_process_mesh(process_mesh2);
  mul_dist_attr.set_impl_type("dist_mul");
  mul_dist_attr.set_impl_idx(0);
  mul_dist_attr.annotate("process_mesh");
  mul_dist_attr.annotate("impl_type");
  mul_dist_attr.annotate("impl_idx");
  EXPECT_NE(mul_dist_attr.input_dist_attr(x->Name()), x_dist_attr);
  EXPECT_NE(mul_dist_attr.input_dist_attr(y->Name()), y_dist_attr);
  EXPECT_NE(mul_dist_attr.output_dist_attr(out->Name()), out_dist_attr);
  EXPECT_EQ(mul_dist_attr.process_mesh(), process_mesh2);
  EXPECT_EQ(mul_dist_attr.input_dist_attr(x->Name()).process_mesh(),
            process_mesh2);
  EXPECT_EQ(mul_dist_attr.input_dist_attr(y->Name()).process_mesh(),
            process_mesh2);
  EXPECT_EQ(mul_dist_attr.impl_type(), "dist_mul");
  EXPECT_EQ(mul_dist_attr.impl_idx(), 0);
  EXPECT_EQ(mul_dist_attr.is_annotated("process_mesh"), true);
  EXPECT_EQ(mul_dist_attr.is_annotated("impl_type"), true);
  EXPECT_EQ(mul_dist_attr.is_annotated("impl_idx"), true);
  EXPECT_EQ(mul_dist_attr.verify(), true);

  std::stringstream mul_sstream;
  mul_sstream << mul_dist_attr;
  EXPECT_EQ(mul_sstream.str(), mul_dist_attr.to_string());
  auto mul_proto = mul_dist_attr.to_proto();
  OperatorDistAttr new_mul_dist_attr = OperatorDistAttr::from_proto(mul_proto);
  EXPECT_EQ(mul_dist_attr, new_mul_dist_attr);
  // new_mul_dist_attr is not valid since it does not bind to an op_desc
  EXPECT_EQ(new_mul_dist_attr.verify(), false);
}

}  // namespace auto_parallel
}  // namespace distributed
}  // namespace paddle
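For context on the dims_mapping values exercised above: by the usual auto-parallel convention, entry i names the process-mesh axis that shards tensor axis i, and -1 leaves that axis replicated. A minimal standalone sketch (not part of this commit) of the resulting local shape, assuming even splitting:

#include <cassert>
#include <cstdint>
#include <vector>

// Local shape of a tensor under a dims_mapping, assuming the usual
// convention: dims_mapping[i] == -1 keeps axis i replicated, otherwise
// axis i is split evenly across mesh axis dims_mapping[i].
std::vector<int64_t> LocalShape(const std::vector<int64_t>& global_shape,
                                const std::vector<int64_t>& dims_mapping,
                                const std::vector<int64_t>& mesh_shape) {
  std::vector<int64_t> local = global_shape;
  for (size_t i = 0; i < local.size(); ++i) {
    if (dims_mapping[i] >= 0) local[i] /= mesh_shape[dims_mapping[i]];
  }
  return local;
}

int main() {
  // X is [1000, 784] with dims_mapping {0, -1} on a 2x4 mesh: rows split
  // across mesh axis "x" (size 2), columns stay replicated.
  auto local = LocalShape({1000, 784}, {0, -1}, {2, 4});
  assert(local[0] == 500 && local[1] == 784);
  return 0;
}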
paddle/fluid/distributed/auto_parallel/test/dist_mapper_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
#include <map>
#include <sstream>
#include "gtest/gtest.h"
namespace
paddle
{
namespace
distributed
{
namespace
auto_parallel
{
TEST
(
DistributedMapper
,
Ctor
)
{
std
::
vector
<
int64_t
>
shape
=
{
2
,
3
};
std
::
vector
<
int64_t
>
device_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
};
std
::
vector
<
std
::
string
>
dim_names
=
{
"x"
,
"y"
};
std
::
string
device_type
=
"GPU"
;
int64_t
size
=
shape
[
0
]
*
shape
[
1
];
DeviceMesh
device_mesh
(
"device_mesh"
,
shape
,
device_ids
,
dim_names
);
for
(
int64_t
i
=
0
;
i
<
shape
[
0
];
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
shape
[
1
];
++
j
)
{
int64_t
global_id
=
i
*
shape
[
1
]
+
j
;
int64_t
local_id
=
j
;
int64_t
machine_id
=
i
;
device_mesh
.
add_device
(
Device
(
global_id
,
local_id
,
machine_id
,
device_type
));
}
}
for
(
int64_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
size
;
++
j
)
{
device_mesh
.
add_link
(
Link
(
i
,
j
,
"NVL"
));
}
}
DistributedMapper
dist_mapper
;
dist_mapper
.
add_device_mesh
(
device_mesh
);
std
::
map
<
int64_t
,
std
::
pair
<
std
::
string
,
std
::
vector
<
int64_t
>>>
process_id_to_device_ids
;
process_id_to_device_ids
[
0
]
=
{
"device_mesh"
,
{
5
}};
process_id_to_device_ids
[
1
]
=
{
"device_mesh"
,
{
4
}};
process_id_to_device_ids
[
2
]
=
{
"device_mesh"
,
{
3
}};
process_id_to_device_ids
[
3
]
=
{
"device_mesh"
,
{
2
}};
process_id_to_device_ids
[
4
]
=
{
"device_mesh"
,
{
1
}};
process_id_to_device_ids
[
5
]
=
{
"device_mesh"
,
{
0
}};
dist_mapper
.
set_process_id_to_device_ids
(
process_id_to_device_ids
);
EXPECT_EQ
(
dist_mapper
.
device_meshes
().
at
(
"device_mesh"
),
device_mesh
);
EXPECT_EQ
(
dist_mapper
.
device_mesh
(
"device_mesh"
),
device_mesh
);
EXPECT_EQ
(
dist_mapper
.
process_id_to_device_ids
(),
process_id_to_device_ids
);
std
::
stringstream
sstream
;
sstream
<<
dist_mapper
;
EXPECT_EQ
(
sstream
.
str
(),
dist_mapper
.
to_string
());
auto
proto
=
dist_mapper
.
to_proto
();
DistributedMapper
new_dist_mapper
=
DistributedMapper
::
from_proto
(
proto
);
EXPECT_EQ
(
dist_mapper
,
new_dist_mapper
);
}
}
// namespace auto_parallel
}
// namespace distributed
}
// namespace paddle
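The population loops above lay devices out row-major over the 2x3 mesh: global_id = i * ncols + j, with machine_id = i and local_id = j. A standalone sketch (not part of this commit) of the inverse decomposition:

#include <cassert>
#include <cstdint>

// Inverse of the row-major device layout: a global id g decomposes back
// into machine_id = g / ncols and local_id = g % ncols, and the round
// trip reproduces g.
int main() {
  const int64_t ncols = 3;
  for (int64_t g = 0; g < 6; ++g) {
    int64_t machine_id = g / ncols;
    int64_t local_id = g % ncols;
    assert(machine_id * ncols + local_id == g);
  }
  return 0;
}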
paddle/fluid/distributed/auto_parallel/test/process_mesh_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
#include <iostream>
#include <sstream>
#include "gtest/gtest.h"
namespace
paddle
{
namespace
distributed
{
namespace
auto_parallel
{
TEST
(
ProcessMesh
,
Ctor
)
{
std
::
vector
<
int64_t
>
shape
=
{
2
,
3
};
std
::
vector
<
int64_t
>
process_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
};
std
::
vector
<
std
::
string
>
dim_names
=
{
"x"
,
"y"
};
int64_t
size
=
shape
[
0
]
*
shape
[
1
];
ProcessMesh
process_mesh
(
shape
,
process_ids
,
dim_names
);
EXPECT_EQ
(
process_mesh
.
shape
(),
shape
);
EXPECT_EQ
(
process_mesh
.
process_ids
(),
process_ids
);
EXPECT_EQ
(
process_mesh
.
dim_names
()[
0
],
"x"
);
EXPECT_EQ
(
process_mesh
.
dim_names
()[
1
],
"y"
);
EXPECT_EQ
(
process_mesh
.
size
(),
size
);
EXPECT_EQ
(
process_mesh
.
ndim
(),
static_cast
<
int64_t
>
(
shape
.
size
()));
EXPECT_EQ
(
process_mesh
.
dim_size
(
0
),
shape
[
0
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
-
1
),
shape
[
1
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
"x"
),
shape
[
0
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
"y"
),
shape
[
1
]);
EXPECT_EQ
(
process_mesh
.
empty
(),
false
);
EXPECT_EQ
(
process_mesh
.
contains
(
0
),
true
);
EXPECT_EQ
(
process_mesh
.
contains
(
6
),
false
);
std
::
stringstream
sstream
;
sstream
<<
process_mesh
;
EXPECT_EQ
(
sstream
.
str
(),
process_mesh
.
to_string
());
auto
proto
=
process_mesh
.
to_proto
();
ProcessMesh
new_process_mesh
=
ProcessMesh
::
from_proto
(
proto
);
EXPECT_EQ
(
process_mesh
,
new_process_mesh
);
}
}
// namespace auto_parallel
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/auto_parallel/utils.h (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <algorithm>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

// struct Indent {
//   Indent(int& level) : level(level) { ++level; }
//   ~Indent() { --level; }
//   int& level;
// };

// inline std::string str_indent(std::string& str, cur_indent) {
//   string spaces(cur_indent, " ");
//   return str + std::string(cur_indent, " ");
// }

template <class T>
bool has_duplicates(const std::vector<T>& vec) {
  std::unordered_map<T, int> map;
  for (const auto& i : vec) {
    ++map[i];
    if (map[i] > 1) return true;
  }
  return false;
}

inline int64_t canonical_dim(int dim, int ndim) {
  PADDLE_ENFORCE_EQ(
      dim >= -ndim && dim < ndim,
      true,
      platform::errors::InvalidArgument(
          "Dimension %d is outside of [-%d, %d).", dim, ndim, ndim));
  if (dim < 0) {
    return dim + ndim;
  }
  return dim;
}

// Refer to https://stackoverflow.com/a/5289170
template <typename Range, typename Value = typename Range::value_type>
std::string str_join(Range const& elements,
                     const std::string& delimiter = ",") {
  std::ostringstream os;
  auto b = std::begin(elements), e = std::end(elements);
  if (b != e) {
    std::copy(b, prev(e), std::ostream_iterator<Value>(os, delimiter.c_str()));
    b = prev(e);
  }
  if (b != e) {
    os << *b;
  }
  return os.str();
}

inline std::string str_join(std::map<std::string, bool> const& elements,
                            const std::string& delimiter = ",") {
  std::string str;
  for (const auto& item : elements) {
    str += item.first + ": " + std::to_string(item.second) + ",";
  }
  return str.substr(0, str.size() - 2);
}

// Refer to https://stackoverflow.com/a/46931770
inline std::vector<std::string> str_split(std::string const& input,
                                          const std::string& delimiter = ",") {
  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
  std::string token;
  std::vector<std::string> output;
  while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) {
    token = input.substr(pos_start, pos_end - pos_start);
    pos_start = pos_end + delim_len;
    output.push_back(token);
  }
  output.push_back(input.substr(pos_start));
  return output;
}

// Refer to https://stackoverflow.com/a/29200671/2358969
template <typename T>
std::string to_string_with_precision(const T a_value, const int n = 2) {
  std::ostringstream out;
  out.precision(n);
  out << std::fixed << a_value;
  return out.str();
}

}  // namespace auto_parallel
}  // namespace distributed
}  // namespace paddle
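A short usage sketch for these helpers (not part of this commit; it assumes the header is compiled inside a Paddle build, since it pulls in platform/enforce.h):

#include <cassert>
#include "paddle/fluid/distributed/auto_parallel/utils.h"

int main() {
  using namespace paddle::distributed::auto_parallel;

  // str_join with the default "," delimiter.
  std::vector<int64_t> dims = {2, 4, 8};
  assert(str_join(dims) == "2,4,8");

  // str_split is the rough inverse.
  auto parts = str_split("a,b,c");
  assert(parts.size() == 3 && parts[0] == "a" && parts[2] == "c");

  // Negative dims canonicalize by adding ndim (cf. dim_size(-1) in the
  // ProcessMesh test above).
  assert(!has_duplicates(dims));
  assert(canonical_dim(-1, 3) == 2);

  // std::fixed with precision 3 keeps three decimals, rounding.
  assert(to_string_with_precision(3.14159, 3) == "3.142");
  return 0;
}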
paddle/fluid/distributed/collective/CMakeLists.txt (new file, mode 100644)
cc_library(
  processgroup
  SRCS ProcessGroup.cc
  DEPS dense_tensor)
cc_library(
  processgroup_stream
  SRCS ProcessGroupStream.cc
  DEPS dense_tensor)
cc_library(
  eager_reducer
  SRCS reducer.cc
  DEPS eager_api processgroup processgroup_stream phi_api string_helper)

if(WITH_DISTRIBUTE)
  cc_library(
    processgroup_gloo
    SRCS ProcessGroupGloo.cc
    DEPS phi_api eager_api gloo_wrapper)
endif()

if(WITH_NCCL OR WITH_RCCL)
  cc_library(
    processgroup_nccl
    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
    DEPS processgroup
         processgroup_stream
         place
         enforce
         collective_helper
         device_context
         dense_tensor)
  if(WITH_DISTRIBUTE AND WITH_PSCORE)
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
      set_source_files_properties(ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
                                  ${DISTRIBUTE_COMPILE_FLAGS})
    endif()
    cc_library(
      processgroup_heter
      SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
      DEPS place enforce collective_helper device_context phi_api eager_api)
  endif()
endif()

if(WITH_MPI)
  cc_library(
    processgroup_mpi
    SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
    DEPS collective_helper device_context)
endif()

if(WITH_ASCEND_CL)
  cc_library(
    processgroup_hccl
    SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
    DEPS place
         npu_stream
         enforce
         collective_helper
         device_context
         phi_api
         eager_api)
  if(WITH_DISTRIBUTE AND WITH_PSCORE)
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
      set_source_files_properties(ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
                                  ${DISTRIBUTE_COMPILE_FLAGS})
    endif()
    cc_library(
      processgroup_heter
      SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
      DEPS place
           npu_stream
           enforce
           collective_helper
           device_context
           phi_api
           eager_api)
  endif()
endif()

if(WITH_CUSTOM_DEVICE)
  cc_library(
    processgroup_custom
    SRCS ProcessGroupCustom.cc CustomCCLTools.cc Common.cc
    DEPS phi_backends
         place
         enforce
         collective_helper
         device_context
         phi_api
         eager_api)
endif()
paddle/fluid/distributed/collective/Common.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/Common.h"
namespace
paddle
{
namespace
distributed
{
std
::
vector
<
Place
>
GetPlaceList
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
std
::
vector
<
Place
>
places
;
places
.
reserve
(
tensors
.
size
());
for
(
auto
&
tensor
:
tensors
)
{
places
.
push_back
(
tensor
.
place
());
}
return
places
;
}
std
::
string
GetKeyFromPlaces
(
const
std
::
vector
<
Place
>&
places
)
{
std
::
string
placeList
;
for
(
auto
&
place
:
places
)
{
std
::
stringstream
tmp
;
tmp
<<
place
;
if
(
placeList
.
empty
())
{
placeList
+=
tmp
.
str
();
}
else
{
placeList
+=
","
+
tmp
.
str
();
}
}
return
placeList
;
}
bool
CheckTensorsInCudaPlace
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
phi
::
DenseTensor
&
t
)
{
return
platform
::
is_gpu_place
(
t
.
place
());
});
}
bool
CheckTensorsInCustomPlace
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
,
const
std
::
string
&
dev_type
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
phi
::
DenseTensor
&
t
)
{
return
platform
::
places_are_same_class
(
t
.
place
(),
paddle
::
platform
::
CustomPlace
(
dev_type
));
});
}
}
// namespace distributed
}
// namespace paddle
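A small usage sketch (not part of this commit; assumes a CUDA build, and the exact key text depends on Place's operator<<):

#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/collective/Common.h"

// GetKeyFromPlaces joins the printed places with commas, giving a stable
// cache key for a set of devices, e.g. something like
// "Place(gpu:0),Place(gpu:1)".
int main() {
  std::vector<paddle::distributed::Place> places = {
      paddle::platform::CUDAPlace(0), paddle::platform::CUDAPlace(1)};
  std::cout << paddle::distributed::GetKeyFromPlaces(places) << std::endl;
  return 0;
}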
paddle/fluid/distributed/collective/Common.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;

// Get the list of devices from list of tensors
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);

// Get the deviceList String from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places);

bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);

bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
                               const std::string& dev_type);

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/CustomCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
phi
::
ccl
::
CCLReduceOp
ToCustomCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
phi
::
ccl
::
CCLReduceOp
>
red_type
=
{
{
ReduceOp
::
MIN
,
phi
::
ccl
::
CCLReduceOp
::
MIN
},
{
ReduceOp
::
MAX
,
phi
::
ccl
::
CCLReduceOp
::
MAX
},
{
ReduceOp
::
SUM
,
phi
::
ccl
::
CCLReduceOp
::
SUM
},
{
ReduceOp
::
PRODUCT
,
phi
::
ccl
::
CCLReduceOp
::
PRODUCT
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"
));
return
it
->
second
;
}
std
::
string
SerializeCustomCCLUniqueId
(
const
phi
::
ccl
::
CCLRootId
&
ccl_id
)
{
const
uint8_t
*
bytes
=
ccl_id
.
data
();
std
::
ostringstream
oss
;
for
(
size_t
i
=
0
;
i
<
ccl_id
.
size
();
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
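A standalone sketch (not part of this commit) of the serialization loop above. Note that std::hex without zero-padding makes the encoding variable-width: it is fine as an opaque key, but not reversible byte-by-byte:

#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

// Same loop as SerializeCustomCCLUniqueId, on a plain byte vector.
// 0x0A prints as "a" (one char) and 0xB1 as "b1" (two chars), so the
// result "ab1" loses the original byte boundaries.
int main() {
  std::vector<uint8_t> id = {0x0A, 0xB1};
  std::ostringstream oss;
  for (size_t i = 0; i < id.size(); ++i) {
    oss << std::hex << static_cast<int>(id[i]);
  }
  std::cout << oss.str() << std::endl;  // prints "ab1"
  return 0;
}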
paddle/fluid/distributed/collective/CustomCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"

namespace paddle {
namespace distributed {

class CustomEventManager {
 public:
  CustomEventManager() = default;

  ~CustomEventManager() {
    if (is_created_) {
      event_->Destroy();
    }
  }

  CustomEventManager(const CustomEventManager&) = delete;
  CustomEventManager& operator=(const CustomEventManager&) = delete;

  CustomEventManager(CustomEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(device_type_, other.device_type_);
    std::swap(event_, other.event_);
  }

  CustomEventManager& operator=(CustomEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(device_type_, other.device_type_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  int8_t DeviceId() const { return device_index_; }
  std::string DeviceType() const { return device_type_; }
  phi::event::event_t GetRawCustomEvent() const { return event_->raw_event(); }
  phi::event::Event* GetCustomEvent() const { return event_.get(); }

  void Record(const paddle::platform::CustomDeviceContext& ctx) {
    auto place = ctx.GetPlace();
    auto device_type = place.GetDeviceType();
    auto device_index = place.GetDeviceId();
    if (!is_created_) {
      CreateEvent(place);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "CustomDeviceContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    PADDLE_ENFORCE_EQ(device_type,
                      device_type_,
                      platform::errors::PreconditionNotMet(
                          "CustomDeviceContext's device %d does not match"
                          "Event's device type %d",
                          device_type,
                          device_type_));
    phi::DeviceGuard guard(place);
    phi::stream::Stream stream(place, ctx.stream());
    event_->Record(&stream);
  }

  bool Query() const { return event_->Query(); }

  void Block(const paddle::platform::CustomDeviceContext& ctx) const {
    if (is_created_) {
      auto place = ctx.GetPlace();
      auto device_type = place.GetDeviceType();
      auto device_index = place.GetDeviceId();
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "CustomDeviceContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      PADDLE_ENFORCE_EQ(device_type,
                        device_type_,
                        platform::errors::PreconditionNotMet(
                            "CustomDeviceContext's device %d does not match"
                            "Event's device type %d",
                            device_type,
                            device_type_));
      phi::DeviceGuard guard(place);
      phi::stream::Stream stream(place, ctx.stream());
      stream.WaitEvent(event_.get());
    }
  }

 private:
  bool is_created_{false};
  std::shared_ptr<phi::event::Event> event_{nullptr};
  int8_t device_index_{0};
  std::string device_type_;

 private:
  void CreateEvent(const platform::Place& place) {
    device_index_ = place.GetDeviceId();
    device_type_ = place.GetDeviceType();
    event_.reset(new phi::event::Event);
    event_->Init(place);
    is_created_ = true;
  }
};

class CustomCCLCommManager {
 public:
  CustomCCLCommManager(const std::string& device_type,
                       phi::ccl::CCLComm ccl_comm)
      : device_type_(device_type), ccl_comm_(ccl_comm) {}

  CustomCCLCommManager() : CustomCCLCommManager("", nullptr) {}

  ~CustomCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (ccl_comm_) {
      phi::DeviceManager::CCLDestroyComm(device_type_, ccl_comm_);
    }
  }

  static std::shared_ptr<CustomCCLCommManager> Create(
      const std::string& device_type,
      int num_ranks,
      int rank,
      phi::ccl::CCLRootId* comm_id,
      phi::ccl::CCLComm* ccl_comm) {
    auto custom_ccl_manager = std::make_shared<CustomCCLCommManager>();
    phi::DeviceManager::CCLCommInitRank(
        device_type, num_ranks, comm_id, rank, ccl_comm);
    custom_ccl_manager->device_type_ = device_type;
    custom_ccl_manager->ccl_id_ = comm_id;
    custom_ccl_manager->rank_ = rank;
    custom_ccl_manager->ccl_comm_ = *ccl_comm;
    return custom_ccl_manager;
  }

  phi::ccl::CCLRootId* GetCustomCCLId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return ccl_id_;
  }

  phi::ccl::CCLComm GetCustomCCLComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return ccl_comm_;
  }

  CustomCCLCommManager(const CustomCCLCommManager&) = delete;
  CustomCCLCommManager& operator=(const CustomCCLCommManager&) = delete;
  CustomCCLCommManager& operator=(CustomCCLCommManager&& other) = delete;

  CustomCCLCommManager(CustomCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(ccl_comm_, other.ccl_comm_);
  }

 protected:
  std::string device_type_;
  phi::ccl::CCLComm ccl_comm_;
  phi::ccl::CCLRootId* ccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction);
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id);

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/HCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
HcclReduceOp
ToHCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
HcclReduceOp
>
red_type
=
{
{
ReduceOp
::
MIN
,
HCCL_REDUCE_MIN
},
{
ReduceOp
::
MAX
,
HCCL_REDUCE_MAX
},
{
ReduceOp
::
SUM
,
HCCL_REDUCE_SUM
},
{
ReduceOp
::
PRODUCT
,
HCCL_REDUCE_PROD
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"
));
return
it
->
second
;
}
std
::
string
SerializeHCCLUniqueId
(
const
HcclRootInfo
&
hcclID
)
{
const
uint8_t
*
bytes
=
reinterpret_cast
<
const
uint8_t
*>
(
&
hcclID
);
std
::
ostringstream
oss
;
for
(
size_t
i
=
0
;
i
<
sizeof
(
hcclID
);
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/HCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace distributed {

class NPUEventManager {
 public:
  NPUEventManager() = default;

  ~NPUEventManager() {
    if (is_created_) {
      platform::NPUDeviceGuard guard(device_index_);
      platform::NPUEventDestroy(event_);
    }
  }

  NPUEventManager(const NPUEventManager&) = delete;
  NPUEventManager& operator=(const NPUEventManager&) = delete;

  NPUEventManager(NPUEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }

  NPUEventManager& operator=(NPUEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  bool DeviceId() const { return device_index_; }
  aclrtEvent GetRawNPUEvent() const { return event_; }

  void Record(const paddle::platform::NPUDeviceContext& ctx) {
    auto device_index = ctx.GetPlace().device;
    if (!is_created_) {
      CreateEvent(device_index);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "NPUDeviceContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    platform::NPUDeviceGuard guard(device_index_);
    platform::NPUEventRecord(event_, ctx.stream());
  }

  bool Query() const {
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    platform::NPUEventQuery(event_, &status);
    if (status == ACL_EVENT_STATUS_COMPLETE) {
      return true;
    }
    return false;
  }

  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
    if (is_created_) {
      auto device_index = ctx.GetPlace().device;
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "phi::GPUContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      platform::NPUDeviceGuard guard(device_index_);
      platform::NPUStreamWaitEvent(ctx.stream(), event_);
    }
  }

 private:
  bool is_created_{false};
  aclrtEvent event_{};
  int8_t device_index_{0};

 private:
  void CreateEvent(int device_index) {
    device_index_ = device_index;
    platform::NPUDeviceGuard guard(device_index);
    platform::NPUEventCreate(&event_);
    is_created_ = true;
  }
};

class HCCLCommManager {
 public:
  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}

  HCCLCommManager() : HCCLCommManager(nullptr) {}

  ~HCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (hccl_comm_) {
      platform::dynload::HcclCommDestroy(hccl_comm_);
    }
  }

  static std::shared_ptr<HCCLCommManager> Create(int num_ranks,
                                                 int rank,
                                                 HcclRootInfo* comm_id,
                                                 HcclComm hccl_comm) {
    auto hccl_manager = std::make_shared<HCCLCommManager>();
    auto ret = platform::dynload::HcclCommInitRootInfo(
        num_ranks, comm_id, rank, &hccl_comm);
    using __NPU_STATUS_TYPE__ = decltype(ret);
    constexpr auto __success_type__ =
        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
    if (UNLIKELY(ret != __success_type__)) {
      VLOG(0) << "Error: create hccl_id error.";
      exit(-1);
    }
    hccl_manager->hccl_id_ = comm_id;
    hccl_manager->rank_ = rank;
    hccl_manager->hccl_comm_ = hccl_comm;
    return hccl_manager;
  }

  HcclRootInfo* GetHcclId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return hccl_id_;
  }

  HcclComm GetHcclComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return hccl_comm_;
  }

  HCCLCommManager(const HCCLCommManager&) = delete;
  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;

  HCCLCommManager(HCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(hccl_comm_, other.hccl_comm_);
  }

 protected:
  HcclComm hccl_comm_;
  HcclRootInfo* hccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

HcclReduceOp ToHCCLRedType(ReduceOp reduction);
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID);

}  // namespace distributed
}  // namespace paddle
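A distilled sketch (not part of this commit) of the move-construction pattern shared by HCCLCommManager and its NCCL/CustomCCL counterparts: std::mutex is neither movable nor copyable, so the move constructor locks the source and swaps only the guarded handle, leaving the moved-from object empty but valid:

#include <mutex>
#include <utility>

// Minimal stand-in: `handle_` plays the role of hccl_comm_/nccl_comm_.
class HandleOwner {
 public:
  HandleOwner() = default;
  HandleOwner(HandleOwner&& other) {
    // Lock the source so a concurrent getter cannot observe a torn state;
    // the destination is not yet visible to other threads.
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(handle_, other.handle_);
  }
  HandleOwner(const HandleOwner&) = delete;
  HandleOwner& operator=(const HandleOwner&) = delete;

 private:
  void* handle_{nullptr};
  mutable std::mutex mutex_;  // each object keeps its own mutex
};

int main() {
  HandleOwner a;
  HandleOwner b(std::move(a));  // a's handle transfers to b
  return 0;
}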
paddle/fluid/distributed/collective/MPITools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/MPITools.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
namespace
mpi
{
MPI_Op
ToMPIType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
MPI_Op
>
red_type
=
{
{
ReduceOp
::
MIN
,
MPI_MIN
},
{
ReduceOp
::
MAX
,
MPI_MAX
},
{
ReduceOp
::
SUM
,
MPI_SUM
},
{
ReduceOp
::
PRODUCT
,
MPI_PROD
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
"MPI_PROD | MPI_SUM."
));
return
it
->
second
;
}
// NOTE: MPI dose not support CUDA aware now.
bool
CheckMpiCudaAware
()
{
return
false
;
}
void
CheckValidInputs
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
PADDLE_ENFORCE_EQ
(
tensors
.
size
()
==
1
,
true
,
platform
::
errors
::
InvalidArgument
(
"the inputs size of MPI must be 1!"
));
PADDLE_ENFORCE_EQ
(
CheckTensorsInCudaPlace
(
tensors
)
&&
!
CheckMpiCudaAware
(),
false
,
platform
::
errors
::
InvalidArgument
(
"Found CUDA Tensor. But CUDA-aware MPI not support!"
));
}
}
// namespace mpi
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/MPITools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <iostream>
#include <string>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"

#include "paddle/fluid/distributed/collective/Types.h"

#ifdef HOST
#undef HOST
#endif

#include <mpi.h>

namespace paddle {
namespace distributed {
namespace mpi {

#define MPI_CHECK(cmd)                                                     \
  do {                                                                     \
    int r = cmd;                                                           \
    if (r != MPI_SUCCESS) {                                                \
      LOG(FATAL) << "Failed, MPI error in" << __FILE__ << ":" << __LINE__  \
                 << "with error code: " << std::to_string(r) << std::endl; \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

MPI_Op ToMPIType(ReduceOp reduction);

bool CheckMpiCudaAware();

void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);

}  // namespace mpi
}  // namespace distributed
}  // namespace paddle
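A hypothetical usage sketch of the MPI_CHECK macro above (not part of this commit; assumes a Paddle build with WITH_MPI, so glog's LOG(FATAL) used by the macro is available):

#include <mpi.h>
#include "paddle/fluid/distributed/collective/MPITools.h"

// MPI_CHECK aborts the process on any non-MPI_SUCCESS return code, so
// every MPI call can be wrapped uniformly.
int main(int argc, char** argv) {
  MPI_CHECK(MPI_Init(&argc, &argv));
  int rank = 0;
  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  // ... collective work keyed on `rank` would go here ...
  MPI_CHECK(MPI_Finalize());
  return 0;
}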
paddle/fluid/distributed/collective/NCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
ncclRedOp_t
ToNCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
ncclRedOp_t
>
red_type
=
{
{
ReduceOp
::
MIN
,
ncclMin
},
{
ReduceOp
::
MAX
,
ncclMax
},
{
ReduceOp
::
SUM
,
ncclSum
},
{
ReduceOp
::
PRODUCT
,
ncclProd
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid nccl reduction. Must be ncclMin | ncclMax | "
"ncclProd | ncclSum"
));
return
it
->
second
;
}
std
::
string
SerializeNCCLUniqueId
(
const
ncclUniqueId
&
ncclID
)
{
const
uint8_t
*
bytes
=
reinterpret_cast
<
const
uint8_t
*>
(
&
ncclID
);
std
::
ostringstream
oss
;
for
(
auto
i
=
0
;
i
<
NCCL_UNIQUE_ID_BYTES
;
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/NCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif

#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace distributed {

#define NCCLCHECK(cmd)                                  \
  do {                                                  \
    ncclResult_t r = cmd;                               \
    if (r != ncclSuccess) {                             \
      printf("Failed, NCCL error %s:%d '%s'\n",         \
             __FILE__,                                  \
             __LINE__,                                  \
             platform::dynload::ncclGetErrorString(r)); \
      exit(EXIT_FAILURE);                               \
    }                                                   \
  } while (0)

// NOTE(shenliang03): EventManager is a movable, non-copyable wrapper of a
// CUDA event. EventManager is different from paddle::platform::CudaEvent.
// It uses lazy initialization and is only created when the
// Record() method is called for the first time; it also monitors
// device information to ensure that the recorded stream and event
// are on the same device.
class EventManager {
 public:
  EventManager() {}
  explicit EventManager(unsigned int flags) : flags_{flags} {}

  ~EventManager() {
    if (is_created_) {
      platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
      hipEventDestroy(event_);
#else
      cudaEventDestroy(event_);
#endif
    }
  }

  EventManager(const EventManager&) = delete;
  EventManager& operator=(const EventManager&) = delete;

  EventManager(EventManager&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }

  EventManager& operator=(EventManager&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  bool DeviceId() const { return device_index_; }
  gpuEvent_t GetRawCudaEvent() const { return event_; }

  void Record(const phi::GPUContext& ctx) {
    auto device_index = ctx.GetPlace().device;
    if (!is_created_) {
      CreateEvent(device_index);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "phi::GPUContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
#endif
  }

  bool Query() const {
#ifdef PADDLE_WITH_HIP
    gpuError_t err = hipEventQuery(event_);
    if (err == hipSuccess) {
      return true;
    }
    if (err == hipErrorNotReady) {
      return false;
    }
#else
    gpuError_t err = cudaEventQuery(event_);
    if (err == cudaSuccess) {
      return true;
    }
    if (err == cudaErrorNotReady) {
      return false;
    }
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(err);
    return false;
  }

  void Synchronize() const {
    if (is_created_) {
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
    }
  }

  void Block(const phi::GPUContext& ctx) const {
    if (is_created_) {
      auto device_index = ctx.GetPlace().device;
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "phi::GPUContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
#endif
    }
  }

 private:
#ifdef PADDLE_WITH_HIP
  unsigned int flags_ = hipEventDefault;
#else
  unsigned int flags_ = cudaEventDefault;
#endif
  bool is_created_{false};
  gpuEvent_t event_{};
  int8_t device_index_{0};

 private:
  void CreateEvent(int device_index) {
    device_index_ = device_index;
    platform::CUDADeviceGuard guard(device_index);
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
#endif
    is_created_ = true;
  }
};

// NOTE(shenliang03): NCCLCommManager is more lightweight than
// platform::NCCLComm
class NCCLCommManager {
 public:
  explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}

  NCCLCommManager() : NCCLCommManager(nullptr) {}

  ~NCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (nccl_comm_) {
      platform::dynload::ncclCommDestroy(nccl_comm_);
    }
  }

  static std::shared_ptr<NCCLCommManager> Create(int num_ranks,
                                                 int rank,
                                                 ncclUniqueId comm_id) {
    auto nccl_manager = std::make_shared<NCCLCommManager>();
    NCCLCHECK(platform::dynload::ncclCommInitRank(
        &(nccl_manager->nccl_comm_), num_ranks, comm_id, rank));
    nccl_manager->nccl_id_ = comm_id;
    nccl_manager->rank_ = rank;
    return nccl_manager;
  }

  ncclUniqueId GetNcclId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return nccl_id_;
  }

  ncclComm_t GetNcclComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return nccl_comm_;
  }

  NCCLCommManager(const NCCLCommManager&) = delete;
  NCCLCommManager& operator=(const NCCLCommManager&) = delete;
  NCCLCommManager& operator=(NCCLCommManager&& other) = delete;

  NCCLCommManager(NCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(nccl_comm_, other.nccl_comm_);
  }

 protected:
  ncclComm_t nccl_comm_;
  ncclUniqueId nccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);

}  // namespace distributed
}  // namespace paddle
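A bare-CUDA sketch (not part of this commit) of the synchronization pattern EventManager wraps: record an event on a producer stream and make a consumer stream wait on it, so work queued on the consumer after the wait cannot start before the producer reaches the record point. EventManager adds lazy creation and same-device checks on top of this; error handling is elided here:

#include <cuda_runtime.h>

int main() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDefault);

  // ... enqueue kernels on `producer` ...
  cudaEventRecord(event, producer);         // cf. EventManager::Record
  cudaStreamWaitEvent(consumer, event, 0);  // cf. EventManager::Block
  // ... enqueue dependent kernels on `consumer` ...

  cudaEventDestroy(event);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
  return 0;
}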
paddle/fluid/distributed/collective/ProcessGroup.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace
paddle
{
namespace
distributed
{
ProcessGroup
::
Task
::
Task
(
int
rank
,
const
std
::
vector
<
phi
::
DenseTensor
>&
inputs
,
CommType
comm_type
)
:
rank_
(
rank
),
comm_type_
(
comm_type
)
{}
ProcessGroup
::
Task
::
Task
(
int
rank
,
const
std
::
vector
<
phi
::
DenseTensor
>&
inputs
,
CommType
comm_type
,
bool
sync_op
)
:
rank_
(
rank
),
comm_type_
(
comm_type
),
sync_op_
(
sync_op
)
{}
ProcessGroup
::
Task
::~
Task
()
=
default
;
bool
ProcessGroup
::
Task
::
IsCompleted
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
is_completed_
;
}
bool
ProcessGroup
::
Task
::
Wait
(
std
::
chrono
::
milliseconds
timeout
)
{
return
false
;
}
void
ProcessGroup
::
Task
::
Synchronize
()
{}
ProcessGroup
::
ProcessGroup
(
int
rank
,
int
size
,
const
platform
::
Place
&
place
,
int
gid
)
:
rank_
(
rank
),
size_
(
size
),
place_
(
place
),
gid_
(
gid
)
{
if
(
gid
!=
IGNORE_ID
)
{
auto
map
=
ProcessGroupMapFromGid
::
getInstance
();
map
->
insert
(
gid_
,
this
);
}
}
ProcessGroup
::
ProcessGroup
(
int
rank
,
int
size
,
int
gid
)
:
rank_
(
rank
),
size_
(
size
),
gid_
(
gid
)
{
if
(
gid
!=
IGNORE_ID
)
{
auto
map
=
ProcessGroupMapFromGid
::
getInstance
();
map
->
insert
(
gid_
,
this
);
}
}
}
// namespace distributed
}
// namespace paddle
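Callers drive the returned Task handle rather than the collective itself. A hypothetical caller-side sketch (not part of this commit); note that the base-class Wait above is a stub that always returns false, so concrete backends override it:

#include <chrono>
#include <memory>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"

// Hypothetical helper: block on a collective's Task, falling back to
// polling IsCompleted() if Wait() reports failure or timeout.
void WaitForCollective(
    const std::shared_ptr<paddle::distributed::ProcessGroup::Task>& task) {
  if (!task->Wait(std::chrono::milliseconds(30000))) {
    while (!task->IsCompleted()) {
      // busy-poll; a real caller would yield or sleep here
    }
  }
  task->Synchronize();
}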
paddle/fluid/distributed/collective/ProcessGroup.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
constexpr auto kWaitTimeout = std::chrono::milliseconds(0);

namespace paddle {
namespace distributed {

constexpr int IGNORE_ID = -1;
using Tensor = paddle::experimental::Tensor;

enum class CommType : std::uint8_t {
  BROADCAST = 0,
  ALLREDUCE = 1,
  ALLREDUCE_SPARSE = 2,  // TODO(shenliang03): to support sparse in allreduce
  REDUCE = 3,
  ALLGATHER = 4,
  GATHER = 5,
  SCATTER = 6,
  REDUCE_SCATTER = 7,
  ALLTOALL = 8,
  SEND = 9,
  RECV = 10,
  BARRIER = 11,
  ALLTOALL_SINGLE = 12,
  UNKNOWN = 100,
};

class ProcessGroup {
 public:
  class Task {
   public:
    Task(int rank,
         const std::vector<phi::DenseTensor>& inputs,
         CommType comm_type);
    Task(int rank,
         const std::vector<phi::DenseTensor>& inputs,
         CommType comm_type,
         bool sync_op);

    virtual ~Task();
    virtual bool IsCompleted();
    virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
    virtual void Synchronize();
    bool IsSync() const { return sync_op_; }

   protected:
    const int rank_;
    CommType comm_type_{CommType::UNKNOWN};
    std::mutex mutex_;
    bool is_completed_{false};

   private:
    bool sync_op_{true};
  };

  explicit ProcessGroup(int rank,
                        int size,
                        const platform::Place& place,
                        int gid);
  explicit ProcessGroup(int rank, int size, int gid);
  virtual ~ProcessGroup() {}

  int GetRank() const { return rank_; }
  int GetSize() const { return size_; }

  virtual const std::string GetBackendName() const = 0;

  virtual phi::DeviceContext* GetDeviceContext(const Place& place) const {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Does not support to get device_context from ProcessGroup%s.",
        GetBackendName()));
  }

  // TODO(liyurui): This API will be moved later
  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const AllreduceOptions& = AllreduceOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support allreduce", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const AllreduceOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support allreduce with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const BroadcastOptions& = BroadcastOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support broadcast", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const BroadcastOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support broadcast with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support barrier", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int64_t,
      int64_t) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int64_t,
      int64_t) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>&,    // NOLINT
      std::vector<phi::DenseTensor>&) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support all_gather", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support all_gather with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      int64_t offset,
      int64_t length) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllGather_Partial",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      int64_t offset,
      int64_t length,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllGather_Partial",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>&,    // NOLINT
      std::vector<phi::DenseTensor>&) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllToAll", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support alltoall", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<int64_t>&,
      std::vector<int64_t>&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<int64_t>&,
      std::vector<int64_t>&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support alltoall_single", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ReduceOptions& opts) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const ReduceOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ScatterOptions&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support scatter", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ScatterOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support scatter with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ReduceScatterOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce_scatter with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
      phi::DenseTensor&,  // NOLINT
      phi::DenseTensor&,  // NOLINT
      const ReduceScatterOptions&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support ReduceScatter", GetBackendName()));
  }

 protected:
  const int rank_;
  const int size_;
  const platform::Place place_;
  const int gid_;
};

class ProcessGroupMapFromGid {
 public:
  bool has(int gid) {
    auto it = map_.find(gid);
    return it != map_.end();
  }

  void insert(int gid, ProcessGroup* pg) {
    // TODO(sandyhouse): address ut and uncomment the following codes
    // PADDLE_ENFORCE_EQ(has(gid), false,
    //                   platform::errors::PreconditionNotMet(
    //                       "The process group with id %d doesnot exist.",
    //                       gid));
    map_[gid] = pg;
  }

  ProcessGroup* get(int gid) {
    // TODO(sandyhouse): address ut and uncomment the following codes
    // PADDLE_ENFORCE_EQ(has(gid), true,
    //                   platform::errors::PreconditionNotMet(
    //                       "The process group with id %d doesnot exist.",
    //                       gid));
    return map_.find(gid)->second;
  }

  static std::shared_ptr<ProcessGroupMapFromGid> getInstance() {
    static auto s_instance = std::make_shared<ProcessGroupMapFromGid>();
    return s_instance;
  }

  ProcessGroupMapFromGid() = default;
  ~ProcessGroupMapFromGid() = default;

 private:
  std::unordered_map<int, ProcessGroup*> map_;
};

}  // namespace distributed
}  // namespace paddle
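
Editorial note: a minimal usage sketch (not part of this commit) of the gid-keyed registry above. The function name `RegisterGroupForExample` and its call site are illustrative assumptions; only `ProcessGroupMapFromGid`'s own API comes from the header.

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

namespace paddle {
namespace distributed {

// Hypothetical helper: make a backend's ProcessGroup discoverable by the
// integer group id that op kernels carry.
void RegisterGroupForExample(ProcessGroup* pg, int gid) {
  auto map = ProcessGroupMapFromGid::getInstance();
  if (!map->has(gid)) {
    map->insert(gid, pg);  // the registry stores a raw, non-owning pointer
  }
}

}  // namespace distributed
}  // namespace paddle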
paddle/fluid/distributed/collective/ProcessGroupCustom.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(xccl_blocking_wait);

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

void SyncDefaultStream(
    const std::vector<Place>& places,
    std::vector<CustomEventManager>& cclEvents,                    // NOLINT
    std::vector<std::unique_ptr<CustomDeviceContext>>& dev_ctx) {  // NOLINT
  for (size_t i = 0; i < places.size(); ++i) {
    auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places[i]));
    cclEvents[i].Record(*dev_ctx[i]);
    cclEvents[i].Block(*default_ctx);
  }
}

std::shared_ptr<ProcessGroupCustom::CustomTask> ProcessGroupCustom::CreateTask(
    std::vector<Place> places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs) {
  return std::make_shared<ProcessGroupCustom::CustomTask>(
      places, rank, comm_type, inputs);
}

ProcessGroupCustom::CustomTask::CustomTask(
    const std::vector<Place>& places,
    int rank,
    CommType CommType,
    const std::vector<phi::DenseTensor>& inputs)
    : Task(rank, inputs, CommType), places_(places) {
  control_events_.resize(places.size());
  cclComms_.resize(places.size());
}

ProcessGroupCustom::CustomTask::~CustomTask() {}

void ProcessGroupCustom::CustomTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}

void ProcessGroupCustom::CustomTask::SynchronizeStreams() {
  for (size_t i = 0; i < places_.size(); ++i) {
    auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places_[i]));
    phi::DeviceGuard guard(default_ctx->GetPlace());
    phi::stream::Stream stream(default_ctx->GetPlace(), default_ctx->stream());
    stream.WaitEvent(control_events_[i].GetCustomEvent());
  }
}

bool ProcessGroupCustom::CustomTask::IsCompleted() {
  for (size_t i = 0; i < places_.size(); ++i) {
    if (!control_events_[i].Query()) {
      return false;
    }
  }
  return true;
}

bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
  SynchronizeStreams();
  while (!IsCompleted()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
  }
  return true;
}

// Same as Wait
void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }

ProcessGroupCustom::ProcessGroupCustom(const std::shared_ptr<Store>& store,
                                       int rank,
                                       int size,
                                       const platform::Place& place,
                                       int gid)
    : ProcessGroup(rank, size, place, gid),
      store_(store),
      device_type_(place.GetDeviceType()) {
  phi::DeviceManager::SetDevice(place_);
}

void ProcessGroupCustom::BroadcastUniqueCustomID(
    std::vector<phi::ccl::CCLRootId>& ccl_ids) {  // NOLINT
  if (rank_ == 0) {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
      store_->set(key, ccl_ids[i]);
    }
  } else {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
      ccl_ids[i] = store_->get(key);
    }
  }
}

// create CustomCCLManager cache for places_key
void ProcessGroupCustom::CreateCustomManagerCache(
    const std::string& places_key, const std::vector<Place>& places) {
  PADDLE_ENFORCE_EQ(
      places_key.empty(),
      false,
      platform::errors::PreconditionNotMet(
          "Not able to create/get the HCCL Communicator since "
          "the NPU place are not known"));
  const std::string device_type = places.back().GetDeviceType();

  std::vector<std::shared_ptr<CustomCCLCommManager>> ccl_comms;
  ccl_comms.resize(places.size());

  // using vector just for broadcast
  std::vector<phi::ccl::CCLRootId> ccl_ids;
  ccl_ids.resize(1);
  auto& ccl_id = ccl_ids.front();

  if (rank_ == 0) {
    phi::DeviceManager::CCLGetUniqueId(device_type, &ccl_id);
  }
  BroadcastUniqueCustomID(ccl_ids);

  VLOG(3) << "init custom ccl rank: " << rank_ << ", nranks: " << size_
          << ", place: " << places_key
          << ", custom ccl uniqueid: " << SerializeCustomCCLUniqueId(ccl_id);

  std::vector<std::unique_ptr<CustomDeviceContext>> dev_ctx;
  dev_ctx.resize(places.size());

  std::unique_ptr<phi::ccl::CCLComm> comms(
      new phi::ccl::CCLComm[places.size()]);
  for (size_t i = 0; i < places.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    ccl_comms[i] = CustomCCLCommManager::Create(
        device_type, GetSize(), GetRank(), &ccl_id, comms.get() + i);
    dev_ctx[i].reset(new CustomDeviceContext(places[i]));
  }

  std::vector<CustomEventManager> events;
  events.resize(places.size());

  // These caches will be useful to process sync/wait/communicate
  places_to_events_.emplace(places_key, std::move(events));
  places_to_customcomm_.emplace(places_key, std::move(ccl_comms));
  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType op_type) {
  const auto places = GetPlaceList(inputs);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_customcomm_.find(key) == places_to_customcomm_.end()) {
      CreateCustomManagerCache(key, places);
    }
  }

  auto& ccl_comms = places_to_customcomm_[key];
  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  auto task = CreateTask(places, rank_, op_type, inputs);
  task->SetOutputs(outputs);

  for (size_t i = 0; i < inputs.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    const auto& ccl_stream = places_to_ctx_[key][i]->stream();
    phi::stream::Stream stream(places[i], ccl_stream);
    fn(inputs[i], outputs[i], ccl_comms[i]->GetCustomCCLComm(), stream);
  }

  for (size_t i = 0; i < inputs.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    task->control_events_[i].Record(*places_to_ctx_[key][i]);
  }
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllGather(
            device_type_,
            input.data(),
            output.data(),
            input.numel(),
            phi::ccl::ToCCLDataType(input.dtype()),
            comm,
            stream);
      },
      CommType::ALLGATHER);
}

void* XcclGetPointerByOffset(void* raw_pointer,
                             size_t offset,
                             experimental::DataType type) {
  if (type == experimental::DataType::FLOAT32) {
    return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT64) {
    return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT32) {
    return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT64) {
    return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT16) {
    return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
                                   offset);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "This datatype in xccl is not supported."));
  }
  return nullptr;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllGather(
            device_type_,
            XcclGetPointerByOffset(input.data(), offset, input.dtype()),
            output.data(),
            length,
            phi::ccl::ToCCLDataType(input.dtype()),
            comm,
            stream);
      },
      CommType::ALLGATHER);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const AllreduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllReduce(
            device_type_,
            input.data(),
            output.data(),
            input.numel(),
            phi::ccl::ToCCLDataType(input.dtype()),
            ToCustomCCLRedType(opts.reduce_op),
            comm,
            stream);
      },
      CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const BroadcastOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        int root = opts.source_rank * in_tensors.size() + opts.source_root;
        if (rank_ == root) {
          return phi::DeviceManager::CCLBroadcast(
              device_type_,
              input.data(),
              input.numel(),
              phi::ccl::ToCCLDataType(input.dtype()),
              root,
              comm,
              stream);
        } else {
          return phi::DeviceManager::CCLBroadcast(
              device_type_,
              output.data(),
              output.numel(),
              phi::ccl::ToCCLDataType(output.dtype()),
              root,
              comm,
              stream);
        }
      },
      CommType::BROADCAST);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
    const BarrierOptions& opts) {
  // Only support single card single process
  std::vector<phi::CustomPlace> places = {place_};
  std::vector<phi::DenseTensor> barrierTensors;
  barrierTensors.reserve(places.size());
  for (auto& place : places) {
    phi::DeviceGuard guard(place);
    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
    barrierTensors.push_back(
        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
  }
  auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors);
  auto xccl_task = dynamic_cast<ProcessGroupCustom::CustomTask*>(task.get());
  xccl_task->barrierTensors_ = std::move(barrierTensors);
  return task;
}

}  // namespace distributed
}  // namespace paddle
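
Editorial note: a hedged sketch (not part of this commit) of how a caller might derive the `offset`/`length` arguments for `AllGather_Partial` above so that each rank contributes one contiguous shard. The helper name `AllGatherShardForExample` and the `dense_numel` parameter are illustrative assumptions; it assumes the element count divides evenly by the group size.

#include <memory>

#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"

namespace paddle {
namespace distributed {

// Gather one contiguous per-rank shard of a flattened tensor.
std::shared_ptr<ProcessGroup::Task> AllGatherShardForExample(
    ProcessGroupCustom* pg,
    std::vector<phi::DenseTensor>& in,   // NOLINT
    std::vector<phi::DenseTensor>& out,  // NOLINT
    int64_t dense_numel) {
  const int64_t shard_len = dense_numel / pg->GetSize();  // assumes exact split
  const int64_t my_offset = shard_len * pg->GetRank();
  return pg->AllGather_Partial(in, out, my_offset, shard_len);
}

}  // namespace distributed
}  // namespace paddle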
paddle/fluid/distributed/collective/ProcessGroupCustom.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;
using CustomDeviceContext = paddle::platform::CustomDeviceContext;

class ProcessGroupCustom : public ProcessGroup {
 public:
  class CustomTask : public ProcessGroup::Task,
                     public std::enable_shared_from_this<CustomTask> {
   public:
    CustomTask(const std::vector<Place>& places,
               int rank,
               CommType CommType,
               const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();
    void SynchronizeStreams();
    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
    void Synchronize();
    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
    virtual ~CustomTask();

    std::vector<CustomEventManager> control_events_;
    std::vector<phi::DenseTensor> barrierTensors_;

   protected:
    std::vector<Place> places_;
    std::vector<std::shared_ptr<CustomCCLCommManager>> cclComms_;
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;

   private:
    const std::string device_type_;
  };

  ProcessGroupCustom(const std::shared_ptr<Store>& store,
                     int rank,
                     int size,
                     const platform::Place& place,
                     int gid);

  const std::string GetBackendName() const override {
    return "XCCL_" + device_type_;
  }

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      int64_t offset,
      int64_t length) override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

 protected:
  virtual std::shared_ptr<ProcessGroupCustom::CustomTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType opType,
      const std::vector<phi::DenseTensor>& inputs);

  std::shared_ptr<Store> store_;
  std::shared_ptr<CustomCCLCommManager> custom_comm_;
  std::mutex mutex_;
  std::unordered_map<std::string,
                     std::vector<std::shared_ptr<CustomCCLCommManager>>>
      places_to_customcomm_;
  std::unordered_map<std::string, std::vector<CustomEventManager>>
      places_to_events_;
  std::unordered_map<std::string,
                     std::vector<std::unique_ptr<CustomDeviceContext>>>
      places_to_ctx_;
  std::set<int> used_place_ids_;

 private:
  void BcastCustomId(std::vector<phi::ccl::CCLRootId>& ccl_ids,  // NOLINT
                     int root,
                     int server_fd);

  void BroadcastUniqueCustomID(
      std::vector<phi::ccl::CCLRootId>& custom_ccl_ids);  // NOLINT

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  void CreateCustomManagerCache(const std::string& places_key,
                                const std::vector<Place>& places);

  const std::string device_type_;
};

}  // namespace distributed
}  // namespace paddle
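
Editorial note: a hedged sketch (not part of this commit) of the task lifecycle this header exposes. The helper name `AllReduceBlockingForExample` is an illustrative assumption; the calls themselves come from the declarations above.

#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"

namespace paddle {
namespace distributed {

// Launches an allreduce on the custom device and blocks until the recorded
// device events complete; CustomTask::Wait ignores its timeout argument and
// polls IsCompleted() until the events are reached.
void AllReduceBlockingForExample(ProcessGroupCustom* pg,
                                 std::vector<phi::DenseTensor>& ins,     // NOLINT
                                 std::vector<phi::DenseTensor>& outs) {  // NOLINT
  auto task = pg->AllReduce(ins, outs, AllreduceOptions());
  task->Wait();  // defaults to kWaitTimeout
}

}  // namespace distributed
}  // namespace paddle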
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
case experimental::DataType::INT8: \
func<int8_t>(args); \
break; \
case experimental::DataType::UINT8: \
func<uint8_t>(args); \
break; \
case experimental::DataType::BOOL: \
func<bool>(args); \
break; \
case experimental::DataType::BFLOAT16: \
func<bfloat16>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
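// Editorial note (hedged): GENERATE_FUNC turns a runtime
// experimental::DataType into a compile-time template instantiation.
// For example, a call such as
//   GENERATE_FUNC(dtype, set_output, opts, out);
// expands roughly to (FLOAT32 case shown):
//   switch (dtype) {
//     case experimental::DataType::FLOAT32:
//       set_output<float>(opts, out);
//       break;
//     // ... one case per supported dtype ...
//   }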
typedef void (*reduce_func)(void*, const void*, const void*, size_t);

template <typename T>
reduce_func get_function(const ReduceOp& r) {
  switch (r) {
    case ReduceOp::SUM:
      return reduce_func(&::gloo::sum<T>);
    case ReduceOp::PRODUCT:
      return reduce_func(&::gloo::product<T>);
    case ReduceOp::MIN:
      return reduce_func(&::gloo::min<T>);
    case ReduceOp::MAX:
      return reduce_func(&::gloo::max<T>);
    case ReduceOp::AVG:
      VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
      exit(-1);
  }
  VLOG(0) << "Error: Unknown ReduceOp.";
  exit(-1);
}

template <typename T>
T* get_data(phi::DenseTensor& tensor) {  // NOLINT
  return reinterpret_cast<T*>(tensor.data());
}

template <typename T>
std::vector<T*> get_multi_data(
    std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  std::vector<T*> ret;
  ret.reserve(tensors.size());
  for (size_t i = 0; i < tensors.size(); i++) {
    ret.push_back(get_data<T>(tensors[i]));
  }
  return ret;
}

template <typename T, typename P>
void set_output(P& opts, phi::DenseTensor& tensor) {  // NOLINT
  opts.setOutput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_input(P& opts, phi::DenseTensor& tensor) {  // NOLINT
  opts.setInput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_outputs(P& opts,                                   // NOLINT
                 std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}

template <typename T, typename P>
void set_inputs(P& opts,                                   // NOLINT
                std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}

template <typename T, typename P>
void set_inputs_for_scatter(P& opts,                   // NOLINT
                            phi::DenseTensor& tensor,  // NOLINT
                            int nranks) {
  std::vector<T*> ret;
  ret.reserve(nranks);
  T* raw_pointer = reinterpret_cast<T*>(tensor.data());
  size_t offset = 0;
  for (int i = 0; i < nranks; i++) {
    ret.push_back(raw_pointer + offset);
    offset += tensor.numel() / nranks;
  }
  opts.setInputs(ret, tensor.numel() / nranks);
}

ProcessGroupGloo::GlooTask::GlooTask(
    int rank, const std::vector<phi::DenseTensor>& inputs, CommType comm_type)
    : ProcessGroup::Task(rank, inputs, comm_type) {}

ProcessGroupGloo::ProcessGroupGloo(
    const std::shared_ptr<distributed::Store>& store,
    int rank,
    int world_size,
    const platform::Place& place,
    int gid,
    const std::shared_ptr<GlooOptions> options)
    : ProcessGroup(rank, world_size, place, gid),
      _tag(0),
      _store(new GlooStore(store)) {
  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
  auto prefix_store =
      ::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store);
  _context->connectFullMesh(prefix_store, options->device);
}

class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    int rank,
                    int root,
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
        _context(context),
        _root(root),
        _inputs(inputs),
        _outputs(outputs),
        _tag(tag) {}

  void Run() override { _do_broadcast(_inputs[0], _outputs[0]); }

 private:
  std::shared_ptr<gloo::Context> _context;
  const int _root;
  std::vector<phi::DenseTensor> _inputs{};
  std::vector<phi::DenseTensor> _outputs{};
  const uint32_t _tag;

  void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) {  // NOLINT
    gloo::BroadcastOptions opts(_context);
    const auto& dtype = in.dtype();
    if (rank_ == _root) {
      GENERATE_FUNC(dtype, set_input, opts, in);
    }
    GENERATE_FUNC(dtype, set_output, opts, out);
    opts.setRoot(_root);
    opts.setTag(_tag);
    gloo::broadcast(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const BroadcastOptions& opts) {
  auto root = opts.source_rank;
  std::unique_ptr<BroadcastGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_unique<BroadcastGlooTask>(
      context, inputs, outputs, rank_, root, tag);
  task->Run();
  return task;
}

class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  AllreduceGlooTask(int rank,
                    const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    ReduceOp reduce_op,
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _reduce_op(reduce_op),
        _tag(tag) {}

  void Run() override { _do_allreduce(_inputs, _outputs); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  const ReduceOp _reduce_op;
  uint32_t _tag;

  gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
                                             const ReduceOp op) {
    gloo::AllreduceOptions::Func fn;
    GENERATE_FUNC(type, _get_function_impl, fn, op);
    return fn;
  }

  template <typename T>
  void _get_function_impl(gloo::AllreduceOptions::Func& fn,  // NOLINT
                          const ReduceOp op) {
    fn = get_function<T>(op);
  }

  void _do_allreduce(std::vector<phi::DenseTensor>& ins,     // NOLINT
                     std::vector<phi::DenseTensor>& outs) {  // NOLINT
    const auto& dtype = ins[0].dtype();
    gloo::AllreduceOptions opts(_context);
    GENERATE_FUNC(dtype, set_inputs, opts, ins);
    GENERATE_FUNC(dtype, set_outputs, opts, outs);
    opts.setReduceFunction(_get_function(dtype, _reduce_op));
    opts.setTag(_tag);
    gloo::allreduce(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const AllreduceOptions& opts) {
  return AllReduce(inputs, outputs, opts, true);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const AllreduceOptions& opts,
    bool sync_op) {
  auto tag = next_tag();
  std::shared_ptr<GlooTask> task;
  auto context = get_context();
  task = std::make_shared<AllreduceGlooTask>(
      rank_, context, inputs, outputs, opts.reduce_op, tag);
  task->Run();
  return task;
}

class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
      : ProcessGroupGloo::GlooTask(
            rank, std::vector<phi::DenseTensor>{}, CommType::BARRIER),
        _context(context) {}

  void Run() override { _do_barrier(); }

 private:
  std::shared_ptr<gloo::Context> _context;

  void _do_barrier() {
    gloo::BarrierOptions opts(_context);
    gloo::barrier(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
    const BarrierOptions& opts) {
  std::shared_ptr<BarrierGlooTask> task;
  auto context = get_context();
  task = std::make_shared<BarrierGlooTask>(rank_, context);
  task->Run();
  return task;
}

class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  AllgatherGlooTask(int rank,
                    const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _tag(tag) {}

  void Run() override { _do_allgather(_inputs, _outputs); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  uint32_t _tag;

  void _do_allgather(std::vector<phi::DenseTensor>& in,     // NOLINT
                     std::vector<phi::DenseTensor>& out) {  // NOLINT
    const auto& dtype = in[0].dtype();
    gloo::AllgatherOptions opts(_context);
    GENERATE_FUNC(dtype, set_input, opts, in[0]);
    GENERATE_FUNC(dtype, set_output, opts, out[0]);
    opts.setTag(_tag);
    gloo::allgather(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  std::shared_ptr<AllgatherGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<AllgatherGlooTask>(
      rank_, context, in_tensors, out_tensors, tag);
  task->Run();
  return task;
}

class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  ReduceGlooTask(int rank,
                 const std::shared_ptr<gloo::Context>& context,
                 std::vector<phi::DenseTensor>& inputs,   // NOLINT
                 std::vector<phi::DenseTensor>& outputs,  // NOLINT
                 ReduceOp reduce_op,
                 int dst,
                 uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _reduce_op(reduce_op),
        _dst(dst),
        _tag(tag) {}

  void Run() override { _do_reduce(_inputs, _outputs, _dst); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  const ReduceOp _reduce_op;
  int _dst;
  uint32_t _tag;

  gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
                                          const ReduceOp op) {
    gloo::ReduceOptions::Func fn;
    GENERATE_FUNC(type, _get_function_impl, fn, op);
    return fn;
  }

  template <typename T>
  void _get_function_impl(gloo::ReduceOptions::Func& fn,  // NOLINT
                          const ReduceOp op) {
    fn = get_function<T>(op);
  }

  void _do_reduce(std::vector<phi::DenseTensor>& inputs,   // NOLINT
                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
                  int dst) {
    const auto& dtype = inputs[0].dtype();
    gloo::ReduceOptions opts(_context);
    GENERATE_FUNC(dtype, set_input, opts, inputs[0]);
    GENERATE_FUNC(dtype, set_output, opts, outputs[0]);
    opts.setReduceFunction(_get_function(dtype, _reduce_op));
    opts.setTag(_tag);
    opts.setRoot(dst);
    gloo::reduce(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const ReduceOptions& opts) {
  std::shared_ptr<ReduceGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<ReduceGlooTask>(
      rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
  task->Run();
  return task;
}

class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  ScatterGlooTask(int rank,
                  const std::shared_ptr<gloo::Context>& context,
                  std::vector<phi::DenseTensor>& inputs,   // NOLINT
                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
                  int src,
                  int size,
                  uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _src(src),
        _size(size),
        _tag(tag) {}

  void Run() override { _do_scatter(_inputs, _outputs, _src); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  int _src;
  int _size;
  uint32_t _tag;

  void _do_scatter(std::vector<phi::DenseTensor>& in,   // NOLINT
                   std::vector<phi::DenseTensor>& out,  // NOLINT
                   int src) {
    const auto& dtype = in[0].dtype();
    gloo::ScatterOptions opts(_context);
    if (rank_ == src) {
      GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size);
    }
    GENERATE_FUNC(dtype, set_output, opts, out[0]);
    opts.setRoot(src);
    opts.setTag(_tag);
    gloo::scatter(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  std::shared_ptr<ScatterGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<ScatterGlooTask>(
      rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
  task->Run();
  return task;
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
  ::gloo::transport::tcp::attr attr;
  attr.iface = ifname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
  ::gloo::transport::tcp::attr attr;
  attr.hostname = hostname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
  std::array<char, HOST_NAME_MAX> hostname{};
  auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
  PADDLE_ENFORCE_EQ(
      ret,
      0,
      platform::errors::Fatal("Get hostname error for createDefaultDevice."));
  ::addrinfo* result;
  result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
  ::addrinfo* cur;
  for (cur = result; cur != nullptr; cur = cur->ai_next) {
    SocketType socket =
        ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
    if (socket == -1) {
      continue;
    }
    ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
    closesocket(socket);
#else
    close(socket);
#endif
    if (ret == -1) {
      continue;
    }
    break;
  }
  freeaddrinfo(result);
  if (cur != nullptr) {
    return createDeviceForHostname(hostname.data());
  }
  return createDeviceForHostname("127.0.0.1");
}

}  // namespace distributed
}  // namespace paddle
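
Editorial note: a hedged sketch (not part of this commit) of picking a transport device with the factory helpers defined above. It assumes they are declared as static members in ProcessGroupGloo.h, which is not shown on this page, and the environment-variable name is an illustrative assumption rather than something this file defines.

#include <cstdlib>
#include <memory>
#include <string>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"

std::shared_ptr<::gloo::transport::Device> PickGlooDeviceForExample() {
  const char* ifname = std::getenv("EXAMPLE_GLOO_IFNAME");  // hypothetical knob
  if (ifname != nullptr && *ifname != '\0') {
    // Bind the TCP transport to an explicitly named network interface.
    return paddle::distributed::ProcessGroupGloo::createDeviceForInterface(
        ifname);
  }
  // Otherwise resolve the local hostname, falling back to 127.0.0.1.
  return paddle::distributed::ProcessGroupGloo::createDefaultDevice();
}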