Commit f0ef3442 authored by yuguo960516yuguo

2.3.2-dtk-22.10.1

parent ad08b8ce
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <sstream>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/auto_parallel/dist_attr.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
TEST(DistAttr, ctor) {
ProgramDesc program;
auto* global_block = program.MutableBlock(0);
auto* x = global_block->Var("X");
x->SetType(framework::proto::VarType::LOD_TENSOR);
x->SetLoDLevel(0);
x->SetDataType(framework::proto::VarType::FP32);
x->SetShape({1000, 784});
auto* y = global_block->Var("Y");
y->SetType(framework::proto::VarType::LOD_TENSOR);
y->SetLoDLevel(0);
y->SetDataType(framework::proto::VarType::FP32);
y->SetShape({784, 100});
auto* op = global_block->AppendOp();
op->SetType("mul");
op->SetInput("X", {x->Name()});
op->SetInput("Y", {y->Name()});
auto* out = global_block->Var("Out");
out->SetType(framework::proto::VarType::LOD_TENSOR);
out->SetShape({1000, 100});
op->SetOutput("Out", {out->Name()});
std::vector<int64_t> shape = {2, 4};
std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5, 6, 7};
std::vector<std::string> dim_names = {"x", "y"};
ProcessMesh process_mesh(shape, process_ids, dim_names);
std::vector<int64_t> shape2 = {2, 2};
std::vector<int64_t> process_ids2 = {0, 1, 2, 3};
std::vector<std::string> dim_names2 = {"a", "b"};
ProcessMesh process_mesh2(shape2, process_ids2, dim_names2);
TensorDistAttr x_dist_attr(*x), y_dist_attr(*y), out_dist_attr(*out);
x_dist_attr.set_process_mesh(process_mesh);
x_dist_attr.set_dims_mapping(std::vector<int64_t>({0, -1}));
x_dist_attr.set_batch_dim(0);
x_dist_attr.set_dynamic_dims(std::vector<bool>({true, false}));
x_dist_attr.annotate("process_mesh");
x_dist_attr.annotate("dims_mapping");
EXPECT_EQ(x_dist_attr.process_mesh(), process_mesh);
EXPECT_EQ(x_dist_attr.dims_mapping(), std::vector<int64_t>({0, -1}));
EXPECT_EQ(x_dist_attr.batch_dim(), 0);
EXPECT_EQ(x_dist_attr.dynamic_dims(), std::vector<bool>({true, false}));
EXPECT_EQ(x_dist_attr.is_annotated("process_mesh"), true);
EXPECT_EQ(x_dist_attr.is_annotated("dims_mapping"), true);
EXPECT_EQ(x_dist_attr.verify(), true);
std::stringstream x_sstream;
x_sstream << x_dist_attr;
EXPECT_EQ(x_sstream.str(), x_dist_attr.to_string());
auto x_proto = x_dist_attr.to_proto();
TensorDistAttr new_x_dist_attr = TensorDistAttr::from_proto(x_proto);
EXPECT_EQ(x_dist_attr, new_x_dist_attr);
// new_x_dist_attr is not valid since it does not bind to a var_desc
EXPECT_EQ(new_x_dist_attr.verify(), false);
y_dist_attr.set_process_mesh(process_mesh);
y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, 0}));
y_dist_attr.set_batch_dim(-1);
y_dist_attr.set_dynamic_dims(std::vector<bool>({false, true}));
x_dist_attr.annotate("batch_dim");
x_dist_attr.annotate("dynamic_dims");
EXPECT_EQ(y_dist_attr.process_mesh(), process_mesh);
EXPECT_EQ(y_dist_attr.dims_mapping(), std::vector<int64_t>({-1, 0}));
EXPECT_EQ(y_dist_attr.batch_dim(), 1);
EXPECT_EQ(y_dist_attr.dynamic_dims(), std::vector<bool>({false, true}));
EXPECT_EQ(x_dist_attr.is_annotated("batch_dim"), true);
EXPECT_EQ(x_dist_attr.is_annotated("dynamic_dims"), true);
EXPECT_EQ(x_dist_attr.verify(), true);
out_dist_attr.set_process_mesh(process_mesh);
out_dist_attr.set_dims_mapping(std::vector<int64_t>({0, 1}));
out_dist_attr.set_batch_dim(1);
out_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
EXPECT_EQ(out_dist_attr.process_mesh(), process_mesh);
EXPECT_EQ(out_dist_attr.dims_mapping(), std::vector<int64_t>({0, 1}));
EXPECT_EQ(out_dist_attr.batch_dim(), 1);
EXPECT_EQ(out_dist_attr.dynamic_dims(), std::vector<bool>({false, false}));
EXPECT_EQ(out_dist_attr.verify(), true);
OperatorDistAttr mul_dist_attr(*op);
mul_dist_attr.set_input_dist_attr(x->Name(), x_dist_attr);
mul_dist_attr.set_input_dist_attr(y->Name(), y_dist_attr);
mul_dist_attr.set_output_dist_attr(out->Name(), out_dist_attr);
mul_dist_attr.set_process_mesh(process_mesh2);
mul_dist_attr.set_impl_type("dist_mul");
mul_dist_attr.set_impl_idx(0);
mul_dist_attr.annotate("process_mesh");
mul_dist_attr.annotate("impl_type");
mul_dist_attr.annotate("impl_idx");
EXPECT_NE(mul_dist_attr.input_dist_attr(x->Name()), x_dist_attr);
EXPECT_NE(mul_dist_attr.input_dist_attr(y->Name()), y_dist_attr);
EXPECT_NE(mul_dist_attr.output_dist_attr(out->Name()), out_dist_attr);
EXPECT_EQ(mul_dist_attr.process_mesh(), process_mesh2);
EXPECT_EQ(mul_dist_attr.input_dist_attr(x->Name()).process_mesh(),
process_mesh2);
EXPECT_EQ(mul_dist_attr.input_dist_attr(y->Name()).process_mesh(),
process_mesh2);
EXPECT_EQ(mul_dist_attr.impl_type(), "dist_mul");
EXPECT_EQ(mul_dist_attr.impl_idx(), 0);
EXPECT_EQ(mul_dist_attr.is_annotated("process_mesh"), true);
EXPECT_EQ(mul_dist_attr.is_annotated("impl_type"), true);
EXPECT_EQ(mul_dist_attr.is_annotated("impl_idx"), true);
EXPECT_EQ(mul_dist_attr.verify(), true);
std::stringstream mul_sstream;
mul_sstream << mul_dist_attr;
EXPECT_EQ(mul_sstream.str(), mul_dist_attr.to_string());
auto mul_proto = mul_dist_attr.to_proto();
OperatorDistAttr new_mul_dist_attr = OperatorDistAttr::from_proto(mul_proto);
EXPECT_EQ(mul_dist_attr, new_mul_dist_attr);
// new_mul_dist_attr is not valid since it does not bind to an op_desc
EXPECT_EQ(new_mul_dist_attr.verify(), false);
}
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
#include <map>
#include <sstream>
#include "gtest/gtest.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
TEST(DistributedMapper, Ctor) {
std::vector<int64_t> shape = {2, 3};
std::vector<int64_t> device_ids = {0, 1, 2, 3, 4, 5};
std::vector<std::string> dim_names = {"x", "y"};
std::string device_type = "GPU";
int64_t size = shape[0] * shape[1];
DeviceMesh device_mesh("device_mesh", shape, device_ids, dim_names);
for (int64_t i = 0; i < shape[0]; ++i) {
for (int64_t j = 0; j < shape[1]; ++j) {
int64_t global_id = i * shape[1] + j;
int64_t local_id = j;
int64_t machine_id = i;
device_mesh.add_device(
Device(global_id, local_id, machine_id, device_type));
}
}
for (int64_t i = 0; i < size; ++i) {
for (int64_t j = 0; j < size; ++j) {
device_mesh.add_link(Link(i, j, "NVL"));
}
}
DistributedMapper dist_mapper;
dist_mapper.add_device_mesh(device_mesh);
std::map<int64_t, std::pair<std::string, std::vector<int64_t>>>
process_id_to_device_ids;
process_id_to_device_ids[0] = {"device_mesh", {5}};
process_id_to_device_ids[1] = {"device_mesh", {4}};
process_id_to_device_ids[2] = {"device_mesh", {3}};
process_id_to_device_ids[3] = {"device_mesh", {2}};
process_id_to_device_ids[4] = {"device_mesh", {1}};
process_id_to_device_ids[5] = {"device_mesh", {0}};
dist_mapper.set_process_id_to_device_ids(process_id_to_device_ids);
EXPECT_EQ(dist_mapper.device_meshes().at("device_mesh"), device_mesh);
EXPECT_EQ(dist_mapper.device_mesh("device_mesh"), device_mesh);
EXPECT_EQ(dist_mapper.process_id_to_device_ids(), process_id_to_device_ids);
std::stringstream sstream;
sstream << dist_mapper;
EXPECT_EQ(sstream.str(), dist_mapper.to_string());
auto proto = dist_mapper.to_proto();
DistributedMapper new_dist_mapper = DistributedMapper::from_proto(proto);
EXPECT_EQ(dist_mapper, new_dist_mapper);
}
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
#include <iostream>
#include <sstream>
#include "gtest/gtest.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
TEST(ProcessMesh, Ctor) {
std::vector<int64_t> shape = {2, 3};
std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
std::vector<std::string> dim_names = {"x", "y"};
int64_t size = shape[0] * shape[1];
ProcessMesh process_mesh(shape, process_ids, dim_names);
EXPECT_EQ(process_mesh.shape(), shape);
EXPECT_EQ(process_mesh.process_ids(), process_ids);
EXPECT_EQ(process_mesh.dim_names()[0], "x");
EXPECT_EQ(process_mesh.dim_names()[1], "y");
EXPECT_EQ(process_mesh.size(), size);
EXPECT_EQ(process_mesh.ndim(), static_cast<int64_t>(shape.size()));
EXPECT_EQ(process_mesh.dim_size(0), shape[0]);
EXPECT_EQ(process_mesh.dim_size(-1), shape[1]);
EXPECT_EQ(process_mesh.dim_size("x"), shape[0]);
EXPECT_EQ(process_mesh.dim_size("y"), shape[1]);
EXPECT_EQ(process_mesh.empty(), false);
EXPECT_EQ(process_mesh.contains(0), true);
EXPECT_EQ(process_mesh.contains(6), false);
std::stringstream sstream;
sstream << process_mesh;
EXPECT_EQ(sstream.str(), process_mesh.to_string());
auto proto = process_mesh.to_proto();
ProcessMesh new_process_mesh = ProcessMesh::from_proto(proto);
EXPECT_EQ(process_mesh, new_process_mesh);
}
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
namespace auto_parallel {
// struct Indent {
// Indent(int &level) : level(level) { ++level; }
// ~Indent() { --level; }
// int &level;
// };
// inline std::string str_indent(std::string& str, cur_indent) {
// string spaces(cur_indent, " ");
// return str + std::string(cur_indent, " ");
// }
template <class T>
bool has_duplicates(const std::vector<T>& vec) {
std::unordered_map<T, int> map;
for (const auto& i : vec) {
++map[i];
if (map[i] > 1) return true;
}
return false;
}
inline int64_t canonical_dim(int dim, int ndim) {
PADDLE_ENFORCE_EQ(
dim >= -ndim && dim < ndim,
true,
platform::errors::InvalidArgument(
"Dimension %d is outside of [-%d, %d).", dim, ndim, ndim));
if (dim < 0) {
return dim + ndim;
}
return dim;
}
// Refer to https://stackoverflow.com/a/5289170
template <typename Range, typename Value = typename Range::value_type>
std::string str_join(Range const& elements,
const std::string& delimiter = ",") {
std::ostringstream os;
auto b = std::begin(elements), e = std::end(elements);
if (b != e) {
std::copy(b, prev(e), std::ostream_iterator<Value>(os, delimiter.c_str()));
b = prev(e);
}
if (b != e) {
os << *b;
}
return os.str();
}
inline std::string str_join(std::map<std::string, bool> const& elements,
const std::string& delimiter = ",") {
std::string str;
for (const auto& item : elements) {
str += item.first + ": " + std::to_string(item.second) + delimiter;
}
// Drop only the trailing delimiter, keeping the last value intact.
return str.substr(0, str.size() - delimiter.size());
}
// Refer to https://stackoverflow.com/a/46931770
inline std::vector<std::string> str_split(std::string const& input,
const std::string& delimiter = ",") {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
std::string token;
std::vector<std::string> output;
while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) {
token = input.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
output.push_back(token);
}
output.push_back(input.substr(pos_start));
return output;
}
// Refer to https://stackoverflow.com/a/29200671/2358969
template <typename T>
std::string to_string_with_precision(const T a_value, const int n = 2) {
std::ostringstream out;
out.precision(n);
out << std::fixed << a_value;
return out.str();
}
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
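A minimal usage sketch for the string helpers above (illustrative only, not part of this commit; it assumes the header is available as paddle/fluid/distributed/auto_parallel/utils.h):

#include <cassert>
#include <cstdint>
#include <vector>

#include "paddle/fluid/distributed/auto_parallel/utils.h"

int main() {
  using namespace paddle::distributed::auto_parallel;
  std::vector<int64_t> dims = {2, 4, 8};
  assert(str_join(dims) == "2,4,8");       // joins with "," by default
  assert(str_split("2,4,8").size() == 3);  // splits back into three tokens
  assert(canonical_dim(-1, 3) == 2);       // negative dims wrap into [0, ndim)
  assert(!has_duplicates(dims));           // all elements are unique
  return 0;
}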
cc_library(
processgroup
SRCS ProcessGroup.cc
DEPS dense_tensor)
cc_library(
processgroup_stream
SRCS ProcessGroupStream.cc
DEPS dense_tensor)
cc_library(
eager_reducer
SRCS reducer.cc
DEPS eager_api processgroup processgroup_stream phi_api string_helper)
if(WITH_DISTRIBUTE)
cc_library(
processgroup_gloo
SRCS ProcessGroupGloo.cc
DEPS phi_api eager_api gloo_wrapper)
endif()
if(WITH_NCCL OR WITH_RCCL)
cc_library(
processgroup_nccl
SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
DEPS processgroup
processgroup_stream
place
enforce
collective_helper
device_context
dense_tensor)
if(WITH_DISTRIBUTE AND WITH_PSCORE)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
set_source_files_properties(
ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
endif()
cc_library(
processgroup_heter
SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
DEPS place enforce collective_helper device_context phi_api eager_api)
endif()
endif()
if(WITH_MPI)
cc_library(
processgroup_mpi
SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
DEPS collective_helper device_context)
endif()
if(WITH_ASCEND_CL)
cc_library(
processgroup_hccl
SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
DEPS place
npu_stream
enforce
collective_helper
device_context
phi_api
eager_api)
if(WITH_DISTRIBUTE AND WITH_PSCORE)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
set_source_files_properties(
ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
${DISTRIBUTE_COMPILE_FLAGS})
endif()
cc_library(
processgroup_heter
SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
DEPS place
npu_stream
enforce
collective_helper
device_context
phi_api
eager_api)
endif()
endif()
if(WITH_CUSTOM_DEVICE)
cc_library(
processgroup_custom
SRCS ProcessGroupCustom.cc CustomCCLTools.cc Common.cc
DEPS phi_backends
place
enforce
collective_helper
device_context
phi_api
eager_api)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/Common.h"
namespace paddle {
namespace distributed {
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors) {
std::vector<Place> places;
places.reserve(tensors.size());
for (auto& tensor : tensors) {
places.push_back(tensor.place());
}
return places;
}
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
std::string placeList;
for (auto& place : places) {
std::stringstream tmp;
tmp << place;
if (placeList.empty()) {
placeList += tmp.str();
} else {
placeList += "," + tmp.str();
}
}
return placeList;
}
bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors) {
return std::all_of(
tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
return platform::is_gpu_place(t.place());
});
}
bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
const std::string& dev_type) {
return std::all_of(
tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
return platform::places_are_same_class(
t.place(), paddle::platform::CustomPlace(dev_type));
});
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
// Get the list of devices from list of tensors
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);
// Get the deviceList String from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places);
bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);
bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
const std::string& dev_type);
} // namespace distributed
} // namespace paddle
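A small, hypothetical sketch of how the helpers declared above could be used; the exact key text comes from Place's operator<< and is not asserted here:

#include <iostream>
#include <vector>

#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/place.h"

int main() {
  using paddle::distributed::GetKeyFromPlaces;
  // Two GPU places standing in for the devices taking part in a collective.
  std::vector<paddle::distributed::Place> places = {
      paddle::platform::CUDAPlace(0), paddle::platform::CUDAPlace(1)};
  // One comma-separated key per unique set of participating devices.
  std::cout << GetKeyFromPlaces(places) << std::endl;
  return 0;
}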
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, phi::ccl::CCLReduceOp> red_type = {
{ReduceOp::MIN, phi::ccl::CCLReduceOp::MIN},
{ReduceOp::MAX, phi::ccl::CCLReduceOp::MAX},
{ReduceOp::SUM, phi::ccl::CCLReduceOp::SUM},
{ReduceOp::PRODUCT, phi::ccl::CCLReduceOp::PRODUCT},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(
it != red_type.end(),
true,
platform::errors::InvalidArgument("Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"));
return it->second;
}
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id) {
const uint8_t* bytes = ccl_id.data();
std::ostringstream oss;
for (size_t i = 0; i < ccl_id.size(); ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
namespace paddle {
namespace distributed {
class CustomEventManager {
public:
CustomEventManager() = default;
~CustomEventManager() {
if (is_created_) {
event_->Destroy();
}
}
CustomEventManager(const CustomEventManager&) = delete;
CustomEventManager& operator=(const CustomEventManager&) = delete;
CustomEventManager(CustomEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(device_type_, other.device_type_);
std::swap(event_, other.event_);
}
CustomEventManager& operator=(CustomEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(device_type_, other.device_type_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
std::string DeviceType() const { return device_type_; }
phi::event::event_t GetRawCustomEvent() const { return event_->raw_event(); }
phi::event::Event* GetCustomEvent() const { return event_.get(); }
void Record(const paddle::platform::CustomDeviceContext& ctx) {
auto place = ctx.GetPlace();
auto device_type = place.GetDeviceType();
auto device_index = place.GetDeviceId();
if (!is_created_) {
CreateEvent(place);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
PADDLE_ENFORCE_EQ(device_type,
device_type_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device type %d",
device_type,
device_type_));
phi::DeviceGuard guard(place);
phi::stream::Stream stream(place, ctx.stream());
event_->Record(&stream);
}
bool Query() const { return event_->Query(); }
void Block(const paddle::platform::CustomDeviceContext& ctx) const {
if (is_created_) {
auto place = ctx.GetPlace();
auto device_type = place.GetDeviceType();
auto device_index = place.GetDeviceId();
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
PADDLE_ENFORCE_EQ(device_type,
device_type_,
platform::errors::PreconditionNotMet(
"CustomDeviceContext's device %d does not match"
"Event's device type %d",
device_type,
device_type_));
phi::DeviceGuard guard(place);
phi::stream::Stream stream(place, ctx.stream());
stream.WaitEvent(event_.get());
}
}
private:
bool is_created_{false};
std::shared_ptr<phi::event::Event> event_{nullptr};
int8_t device_index_{0};
std::string device_type_;
private:
void CreateEvent(const platform::Place& place) {
device_index_ = place.GetDeviceId();
device_type_ = place.GetDeviceType();
event_.reset(new phi::event::Event);
event_->Init(place);
is_created_ = true;
}
};
class CustomCCLCommManager {
public:
CustomCCLCommManager(const std::string& device_type,
phi::ccl::CCLComm ccl_comm)
: device_type_(device_type), ccl_comm_(ccl_comm) {}
CustomCCLCommManager() : CustomCCLCommManager("", nullptr) {}
~CustomCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (ccl_comm_) {
phi::DeviceManager::CCLDestroyComm(device_type_, ccl_comm_);
}
}
static std::shared_ptr<CustomCCLCommManager> Create(
const std::string& device_type,
int num_ranks,
int rank,
phi::ccl::CCLRootId* comm_id,
phi::ccl::CCLComm* ccl_comm) {
auto custom_ccl_manager = std::make_shared<CustomCCLCommManager>();
phi::DeviceManager::CCLCommInitRank(
device_type, num_ranks, comm_id, rank, ccl_comm);
custom_ccl_manager->device_type_ = device_type;
custom_ccl_manager->ccl_id_ = comm_id;
custom_ccl_manager->rank_ = rank;
custom_ccl_manager->ccl_comm_ = *ccl_comm;
return custom_ccl_manager;
}
phi::ccl::CCLRootId* GetCustomCCLId() const {
std::unique_lock<std::mutex> lock(mutex_);
return ccl_id_;
}
phi::ccl::CCLComm GetCustomCCLComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return ccl_comm_;
}
CustomCCLCommManager(const CustomCCLCommManager&) = delete;
CustomCCLCommManager& operator=(const CustomCCLCommManager&) = delete;
CustomCCLCommManager& operator=(CustomCCLCommManager&& other) = delete;
CustomCCLCommManager(CustomCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(ccl_comm_, other.ccl_comm_);
}
protected:
std::string device_type_;
phi::ccl::CCLComm ccl_comm_;
phi::ccl::CCLRootId* ccl_id_;
int rank_;
mutable std::mutex mutex_;
};
phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction);
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, HcclReduceOp> red_type = {
{ReduceOp::MIN, HCCL_REDUCE_MIN},
{ReduceOp::MAX, HCCL_REDUCE_MAX},
{ReduceOp::SUM, HCCL_REDUCE_SUM},
{ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(
it != red_type.end(),
true,
platform::errors::InvalidArgument("Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"));
return it->second;
}
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
std::ostringstream oss;
for (size_t i = 0; i < sizeof(hcclID); ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
namespace paddle {
namespace distributed {
class NPUEventManager {
public:
NPUEventManager() = default;
~NPUEventManager() {
if (is_created_) {
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventDestroy(event_);
}
}
NPUEventManager(const NPUEventManager&) = delete;
NPUEventManager& operator=(const NPUEventManager&) = delete;
NPUEventManager(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
NPUEventManager& operator=(NPUEventManager&& other) {
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
aclrtEvent GetRawNPUEvent() const { return event_; }
void Record(const paddle::platform::NPUDeviceContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"NPUDeviceContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUEventRecord(event_, ctx.stream());
}
bool Query() const {
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event_, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
return true;
}
return false;
}
void Block(const paddle::platform::NPUDeviceContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::NPUDeviceGuard guard(device_index_);
platform::NPUStreamWaitEvent(ctx.stream(), event_);
}
}
private:
bool is_created_{false};
aclrtEvent event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::NPUDeviceGuard guard(device_index);
platform::NPUEventCreate(&event_);
is_created_ = true;
}
};
class HCCLCommManager {
public:
explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
HCCLCommManager() : HCCLCommManager(nullptr) {}
~HCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (hccl_comm_) {
platform::dynload::HcclCommDestroy(hccl_comm_);
}
}
static std::shared_ptr<HCCLCommManager> Create(int num_ranks,
int rank,
HcclRootInfo* comm_id,
HcclComm hccl_comm) {
auto hccl_manager = std::make_shared<HCCLCommManager>();
auto ret = platform::dynload::HcclCommInitRootInfo(
num_ranks, comm_id, rank, &hccl_comm);
using __NPU_STATUS_TYPE__ = decltype(ret);
constexpr auto __success_type__ =
platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
if (UNLIKELY(ret != __success_type__)) {
VLOG(0) << "Error: create hccl_id error.";
exit(-1);
}
hccl_manager->hccl_id_ = comm_id;
hccl_manager->rank_ = rank;
hccl_manager->hccl_comm_ = hccl_comm;
return hccl_manager;
}
HcclRootInfo* GetHcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_id_;
}
HcclComm GetHcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return hccl_comm_;
}
HCCLCommManager(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(const HCCLCommManager&) = delete;
HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
HCCLCommManager(HCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(hccl_comm_, other.hccl_comm_);
}
protected:
HcclComm hccl_comm_;
HcclRootInfo* hccl_id_;
int rank_;
mutable std::mutex mutex_;
};
HcclReduceOp ToHCCLRedType(ReduceOp reduction);
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID);
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/MPITools.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
namespace mpi {
MPI_Op ToMPIType(ReduceOp reduction) {
static const std::map<ReduceOp, MPI_Op> red_type = {
{ReduceOp::MIN, MPI_MIN},
{ReduceOp::MAX, MPI_MAX},
{ReduceOp::SUM, MPI_SUM},
{ReduceOp::PRODUCT, MPI_PROD},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(it != red_type.end(),
true,
platform::errors::InvalidArgument(
"Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
"MPI_PROD | MPI_SUM."));
return it->second;
}
// NOTE: MPI does not support CUDA-aware communication for now.
bool CheckMpiCudaAware() { return false; }
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors) {
PADDLE_ENFORCE_EQ(
tensors.size() == 1,
true,
platform::errors::InvalidArgument("the inputs size of MPI must be 1!"));
PADDLE_ENFORCE_EQ(CheckTensorsInCudaPlace(tensors) && !CheckMpiCudaAware(),
false,
platform::errors::InvalidArgument(
"Found CUDA Tensor. But CUDA-aware MPI not support!"));
}
} // namespace mpi
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <error.h>
#include <iostream>
#include <string>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/distributed/collective/Types.h"
#ifdef HOST
#undef HOST
#endif
#include <mpi.h>
namespace paddle {
namespace distributed {
namespace mpi {
#define MPI_CHECK(cmd) \
do { \
int r = cmd; \
if (r != MPI_SUCCESS) { \
LOG(FATAL) << "Failed, MPI error in" << __FILE__ << ":" << __LINE__ \
<< "with error code: " << std::to_string(r) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
MPI_Op ToMPIType(ReduceOp reduction);
bool CheckMpiCudaAware();
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);
} // namespace mpi
} // namespace distributed
} // namespace paddle
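An illustrative, stand-alone sketch of the intended MPI_CHECK / ToMPIType usage (hypothetical program, not part of this commit; it assumes an MPI-enabled build):

#include <mpi.h>

#include "paddle/fluid/distributed/collective/MPITools.h"
#include "paddle/fluid/distributed/collective/Types.h"

int main(int argc, char** argv) {
  namespace mpi = paddle::distributed::mpi;
  MPI_CHECK(MPI_Init(&argc, &argv));  // aborts with file/line info on failure

  int rank = 0;
  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));

  // ReduceOp::SUM is translated to MPI_SUM by ToMPIType.
  int local = rank, global = 0;
  MPI_CHECK(MPI_Allreduce(&local, &global, 1, MPI_INT,
                          mpi::ToMPIType(paddle::distributed::ReduceOp::SUM),
                          MPI_COMM_WORLD));

  MPI_CHECK(MPI_Finalize());
  return 0;
}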
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace paddle {
namespace distributed {
ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, ncclRedOp_t> red_type = {
{ReduceOp::MIN, ncclMin},
{ReduceOp::MAX, ncclMax},
{ReduceOp::SUM, ncclSum},
{ReduceOp::PRODUCT, ncclProd},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(it != red_type.end(),
true,
platform::errors::InvalidArgument(
"Invalid nccl reduction. Must be ncclMin | ncclMax | "
"ncclProd | ncclSum"));
return it->second;
}
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
std::ostringstream oss;
for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <error.h>
#include <string>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
namespace paddle {
namespace distributed {
#define NCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
platform::dynload::ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// NOTE(shenliang03): EventManager is a movable but not copyable wrapper
// around a CUDA event. It differs from paddle::platform::CudaEvent in that
// it is lazily initialized: the underlying event is only created the first
// time Record() is called. It also tracks device information to ensure that
// the recorded stream and the event are on the same device.
class EventManager {
public:
EventManager() {}
explicit EventManager(unsigned int flags) : flags_{flags} {}
~EventManager() {
if (is_created_) {
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
}
EventManager(const EventManager&) = delete;
EventManager& operator=(const EventManager&) = delete;
EventManager(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
EventManager& operator=(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
gpuEvent_t GetRawCudaEvent() const { return event_; }
void Record(const phi::GPUContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
#endif
}
bool Query() const {
#ifdef PADDLE_WITH_HIP
gpuError_t err = hipEventQuery(event_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif
PADDLE_ENFORCE_GPU_SUCCESS(err);
return false;
}
void Synchronize() const {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
}
}
void Block(const phi::GPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"phi::GPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
#endif
}
}
private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDefault;
#else
unsigned int flags_ = cudaEventDefault;
#endif
bool is_created_{false};
gpuEvent_t event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::CUDADeviceGuard guard(device_index);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
#endif
is_created_ = true;
}
};
// NOTE(shenliang03): NCCLCommManager is more lightweight than
// platform::NCCLComm
class NCCLCommManager {
public:
explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}
NCCLCommManager() : NCCLCommManager(nullptr) {}
~NCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (nccl_comm_) {
platform::dynload::ncclCommDestroy(nccl_comm_);
}
}
static std::shared_ptr<NCCLCommManager> Create(int num_ranks,
int rank,
ncclUniqueId comm_id) {
auto nccl_manager = std::make_shared<NCCLCommManager>();
NCCLCHECK(platform::dynload::ncclCommInitRank(
&(nccl_manager->nccl_comm_), num_ranks, comm_id, rank));
nccl_manager->nccl_id_ = comm_id;
nccl_manager->rank_ = rank;
return nccl_manager;
}
ncclUniqueId GetNcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_id_;
}
ncclComm_t GetNcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_comm_;
}
NCCLCommManager(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(NCCLCommManager&& other) = delete;
NCCLCommManager(NCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(nccl_comm_, other.nccl_comm_);
}
protected:
ncclComm_t nccl_comm_;
ncclUniqueId nccl_id_;
int rank_;
mutable std::mutex mutex_;
};
ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);
} // namespace distributed
} // namespace paddle
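A hedged sketch of the Record/Block pattern the note above describes (hypothetical helper, not part of this commit; it assumes a CUDA/HIP build and two phi::GPUContext instances on the same device, e.g. a compute stream and a communication stream):

#include "paddle/fluid/distributed/collective/NCCLTools.h"

namespace paddle {
namespace distributed {

// Make the communication stream wait for work already enqueued on the
// compute stream, without blocking the host.
void WaitComputeOnCommStream(const phi::GPUContext& compute_ctx,
                             const phi::GPUContext& comm_ctx) {
  EventManager event;         // created lazily on the first Record() call
  event.Record(compute_ctx);  // capture the compute stream's current progress
  event.Block(comm_ctx);      // comm stream waits for the recorded event
  // event.Query() / event.Synchronize() remain available for host-side
  // polling or a blocking wait.
}

}  // namespace distributed
}  // namespace paddle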
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace paddle {
namespace distributed {
ProcessGroup::Task::Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type)
: rank_(rank), comm_type_(comm_type) {}
ProcessGroup::Task::Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type,
bool sync_op)
: rank_(rank), comm_type_(comm_type), sync_op_(sync_op) {}
ProcessGroup::Task::~Task() = default;
bool ProcessGroup::Task::IsCompleted() {
std::lock_guard<std::mutex> lock(mutex_);
return is_completed_;
}
bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
return false;
}
void ProcessGroup::Task::Synchronize() {}
ProcessGroup::ProcessGroup(int rank,
int size,
const platform::Place& place,
int gid)
: rank_(rank), size_(size), place_(place), gid_(gid) {
if (gid != IGNORE_ID) {
auto map = ProcessGroupMapFromGid::getInstance();
map->insert(gid_, this);
}
}
ProcessGroup::ProcessGroup(int rank, int size, int gid)
: rank_(rank), size_(size), gid_(gid) {
if (gid != IGNORE_ID) {
auto map = ProcessGroupMapFromGid::getInstance();
map->insert(gid_, this);
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
namespace paddle {
namespace distributed {
constexpr int IGNORE_ID = -1;
using Tensor = paddle::experimental::Tensor;
enum class CommType : std::uint8_t {
BROADCAST = 0,
ALLREDUCE = 1,
ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce
REDUCE = 3,
ALLGATHER = 4,
GATHER = 5,
SCATTER = 6,
REDUCE_SCATTER = 7,
ALLTOALL = 8,
SEND = 9,
RECV = 10,
BARRIER = 11,
ALLTOALL_SINGLE = 12,
UNKNOWN = 100,
};
class ProcessGroup {
public:
class Task {
public:
Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type);
Task(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type,
bool sync_op);
virtual ~Task();
virtual bool IsCompleted();
virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
virtual void Synchronize();
bool IsSync() const { return sync_op_; }
protected:
const int rank_;
CommType comm_type_{CommType::UNKNOWN};
std::mutex mutex_;
bool is_completed_{false};
private:
bool sync_op_{true};
};
explicit ProcessGroup(int rank,
int size,
const platform::Place& place,
int gid);
explicit ProcessGroup(int rank, int size, int gid);
virtual ~ProcessGroup() {}
int GetRank() const { return rank_; }
int GetSize() const { return size_; }
virtual const std::string GetBackendName() const = 0;
virtual phi::DeviceContext* GetDeviceContext(const Place& place) const {
PADDLE_THROW(platform::errors::InvalidArgument(
"Does not support to get device_context from ProcessGroup%s.",
GetBackendName()));
}
// TODO(liyurui): This API will be moved later
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const AllreduceOptions& = AllreduceOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const AllreduceOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const BroadcastOptions& = BroadcastOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support broadcast", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const BroadcastOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support broadcast with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support barrier", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>&, int) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>&, int, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>&, int) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>&, int, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor&, // NOLINT
int,
int64_t,
int64_t) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send_partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support send_partial with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor&, // NOLINT
int,
int64_t,
int64_t) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv_partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support recv_partial with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support all_gather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support all_gather with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
int64_t offset,
int64_t length) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
int64_t offset,
int64_t length,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support alltoall", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<int64_t>&,
std::vector<int64_t>&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<int64_t>&,
std::vector<int64_t>&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support alltoall_single", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ReduceOptions& opts) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& /* input tensors */, // NOLINT
std::vector<phi::DenseTensor>& /* output tensors */, // NOLINT
const ReduceOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ScatterOptions&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support scatter", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ScatterOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support scatter with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
const ReduceScatterOptions&,
bool) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support reduce_scatter with sync_op flag",
GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support ReduceScatter", GetBackendName()));
}
protected:
const int rank_;
const int size_;
const platform::Place place_;
const int gid_;
};
class ProcessGroupMapFromGid {
public:
bool has(int gid) {
auto it = map_.find(gid);
return it != map_.end();
}
void insert(int gid, ProcessGroup* pg) {
// TODO(sandyhouse): address ut and uncomment the following codes
// PADDLE_ENFORCE_EQ(has(gid), false,
// platform::errors::PreconditionNotMet(
// "The process group with id %d doesnot exist.",
// gid));
map_[gid] = pg;
}
ProcessGroup* get(int gid) {
// TODO(sandyhouse): address ut and uncomment the following codes
// PADDLE_ENFORCE_EQ(has(gid), true,
// platform::errors::PreconditionNotMet(
// "The process group with id %d doesnot exist.",
// gid));
return map_.find(gid)->second;
}
static std::shared_ptr<ProcessGroupMapFromGid> getInstance() {
static auto s_instance = std::make_shared<ProcessGroupMapFromGid>();
return s_instance;
}
ProcessGroupMapFromGid() = default;
~ProcessGroupMapFromGid() = default;
private:
std::unordered_map<int, ProcessGroup*> map_;
};
} // namespace distributed
} // namespace paddle
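A minimal, hypothetical backend illustrating the extension contract of ProcessGroup above: a subclass only overrides what it implements, and every collective it leaves alone falls through to the PADDLE_THROW defaults (sketch only, not part of this commit):

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

namespace paddle {
namespace distributed {

class FakeProcessGroup : public ProcessGroup {
 public:
  FakeProcessGroup(int rank, int size, int gid)
      : ProcessGroup(rank, size, gid) {}

  const std::string GetBackendName() const override { return "FAKE"; }

  // No collectives are overridden here, so calling e.g. AllReduce() on this
  // group raises "ProcessGroupFAKE does not support allreduce".
};

}  // namespace distributed
}  // namespace paddle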
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(xccl_blocking_wait);
constexpr int64_t kWaitBlockTimeout = 10;
namespace paddle {
namespace distributed {
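// Synchronizes the cached communication contexts with the default compute
// streams: for each place an event is recorded on the communication context
// and the place's default context is made to wait on it.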
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<CustomEventManager>& cclEvents, // NOLINT
std::vector<std::unique_ptr<CustomDeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
cclEvents[i].Record(*dev_ctx[i]);
cclEvents[i].Block(*default_ctx);
}
}
std::shared_ptr<ProcessGroupCustom::CustomTask> ProcessGroupCustom::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupCustom::CustomTask>(
places, rank, comm_type, inputs);
}
ProcessGroupCustom::CustomTask::CustomTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
cclComms_.resize(places.size());
}
ProcessGroupCustom::CustomTask::~CustomTask() {}
void ProcessGroupCustom::CustomTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupCustom::CustomTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
phi::DeviceGuard guard(default_ctx->GetPlace());
phi::stream::Stream stream(default_ctx->GetPlace(), default_ctx->stream());
stream.WaitEvent(control_events_[i].GetCustomEvent());
}
}
bool ProcessGroupCustom::CustomTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
while (!IsCompleted()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTimeout));
}
return true;
}
// Same as Wait
void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupCustom::ProcessGroupCustom(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid),
store_(store),
device_type_(place.GetDeviceType()) {
phi::DeviceManager::SetDevice(place_);
}
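// Rank 0 publishes the CCL root ids to the shared store under keys of the
// form "ProcessGroupCustom/ccl_ids/<i>"; all other ranks read them back, so
// every rank holds the same unique ids before the communicators are created.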
void ProcessGroupCustom::BroadcastUniqueCustomID(
std::vector<phi::ccl::CCLRootId>& ccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
store_->set(key, ccl_ids[i]);
}
} else {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
ccl_ids[i] = store_->get(key);
}
}
}
// create CustomCCLManager cache for places_key
void ProcessGroupCustom::CreateCustomManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
platform::errors::PreconditionNotMet(
"Not able to create/get the HCCL Communicator since "
"the NPU place are not known"));
const std::string device_type = places.back().GetDeviceType();
std::vector<std::shared_ptr<CustomCCLCommManager>> ccl_comms;
ccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<phi::ccl::CCLRootId> ccl_ids;
ccl_ids.resize(1);
auto& ccl_id = ccl_ids.front();
if (rank_ == 0) {
phi::DeviceManager::CCLGetUniqueId(device_type, &ccl_id);
}
BroadcastUniqueCustomID(ccl_ids);
VLOG(3) << "init custom ccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", custom ccl uniqueid: " << SerializeCustomCCLUniqueId(ccl_id);
std::vector<std::unique_ptr<CustomDeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
std::unique_ptr<phi::ccl::CCLComm> comms(
new phi::ccl::CCLComm[places.size()]);
for (size_t i = 0; i < places.size(); ++i) {
phi::DeviceGuard guard(places[i]);
ccl_comms[i] = CustomCCLCommManager::Create(
device_type, GetSize(), GetRank(), &ccl_id, comms.get() + i);
dev_ctx[i].reset(new CustomDeviceContext(places[i]));
}
std::vector<CustomEventManager> events;
events.resize(places.size());
// These caches will be useful to process sync/wait/communicate
places_to_events_.emplace(places_key, std::move(events));
places_to_customcomm_.emplace(places_key, std::move(ccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
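// Collective() drives every collective primitive below: it lazily creates the
// communicator cache for the involved places (guarded by mutex_), syncs the
// communication streams with the default streams, then invokes `fn` once per
// input tensor on the matching device and stream, and finally records a
// completion event per place so the returned task can be queried or waited on.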
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_customcomm_.find(key) == places_to_customcomm_.end()) {
CreateCustomManagerCache(key, places);
}
}
auto& ccl_comms = places_to_customcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
task->SetOutputs(outputs);
for (size_t i = 0; i < inputs.size(); ++i) {
phi::DeviceGuard guard(places[i]);
const auto& ccl_stream = places_to_ctx_[key][i]->stream();
phi::stream::Stream stream(places[i], ccl_stream);
fn(inputs[i], outputs[i], ccl_comms[i]->GetCustomCCLComm(), stream);
}
for (size_t i = 0; i < inputs.size(); ++i) {
phi::DeviceGuard guard(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllGather(
device_type_,
input.data(),
output.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
void* XcclGetPointerByOffset(void* raw_pointer,
size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in xccl is not supported."));
}
return nullptr;
}
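// XcclGetPointerByOffset advances a raw device pointer by `offset` elements
// of the given dtype (not bytes). Illustrative sketch (names assumed):
//
//   // advances by 3 * sizeof(float) bytes
//   void* p = XcclGetPointerByOffset(buf, 3, experimental::DataType::FLOAT32);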
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllGather(
device_type_,
XcclGetPointerByOffset(input.data(), offset, input.dtype()),
output.data(),
length,
phi::ccl::ToCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
return phi::DeviceManager::CCLAllReduce(
device_type_,
input.data(),
output.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
ToCustomCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(in_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All inputs should be in CustomPlace(%s).", device_type_));
PADDLE_ENFORCE_EQ(
CheckTensorsInCustomPlace(out_tensors, device_type_),
true,
platform::errors::InvalidArgument(
"All outputs should be in CustomPlace(%s).", device_type_));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
phi::ccl::CCLComm comm,
const phi::stream::Stream& stream) {
int root = opts.source_rank * in_tensors.size() + opts.source_root;
if (rank_ == root) {
return phi::DeviceManager::CCLBroadcast(
device_type_,
input.data(),
input.numel(),
phi::ccl::ToCCLDataType(input.dtype()),
root,
comm,
stream);
} else {
return phi::DeviceManager::CCLBroadcast(
device_type_,
output.data(),
output.numel(),
phi::ccl::ToCCLDataType(output.dtype()),
root,
comm,
stream);
}
},
CommType::BROADCAST);
}
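// Barrier is emulated with an all-reduce over a one-element FP32 tensor:
// every rank contributes a zero-filled tensor, and the collective can only
// complete once all ranks have reached this point.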
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
const BarrierOptions& opts) {
  // Only supports the single-card, single-process case
std::vector<phi::CustomPlace> places = {place_};
std::vector<phi::DenseTensor> barrierTensors;
barrierTensors.reserve(places.size());
for (auto& place : places) {
phi::DeviceGuard guard(place);
auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
barrierTensors.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
}
auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors);
auto xccl_task = dynamic_cast<ProcessGroupCustom::CustomTask*>(task.get());
xccl_task->barrierTensors_ = std::move(barrierTensors);
return task;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using CustomDeviceContext = paddle::platform::CustomDeviceContext;
class ProcessGroupCustom : public ProcessGroup {
public:
class CustomTask : public ProcessGroup::Task,
public std::enable_shared_from_this<CustomTask> {
public:
CustomTask(const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~CustomTask();
std::vector<CustomEventManager> control_events_;
std::vector<phi::DenseTensor> barrierTensors_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<CustomCCLCommManager>> cclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
const std::string device_type_;
};
ProcessGroupCustom(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return "XCCL_" + device_type_;
}
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
protected:
virtual std::shared_ptr<ProcessGroupCustom::CustomTask> CreateTask(
std::vector<Place> places,
int rank,
CommType opType,
const std::vector<phi::DenseTensor>& inputs);
std::shared_ptr<Store> store_;
std::shared_ptr<CustomCCLCommManager> custom_comm_;
std::mutex mutex_;
std::unordered_map<std::string,
std::vector<std::shared_ptr<CustomCCLCommManager>>>
places_to_customcomm_;
std::unordered_map<std::string, std::vector<CustomEventManager>>
places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<CustomDeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastCustomId(std::vector<phi::ccl::CCLRootId>& ccl_ids, // NOLINT
int root,
int server_fd);
void BroadcastUniqueCustomID(
std::vector<phi::ccl::CCLRootId>& custom_ccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
void CreateCustomManagerCache(const std::string& places_key,
const std::vector<Place>& places);
const std::string device_type_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
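// GENERATE_FUNC dispatches a templated callable on a runtime DataType: it
// expands to a switch that calls func<T>(...) with the C++ type matching the
// dtype. The Windows variant covers a smaller set of dtypes.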
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
case experimental::DataType::INT8: \
func<int8_t>(args); \
break; \
case experimental::DataType::UINT8: \
func<uint8_t>(args); \
break; \
case experimental::DataType::BOOL: \
func<bool>(args); \
break; \
case experimental::DataType::BFLOAT16: \
func<bfloat16>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
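// get_function maps a Paddle ReduceOp onto the matching gloo element-wise
// reduction (sum/product/min/max); ReduceOp::AVG has no gloo counterpart here
// and aborts, as does an unknown op.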
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
template <typename T>
T* get_data(phi::DenseTensor& tensor) { // NOLINT
return reinterpret_cast<T*>(tensor.data());
}
template <typename T>
std::vector<T*> get_multi_data(
std::vector<phi::DenseTensor>& tensors) { // NOLINT
std::vector<T*> ret;
ret.reserve(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret.push_back(get_data<T>(tensors[i]));
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, phi::DenseTensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, phi::DenseTensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, // NOLINT
std::vector<phi::DenseTensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, // NOLINT
std::vector<phi::DenseTensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
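// set_inputs_for_scatter splits a single root-side tensor into `nranks`
// contiguous, equally sized chunks and registers them as the scatter inputs,
// so the tensor's element count is expected to be divisible by nranks.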
template <typename T, typename P>
void set_inputs_for_scatter(P& opts, // NOLINT
phi::DenseTensor& tensor, // NOLINT
int nranks) {
std::vector<T*> ret;
ret.reserve(nranks);
T* raw_pointer = reinterpret_cast<T*>(tensor.data());
size_t offset = 0;
for (int i = 0; i < nranks; i++) {
ret.push_back(raw_pointer + offset);
offset += tensor.numel() / nranks;
}
opts.setInputs(ret, tensor.numel() / nranks);
}
ProcessGroupGloo::GlooTask::GlooTask(
int rank, const std::vector<phi::DenseTensor>& inputs, CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {}
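// The gloo context is rendezvoused through the shared store: the group id is
// used as a PrefixStore key so several process groups can share one store,
// and connectFullMesh() sets up full-mesh connections between all ranks over
// the transport device supplied in the options.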
ProcessGroupGloo::ProcessGroupGloo(
const std::shared_ptr<distributed::Store>& store,
int rank,
int world_size,
const platform::Place& place,
int gid,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size, place, gid),
_tag(0),
_store(new GlooStore(store)) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int rank,
int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_outputs(outputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0], _outputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<phi::DenseTensor> _inputs{};
std::vector<phi::DenseTensor> _outputs{};
const uint32_t _tag;
void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) { // NOLINT
gloo::BroadcastOptions opts(_context);
const auto& dtype = in.dtype();
if (rank_ == _root) {
GENERATE_FUNC(dtype, set_input, opts, in);
}
GENERATE_FUNC(dtype, set_output, opts, out);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(
context, inputs, outputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
ReduceOp reduce_op,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_outputs(outputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs, _outputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<phi::DenseTensor>& ins, // NOLINT
std::vector<phi::DenseTensor>& outs) { // NOLINT
const auto& dtype = ins[0].dtype();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, ins);
GENERATE_FUNC(dtype, set_outputs, opts, outs);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts) {
return AllReduce(inputs, outputs, opts, true);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts,
bool sync_op) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(
rank_, context, inputs, outputs, opts.reduce_op, tag);
task->Run();
return task;
}
class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
public:
BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
: ProcessGroupGloo::GlooTask(
rank, std::vector<phi::DenseTensor>{}, CommType::BARRIER),
_context(context) {}
void Run() override { _do_barrier(); }
private:
std::shared_ptr<gloo::Context> _context;
void _do_barrier() {
gloo::BarrierOptions opts(_context);
gloo::barrier(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
const BarrierOptions& opts) {
std::shared_ptr<BarrierGlooTask> task;
auto context = get_context();
task = std::make_shared<BarrierGlooTask>(rank_, context);
task->Run();
return task;
}
class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllgatherGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
_context(context),
_inputs(inputs),
_outputs(outputs),
_tag(tag) {}
void Run() override { _do_allgather(_inputs, _outputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
uint32_t _tag;
void _do_allgather(std::vector<phi::DenseTensor>& in, // NOLINT
std::vector<phi::DenseTensor>& out) { // NOLINT
const auto& dtype = in[0].dtype();
gloo::AllgatherOptions opts(_context);
GENERATE_FUNC(dtype, set_input, opts, in[0]);
GENERATE_FUNC(dtype, set_output, opts, out[0]);
opts.setTag(_tag);
gloo::allgather(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
std::shared_ptr<AllgatherGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<AllgatherGlooTask>(
rank_, context, in_tensors, out_tensors, tag);
task->Run();
return task;
}
class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
ReduceGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
ReduceOp reduce_op,
int dst,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE),
_context(context),
_inputs(inputs),
_outputs(outputs),
_reduce_op(reduce_op),
_dst(dst),
_tag(tag) {}
void Run() override { _do_reduce(_inputs, _outputs, _dst); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
const ReduceOp _reduce_op;
int _dst;
uint32_t _tag;
gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::ReduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_reduce(std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int dst) {
const auto& dtype = inputs[0].dtype();
gloo::ReduceOptions opts(_context);
GENERATE_FUNC(dtype, set_input, opts, inputs[0]);
GENERATE_FUNC(dtype, set_output, opts, outputs[0]);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
opts.setRoot(dst);
gloo::reduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const ReduceOptions& opts) {
std::shared_ptr<ReduceGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<ReduceGlooTask>(
rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
task->Run();
return task;
}
class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
public:
ScatterGlooTask(int rank,
const std::shared_ptr<gloo::Context>& context,
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
int src,
int size,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
_context(context),
_inputs(inputs),
_outputs(outputs),
_src(src),
_size(size),
_tag(tag) {}
void Run() override { _do_scatter(_inputs, _outputs, _src); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<phi::DenseTensor> _inputs;
std::vector<phi::DenseTensor> _outputs;
int _src;
int _size;
uint32_t _tag;
void _do_scatter(std::vector<phi::DenseTensor>& in, // NOLINT
std::vector<phi::DenseTensor>& out, // NOLINT
int src) {
const auto& dtype = in[0].dtype();
gloo::ScatterOptions opts(_context);
if (rank_ == src) {
GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size);
}
GENERATE_FUNC(dtype, set_output, opts, out[0]);
opts.setRoot(src);
opts.setTag(_tag);
gloo::scatter(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
std::shared_ptr<ScatterGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_shared<ScatterGlooTask>(
rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
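// createDefaultDevice resolves the local hostname, probes each returned
// address by binding a throwaway socket to it, and uses the hostname if any
// address binds successfully; otherwise it falls back to 127.0.0.1.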
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(
ret,
0,
platform::errors::Fatal("Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle