Commit f0ef3442 authored by yuguo960516yuguo

2.3.2-dtk-22.10.1

parent ad08b8ce
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
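// ProcessGroupGloo implements the CPU collectives (broadcast, allreduce,
// allgather, reduce, scatter, barrier) on top of a Gloo rendezvous store.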
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank,
const std::vector<phi::DenseTensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(const std::shared_ptr<paddle::distributed::Store>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// NOTE: the timeout argument is currently ignored.
}
protected:
std::shared_ptr<paddle::distributed::Store> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
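// Construction sketch (mirroring the usage in ProcessGroupHeter.cc):
//   auto opts = ProcessGroupGloo::GlooOptions::create();
//   opts->device = ProcessGroupGloo::createDefaultDevice();
//   auto pg = std::make_shared<ProcessGroupGloo>(store, rank, world_size,
//                                                place, gid, opts);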
explicit ProcessGroupGloo(
const std::shared_ptr<paddle::distributed::Store>& store,
int rank,
int world_size,
const platform::Place& place,
int gid,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const AllreduceOptions& opts,
bool sync_op) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<::gloo::rendezvous::Store> _store;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(hccl_blocking_wait);
// DECLARE_bool(use_stream_safe_npu_allocator);
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
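// Order the default compute stream of each place with the cached
// communication device contexts using NPU events.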
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<NPUEventManager>& hcclEvents, // NOLINT
std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
hcclEvents[i].Record(*dev_ctx[i]);
hcclEvents[i].Block(*default_ctx);
}
}
std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupHCCL::HCCLTask>(
places, rank, comm_type, inputs);
}
ProcessGroupHCCL::HCCLTask::HCCLTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
hcclComms_.resize(places.size());
}
ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
void ProcessGroupHCCL::HCCLTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
platform::NPUStreamWaitEvent(default_ctx->stream(),
control_events_[i].GetRawNPUEvent());
}
}
bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
// TODO(sandyhouse): Add timeout support for Wait; the timeout argument is currently unused.
bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
// NOTE(sandyhouse): This blocks the host until the task completes.
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
}
return true;
}
// Same as Wait
void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid), store_(store) {
platform::SetNPUDeviceId(place_.device);
}
void ProcessGroupHCCL::BroadcastUniqueHCCLID(
std::vector<HcclRootInfo>& hccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto hccl_id = std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&hccl_ids[i]),
reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
store_->set(key, hccl_id);
}
} else {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
auto ret = store_->get(key);
std::memcpy(&hccl_ids[i], ret.data(), ret.size());
}
}
}
// create HCCLManager cache for places_key
void ProcessGroupHCCL::CreateHCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
platform::errors::PreconditionNotMet(
"Not able to create/get the HCCL Communicator since "
"the NPU place are not known"));
std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
hccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<HcclRootInfo> hccl_ids;
hccl_ids.resize(1);
auto& hccl_id = hccl_ids.front();
if (rank_ == 0) {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
}
BroadcastUniqueHCCLID(hccl_ids);
VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
for (size_t i = 0; i < places.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
hccl_comms[i] = HCCLCommManager::Create(
GetSize(), GetRank(), &hccl_id, comms.get() + i);
dev_ctx[i].reset(new NPUDeviceContext(places[i]));
}
std::vector<NPUEventManager> events;
events.resize(places.size());
// These caches are reused later for sync/wait/communication.
places_to_events_.emplace(places_key, std::move(events));
places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
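// Collective template: look up (or lazily create) the HCCL communicators for
// the given places, sync with the default streams, run `fn` per device on the
// cached communication stream, and record completion events on the task.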
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
CreateHCCLManagerCache(key, places);
}
}
auto& hccl_comms = places_to_hcclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
const auto& hccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
}
for (size_t i = 0; i < inputs.size(); ++i) {
platform::NPUDeviceGuard guard(places[i].GetDeviceId());
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& opts) {
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
HcclComm comm,
const aclrtStream& stream) {
return platform::dynload::HcclAllReduce(
input.data(),
output.data(),
input.numel(),
platform::ToHCCLDataType(input.dtype()),
ToHCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts) {
// PADDLE_ENFORCE_EQ(
// CheckTensorsInNPUPlace(tensors), true,
// platform::errors::InvalidArgument("All inputs should be in
// CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
HcclComm comm,
const aclrtStream& stream) {
int root = opts.source_rank * in_tensors.size() + opts.source_root;
if (rank_ == root) {
return platform::dynload::HcclBroadcast(
input.data(),
input.numel(),
platform::ToHCCLDataType(input.dtype()),
root,
comm,
stream);
} else {
return platform::dynload::HcclBroadcast(
output.data(),
output.numel(),
platform::ToHCCLDataType(output.dtype()),
root,
comm,
stream);
}
},
CommType::BROADCAST);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
constexpr const char* HCCL_BACKEND_NAME = "HCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using NPUStream = platform::stream::NPUStream;
using NPUDeviceContext = paddle::platform::NPUDeviceContext;
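// ProcessGroupHCCL implements AllReduce/Broadcast on Ascend NPUs via HCCL,
// caching one communicator, device context and event per participating place.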
class ProcessGroupHCCL : public ProcessGroup {
public:
class HCCLTask : public ProcessGroup::Task,
public std::enable_shared_from_this<HCCLTask> {
public:
HCCLTask(const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~HCCLTask();
std::vector<NPUEventManager> control_events_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
};
ProcessGroupHCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return std::string(HCCL_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
protected:
virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
std::vector<Place> places,
int rank,
CommType opType,
const std::vector<phi::DenseTensor>& inputs);
std::shared_ptr<Store> store_;
std::shared_ptr<HCCLCommManager> hccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
places_to_hcclcomm_;
std::unordered_map<std::string, std::vector<NPUEventManager>>
places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<NPUDeviceContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids,
int root, // NOLINT
int server_fd);
void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
void CreateHCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
#include <chrono>
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
int ProcessGroupHeter::send_count = 0;
int ProcessGroupHeter::recv_count = 0;
std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupHeter::HeterTask>(
rank, comm_type, inputs);
}
ProcessGroupHeter::HeterTask::HeterTask(
int rank, CommType CommType, const std::vector<phi::DenseTensor>& inputs)
: Task(rank, inputs, CommType) {}
ProcessGroupHeter::HeterTask::~HeterTask() {}
bool ProcessGroupHeter::HeterTask::IsCompleted() { return true; }
// TODO(sheniang03): Add timeout support for Wait; the timeout argument is currently unused.
bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
return true;
}
ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid,
int local_rank,
int local_size,
int gloo_rank,
int gloo_size,
bool with_switch,
std::string switch_endpoint,
int src_rank,
int dst_rank)
: ProcessGroup(rank, size, place, gid),
store_(store),
local_rank_(local_rank),
local_size_(local_size),
gloo_rank_(gloo_rank),
gloo_size_(gloo_size),
with_switch_(with_switch),
switch_endpoint_(switch_endpoint),
src_rank_(src_rank),
dst_rank_(dst_rank) {
return;
#ifdef PADDLE_WITH_CUSTOM
if (paddle::platform::is_custom_place(place_)) {
inner_pg_ = std::make_shared<ProcessGroupCustom>(
store, local_rank, local_size, place_, IGNORE_ID);
} else {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
inner_pg_ = std::make_shared<ProcessGroupNCCL>(
store, local_rank, local_size, place_, IGNORE_ID);
#elif defined(PADDLE_WITH_ASCEND_CL)
inner_pg_ = std::make_shared<ProcessGroupHCCL>(
store, local_rank, local_size, place_, IGNORE_ID);
#else
PADDLE_THROW(platform::errors::Unavailable(
"ProcessGroupHeter only supports NCCL, RCCL and HCCL now."));
#endif
#ifdef PADDLE_WITH_CUSTOM
}
#endif
if (local_rank_ == 0 && !with_switch_) {
auto opts = ProcessGroupGloo::GlooOptions::create();
opts->device = ProcessGroupGloo::createDefaultDevice();
inter_pg_ = std::make_shared<ProcessGroupGloo>(
store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts);
}
}
template <typename T>
static void _do_add(T* dst, T* src, size_t size) {
for (size_t i = 0; i < size; i++) {
*dst += *src;
dst++;
src++;
}
}
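// Hierarchical allreduce: reduce inside the inner (device) cluster first,
// exchange the local_rank-0 result across clusters through Gloo or the switch
// service, then broadcast the result back inside the inner cluster.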
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif
// Step1: do allreduce in inner cluster
auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts);
task->Wait();
// Step2: copy tensors to CPU
if (local_rank_ == 0) {
std::vector<phi::DenseTensor> cpu_tensors;
cpu_tensors.reserve(in_tensors.size());
phi::DenseTensor cpu_tensor;
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = in_tensors[i];
cpu_tensor.Resize(gpu_tensor.dims());
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensors.push_back(cpu_tensor);
}
// Step3: do inter cluster allreduce
if (with_switch_) {
if (local_rank_ == 0) {
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto dense_cpu_tensor = cpu_tensors[0];
std::vector<int64_t> send_size;
send_size.push_back(dense_cpu_tensor.numel());
int ret = client_->Send(
gid_,
{dense_cpu_tensor.name()},
send_size,
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
phi::DenseTensor cpu_tensor2;
cpu_tensor2.AllocateFrom(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
dense_cpu_tensor.dtype(),
dense_cpu_tensor.numel());
ret = client_->Recv(
gid_,
{dense_cpu_tensor.name()},
cpu_tensor2.data(),
cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Recv from the switch module error."));
switch (dense_cpu_tensor.dtype()) {
case DataType::FLOAT32:
_do_add<float>(reinterpret_cast<float*>(dense_cpu_tensor.data()),
reinterpret_cast<float*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
case DataType::FLOAT64:
_do_add<double>(reinterpret_cast<double*>(dense_cpu_tensor.data()),
reinterpret_cast<double*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
case DataType::INT32:
_do_add<int>(reinterpret_cast<int*>(dense_cpu_tensor.data()),
reinterpret_cast<int*>(cpu_tensor2.data()),
dense_cpu_tensor.numel());
break;
default:
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Unsupported data type (%s) to do add.",
framework::DataType2String(dense_cpu_tensor.dtype())));
}
}
} else {
auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts);
gloo_task->Wait();
}
// Step4: copy cpu tensors to gpu
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = out_tensors[i];
auto cpu_tensor = cpu_tensors[i];
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
}
}
// Step5: broadcast among inner cluster
auto b_opts = BroadcastOptions();
b_opts.source_rank = 0;
auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
broadcast_task->Wait();
return CreateTask(rank_, CommType::ALLREDUCE, in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif
// Step1: do broadcast in inner cluster
auto b_opts = BroadcastOptions();
b_opts.source_rank = 0;
inner_pg_->Broadcast(in_tensors, out_tensors, b_opts);
if (local_rank_ == 0) {
std::vector<phi::DenseTensor> cpu_tensors;
cpu_tensors.reserve(in_tensors.size());
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = in_tensors[i];
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(gpu_tensor.dims());
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensors.push_back(cpu_tensor);
}
if (with_switch_) {
if (local_rank_ == 0) {
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto dense_cpu_tensor = cpu_tensors[0];
if (gloo_rank_ == 0) {
std::vector<int64_t> send_size;
send_size.push_back(dense_cpu_tensor.numel());
int ret = client_->Send(
gid_,
{dense_cpu_tensor.name()},
send_size,
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Send to the switch module error."));
} else {
int ret = client_->Recv(
gid_,
{dense_cpu_tensor.name()},
dense_cpu_tensor.data(),
dense_cpu_tensor.numel() *
framework::DataTypeSize(dense_cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"Receive from the switch module error."));
}
}
} else {
auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts);
gloo_task->Wait();
}
for (size_t i = 0; i < in_tensors.size(); i++) {
auto gpu_tensor = out_tensors[i];
auto cpu_tensor = cpu_tensors[i];
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
}
}
auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
broadcast_task->Wait();
return CreateTask(rank_, CommType::BROADCAST, in_tensors);
}
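// Point-to-point send through the switch service: the tensor is staged on the
// CPU and published under a name derived from gid_, the src/dst ranks and a
// monotonically increasing send counter.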
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) {
PADDLE_ENFORCE_EQ(
in_tensors.size(),
1,
platform::errors::PreconditionNotMet(
"For each send operation, there can only be one tensor to send."));
// Copy Tensor to cpu
auto start = std::chrono::high_resolution_clock::now();
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = in_tensors[0];
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
PADDLE_ENFORCE_EQ(with_switch_,
true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from gpu to cpu for send " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;
// Send to switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
int64_t tensor_size =
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
std::vector<int64_t> send_size;
send_size.push_back(tensor_size);
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(send_count++);
VLOG(2) << "tensor_name:" << tensor_name;
int ret = client_->Send(
gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size);
PADDLE_ENFORCE_EQ(
ret,
0,
platform::errors::PreconditionNotMet("Send to the switch module error."));
return CreateTask(rank_, CommType::SEND, in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) {
PADDLE_ENFORCE_EQ(
out_tensors.size(),
1,
platform::errors::PreconditionNotMet(
"For each rece operation, there can only be one tensor to receive."));
// Prepare a CPU tensor to receive into
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = out_tensors[0];
cpu_tensor.Resize(gpu_tensor.dims());
cpu_tensor.set_layout(gpu_tensor.layout());
cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());
PADDLE_ENFORCE_EQ(with_switch_,
true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
// recv from switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(recv_count++);
VLOG(2) << "tensor_name: " << tensor_name;
auto start = std::chrono::high_resolution_clock::now();
int ret = client_->Recv(
gid_,
{tensor_name},
cpu_tensor.data(),
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret,
0,
platform::errors::PreconditionNotMet(
"receive to the switch module error."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
double goodput = cpu_tensor.numel() *
framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl;
start = std::chrono::high_resolution_clock::now();
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
end = std::chrono::high_resolution_clock::now();
diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from cpu to gpu for recv " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;
return CreateTask(rank_, CommType::RECV, out_tensors);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#endif
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL))
#include "paddle/fluid/distributed/ps/service/heter_client.h"
#endif
#include "paddle/fluid/distributed/collective/Common.h"
constexpr const char* HETER_BACKEND_NAME = "HETER_BACKEND";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
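// ProcessGroupHeter composes an intra-cluster backend (NCCL/RCCL/HCCL/custom)
// with an inter-cluster channel (Gloo or a heterogeneous switch service) to
// run collectives across clusters.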
class ProcessGroupHeter : public ProcessGroup {
public:
class HeterTask : public ProcessGroup::Task,
public std::enable_shared_from_this<HeterTask> {
public:
HeterTask(int rank,
CommType CommType,
const std::vector<phi::DenseTensor>&);
bool IsCompleted();
void SynchronizeStreams() {}
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize() {}
virtual ~HeterTask();
};
ProcessGroupHeter(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid,
int local_rank,
int local_size,
int gloo_rank,
int gloo_size,
bool with_switch,
std::string switch_endpoints,
int src_rank,
int dst_rank);
const std::string GetBackendName() const override {
return std::string(HETER_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>&,
std::vector<phi::DenseTensor>&,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>&,
std::vector<phi::DenseTensor>&,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) override;
protected:
virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
int rank, CommType opType, const std::vector<phi::DenseTensor>& inputs);
private:
std::shared_ptr<Store> store_;
std::shared_ptr<ProcessGroup> inner_pg_;
std::shared_ptr<ProcessGroupGloo> inter_pg_;
int local_rank_;
int local_size_;
int gloo_rank_;
int gloo_size_;
bool with_switch_;
std::string switch_endpoint_;
int src_rank_;
int dst_rank_;
static int send_count;
static int recv_count;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"
#include <chrono>
#include "paddle/fluid/distributed/collective/Common.h"
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
std::map<phi::DataType, MPI_Datatype> mpiDatatype = {
{phi::DataType::INT8, MPI_CHAR},
{phi::DataType::UINT8, MPI_UNSIGNED_CHAR},
{phi::DataType::FLOAT32, MPI_FLOAT},
{phi::DataType::FLOAT64, MPI_DOUBLE},
{phi::DataType::INT32, MPI_INT},
{phi::DataType::INT64, MPI_LONG}};
void ProcessGroupMPI::MPITask::FinishMPITaskError(std::exception_ptr eptr) {
Finish(eptr);
}
void ProcessGroupMPI::MPITask::FinishMPITask() { Finish(); }
ProcessGroupMPI::MPIAsyncTask::MPIAsyncTask(
MPI_Request request, const std::vector<phi::DenseTensor>& inputs)
: ProcessGroup::Task(-1, inputs, CommType::UNKNOWN), request_(request) {
memset(&status_, 0, sizeof(status_));
}
ProcessGroupMPI::MPIAsyncTask::~MPIAsyncTask() {
if (request_ != MPI_REQUEST_NULL) {
std::cerr << " Task has not completed, try to destruct async mpi task, "
<< "exit the program." << std::endl;
std::terminate();
}
}
bool ProcessGroupMPI::MPIAsyncTask::IsCompleted() {
if (request_ == MPI_REQUEST_NULL) {
return true;
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
int flag = 0;
MPI_CHECK(MPI_Test(&request_, &flag, &status_));
if (request_ != MPI_REQUEST_NULL) {
return false;
}
if (status_.MPI_ERROR != MPI_SUCCESS) {
AppearException();
}
return true;
}
bool ProcessGroupMPI::MPIAsyncTask::Wait(std::chrono::milliseconds timeout) {
if (request_ == MPI_REQUEST_NULL) {
return true;
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Wait(&request_, &status_));
if (status_.MPI_ERROR != MPI_SUCCESS) {
AppearException();
std::rethrow_exception(exception_);
return false;
}
return true;
}
void ProcessGroupMPI::MPIAsyncTask::AppearException() {
std::array<char, MPI_MAX_ERROR_STRING> buf;
int len = buf.size();
MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len));
exception_ =
std::make_exception_ptr(std::runtime_error(std::string(buf.data(), len)));
}
void ProcessGroupMPI::MPIAsyncTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) {
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
int ProcessGroupMPI::mpi_thread_support = 0;
std::mutex ProcessGroupMPI::pg_global_mutex;
std::once_flag ProcessGroupMPI::onceFlag;
void ProcessGroupMPI::ExitMPI() {
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Finalize());
}
void ProcessGroupMPI::InitOneTimeMPI() {
std::call_once(onceFlag, []() {
MPI_CHECK(MPI_Init_thread(
nullptr, nullptr, MPI_THREAD_SERIALIZED, &mpi_thread_support));
PADDLE_ENFORCE_EQ(
mpi_thread_support < MPI_THREAD_SERIALIZED,
false,
platform::errors::InvalidArgument("MPI supports the number of threads "
"less than MPI_THREAD_SERIALIZED. "));
std::atexit(ProcessGroupMPI::ExitMPI);
});
}
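// Build a communicator covering `ranks` (or reuse MPI_COMM_WORLD when `ranks`
// is empty) and wrap it in a ProcessGroupMPI; ranks outside the group receive
// a null pointer.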
std::shared_ptr<ProcessGroupMPI> ProcessGroupMPI::CreateProcessGroupMPI(
const std::vector<int>& ranks, int gid) {
InitOneTimeMPI();
MPI_Comm groupComm = MPI_COMM_WORLD;
int rank = -1;
int size = -1;
{
std::lock_guard<std::mutex> lock(pg_global_mutex);
if (!ranks.empty()) {
MPI_Group worldGroup;
MPI_Group ranksGroup;
MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_CHECK(
MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup));
constexpr int maxRetries = 3;
bool create_success = false;
MPI_Barrier(MPI_COMM_WORLD);
for (auto i = 0; i < maxRetries; i++) {
if (MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)) {
create_success = true;
break;
}
}
MPI_CHECK(create_success);
MPI_CHECK(MPI_Group_free(&worldGroup));
MPI_CHECK(MPI_Group_free(&ranksGroup));
}
if (groupComm != MPI_COMM_NULL) {
MPI_CHECK(MPI_Comm_rank(groupComm, &rank));
MPI_CHECK(MPI_Comm_size(groupComm, &size));
PADDLE_ENFORCE_EQ(
rank < 0 || size < 0,
false,
platform::errors::InvalidArgument("get world_size or rank failed!"));
}
}
if (groupComm == MPI_COMM_NULL) {
return std::shared_ptr<ProcessGroupMPI>();
}
VLOG(3) << "MPI Group Create Success! rank = " << rank << " size = " << size
<< " group_id = " << gid;
return std::make_shared<ProcessGroupMPI>(rank, size, groupComm, gid);
}
ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pg_comm, int gid)
: ProcessGroup(rank, size, gid), stop_(false), pg_comm(pg_comm) {
PADDLE_ENFORCE_EQ(
pg_comm == MPI_COMM_NULL,
false,
platform::errors::InvalidArgument("Error! mpi comm is MPI_COMM_NULL!"));
worker_thread = std::thread(&ProcessGroupMPI::workLoop, this);
}
ProcessGroupMPI::~ProcessGroupMPI() {
std::unique_lock<std::mutex> lock(pg_mutex);
queue_consume.wait(lock, [&] { return queue_.empty(); });
stop_ = true;
lock.unlock();
queue_produce.notify_all();
worker_thread.join();
}
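// Worker loop: a single background thread drains queue_, runs each queued
// TaskEntry's functor (which issues the MPI call under pg_global_mutex), and
// marks the corresponding MPITask as finished or failed.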
void ProcessGroupMPI::workLoop() {
std::unique_lock<std::mutex> lock(pg_mutex);
while (!stop_) {
if (queue_.empty()) {
queue_produce.wait(lock);
continue;
}
auto taskTuple = std::move(queue_.front());
queue_.pop_front();
auto& taskEntry = std::get<0>(taskTuple);
auto& task = std::get<1>(taskTuple);
lock.unlock();
queue_consume.notify_one();
try {
taskEntry->run_(taskEntry);
task->FinishMPITask();
} catch (...) {
task->FinishMPITaskError(std::current_exception());
}
lock.lock();
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Enqueue(
std::unique_ptr<TaskEntry> entry,
const std::vector<phi::DenseTensor>& inputs) {
auto task = std::make_shared<MPITask>(entry->dst_, inputs);
std::unique_lock<std::mutex> lock(pg_mutex);
queue_.push_back(std::make_tuple(std::move(entry), task));
lock.unlock();
queue_produce.notify_one();
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
mpi::CheckValidInputs(in_tensors);
const auto places = GetPlaceList(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
const auto root = opts.source_rank + opts.source_root;
MPI_CHECK(MPI_Bcast(data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
root,
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
mpi::CheckValidInputs(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE,
data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
mpi::ToMPIType(opts.reduce_op),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Barrier(
const BarrierOptions& opts) {
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Barrier(pg_comm));
};
auto entry =
std::make_unique<TaskEntry>(nullptr, nullptr, std::move(runFunc));
return Enqueue(std::move(entry), std::vector<phi::DenseTensor>{});
}
// NOTE: point-to-point Send/Recv use gid_ as the MPI message tag.
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) {
mpi::CheckValidInputs(tensors);
auto& tensor = tensors[0];
MPI_Request request = MPI_REQUEST_NULL;
{
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Isend(tensor.data(),
tensor.numel(),
mpiDatatype.at(tensor.dtype()),
dst_rank,
this->gid_,
pg_comm,
&request));
}
return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) {
mpi::CheckValidInputs(tensors);
auto& tensor = tensors[0];
MPI_Request request = MPI_REQUEST_NULL;
{
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Irecv(tensor.data(),
tensor.numel(),
mpiDatatype.at(tensor.dtype()),
src_rank,
this->gid_,
pg_comm,
&request));
}
return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
mpi::CheckValidInputs(in_tensors);
PADDLE_ENFORCE_EQ(out_tensors.size() == 1,
true,
platform::errors::InvalidArgument(
"MPI only support a single tensor op."));
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
std::vector<phi::DenseTensor> dst = entry->dst_;
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Allgather(data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
dst[0].data(),
data.numel(),
mpiDatatype.at(data.dtype()),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
mpi::CheckValidInputs(in_tensors);
mpi::CheckValidInputs(out_tensors);
PADDLE_ENFORCE_EQ(in_tensors[0].numel() == out_tensors[0].numel() &&
in_tensors[0].dtype() == out_tensors[0].dtype(),
true,
platform::errors::InvalidArgument(
"MPI AlltoAll: input and output are not equal in "
"size or data type."));
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[this](std::unique_ptr<TaskEntry>& entry) {
auto srcdata = (entry->src_)[0];
auto dstdata = (entry->dst_)[0];
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Alltoall(srcdata.data(),
srcdata.numel() / size_,
mpiDatatype.at(srcdata.dtype()),
dstdata.data(),
dstdata.numel() / size_,
mpiDatatype.at(dstdata.dtype()),
pg_comm));
};
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) {
mpi::CheckValidInputs(tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->src_)[0];
auto dataPtr = (entry->src_)[0].data();
void* sendbuf = (rank_ == opts.root_rank) ? MPI_IN_PLACE : dataPtr;
void* recvbuf = (rank_ == opts.root_rank) ? dataPtr : nullptr;
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Reduce(sendbuf,
recvbuf,
data.numel(),
mpiDatatype.at(data.dtype()),
mpi::ToMPIType(opts.reduce_op),
opts.root_rank,
pg_comm));
};
auto entry =
std::make_unique<TaskEntry>(&tensors, &tensors, std::move(runFunc));
return Enqueue(std::move(entry), tensors);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
mpi::CheckValidInputs(in_tensors);
std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
[opts, this](std::unique_ptr<TaskEntry>& entry) {
auto data = (entry->dst_)[0];
void* sendbuf = nullptr;
if (rank_ == opts.root_rank) {
std::vector<phi::DenseTensor>& inputData = entry->src_;
sendbuf = inputData[0].data();
}
std::unique_lock<std::mutex> lock(pg_global_mutex);
MPI_CHECK(MPI_Scatter(sendbuf,
data.numel(),
mpiDatatype.at(data.dtype()),
data.data(),
data.numel(),
mpiDatatype.at(data.dtype()),
opts.root_rank,
pg_comm));
};
if (rank_ == opts.root_rank) {
auto entry = std::make_unique<TaskEntry>(
&in_tensors, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
} else {
auto entry =
std::make_unique<TaskEntry>(nullptr, &out_tensors, std::move(runFunc));
return Enqueue(std::move(entry), in_tensors);
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include <condition_variable>
#include <deque>
#include <exception>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_MPI)
#include "paddle/fluid/distributed/collective/MPITools.h"
#endif
constexpr const char* MPI_BACKEND_NAME = "MPI";
namespace paddle {
namespace distributed {
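// A queued collective: copies of the source/destination tensor lists plus the
// functor that performs the actual MPI call on the worker thread.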
struct TaskEntry {
explicit TaskEntry(std::vector<phi::DenseTensor>* src_ptr,
std::vector<phi::DenseTensor>* dst_ptr,
std::function<void(std::unique_ptr<TaskEntry>&)> run)
: dst_(dst_ptr ? *dst_ptr : std::vector<phi::DenseTensor>()),
run_(std::move(run)) {
if (src_ptr) {
src_ = *src_ptr;
}
}
TaskEntry(const TaskEntry&) = delete;
TaskEntry& operator=(const TaskEntry&) = delete;
std::vector<phi::DenseTensor> src_;
std::vector<phi::DenseTensor> dst_;
int* srcRank_ = nullptr;
std::function<void(std::unique_ptr<TaskEntry>&)> run_;
};
class ProcessGroupMPI : public ProcessGroup {
public:
class MPITask : public ProcessGroup::Task {
public:
explicit MPITask(std::vector<phi::DenseTensor> outputTensors,
const std::vector<phi::DenseTensor>& inputTensors)
: ProcessGroup::Task(-1, inputTensors, CommType::UNKNOWN),
outputs_(std::move(outputTensors)) {}
void Synchronize() { Wait(); }
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) {
std::unique_lock<std::mutex> lock(mutex_);
if (timeout == kWaitTimeout) {
// This waits without a timeout.
cv_.wait(lock, [&] { return is_completed_; });
} else {
// Waits for the user-provided timeout.
cv_.wait_for(lock, timeout, [&] { return is_completed_; });
PADDLE_ENFORCE_EQ(
is_completed_,
true,
platform::errors::InvalidArgument("MPI operation timeout! "));
}
if (exception_) {
std::rethrow_exception(exception_);
}
return true;
}
protected:
friend class ProcessGroupMPI;
private:
// MPI task completion state
void Finish(std::exception_ptr exception = nullptr) {
is_completed_ = true;
exception_ = exception;
cv_.notify_all();
}
void FinishMPITask();
void FinishMPITaskError(std::exception_ptr eptr);
std::vector<phi::DenseTensor> outputs_;
std::condition_variable cv_;
std::exception_ptr exception_;
};
public:
class MPIAsyncTask : public ProcessGroup::Task {
public:
MPIAsyncTask(MPI_Request request,
const std::vector<phi::DenseTensor>& inputs);
bool IsCompleted();
void Synchronize() {}
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~MPIAsyncTask();
protected:
void AppearException();
private:
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
MPI_Request request_;
MPI_Status status_;
std::exception_ptr exception_;
};
ProcessGroupMPI(int rank, int size, MPI_Comm pgComm, int gid);
virtual ~ProcessGroupMPI();
const std::string GetBackendName() const override {
return std::string(MPI_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
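// Factory building the group from an MPI communicator over `ranks`.
// Minimal usage sketch:
//   auto pg = ProcessGroupMPI::CreateProcessGroupMPI({0, 1}, /*gid=*/0);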
static std::shared_ptr<ProcessGroupMPI> CreateProcessGroupMPI(
const std::vector<int>& ranks, int gid);
protected:
void workLoop();
std::shared_ptr<ProcessGroup::Task> Enqueue(
std::unique_ptr<TaskEntry> entry,
const std::vector<phi::DenseTensor>& inputs);
private:
bool stop_{false};
std::mutex pg_mutex;
std::thread worker_thread;
std::deque<std::tuple<std::unique_ptr<TaskEntry>, std::shared_ptr<MPITask>>>
queue_;
std::condition_variable queue_produce;
std::condition_variable queue_consume;
static void InitOneTimeMPI();
static void ExitMPI();
static std::once_flag onceFlag;
static std::mutex pg_global_mutex;
static int mpi_thread_support;
MPI_Comm pg_comm;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<EventManager>& ncclEvents, // NOLINT
std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
ncclEvents[i].Record(*default_ctx);
ncclEvents[i].Block(*dev_ctx[i]);
}
}
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
std::vector<Place> places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs) {
return std::make_shared<ProcessGroupNCCL::NCCLTask>(
places, rank, comm_type, inputs);
}
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
const std::vector<Place>& places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs,
bool is_sync,
bool use_calc_stream) {
return std::make_shared<ProcessGroupNCCL::NCCLTask>(
places, rank, comm_type, inputs, is_sync, use_calc_stream);
}
ProcessGroupNCCL::NCCLTask::NCCLTask(
const std::vector<Place>& places,
int rank,
CommType CommType,
const std::vector<phi::DenseTensor>& inputs)
: TaskStream(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
ncclComms_.resize(places.size());
}
ProcessGroupNCCL::NCCLTask::NCCLTask(
const std::vector<Place>& places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs,
bool sync_op,
bool use_calc_stream)
: TaskStream(rank, inputs, comm_type, sync_op, use_calc_stream),
places_(places) {
control_events_.resize(places.size());
ncclComms_.resize(places.size());
}
ProcessGroupNCCL::NCCLTask::~NCCLTask() {}
void ProcessGroupNCCL::NCCLTask::SetOutputs(
std::vector<phi::DenseTensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}
void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
}
}
bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>* split_sizes,
std::vector<int64_t> tensor_shape) {
int64_t len_size = (*split_sizes).size();
if (len_size == 0) {
PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0,
true,
platform::errors::InvalidArgument(
"Tensor's dim[0] must be divisible by group size "
"when split_sizes not given."));
(*split_sizes)
.insert((*split_sizes).end(),
size_,
static_cast<int64_t>(tensor_shape[0] / size_));
} else {
PADDLE_ENFORCE_EQ(
len_size == size_,
true,
platform::errors::InvalidArgument(
"The length of split_sizes must be equal to group size."));
auto sum_size = std::accumulate(
(*split_sizes).begin(), (*split_sizes).end(), static_cast<int64_t>(0));
PADDLE_ENFORCE_EQ(
sum_size == tensor_shape[0],
true,
platform::errors::InvalidArgument(
"The sum of split_sizes must be equal to tensor's dim[0]."));
}
}
// TODO(sheniang03): Add timeout support for Wait; the timeout argument is currently unused.
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
// Warn when the communication runs on the calc stream but Wait() is still invoked explicitly.
if (UseCalcStream()) {
VLOG(3) << "Warning: The communication is on calc stream, wait here is "
"useless.";
return true;
}
SynchronizeStreams();
if (FLAGS_nccl_blocking_wait) {
// NOTE(shenliang03): This blocks the host until the task completes.
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
}
}
if (!barrierTensors_.empty()) {
// If this task is used as a barrier, block the CPU until the devices finish.
for (auto& place : places_) {
platform::CUDADeviceGuard gpuGuard(place);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
}
}
return true;
}
// Same as Wait
void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroupStream(rank, size, place, gid), store_(store) {
platform::SetDeviceId(place_.device);
}
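// Rank 0 publishes every ncclUniqueId to the store under a group-id-scoped
// key; all other ranks read the ids back from the same keys.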
void ProcessGroupNCCL::BroadcastUniqueNCCLID(
std::vector<ncclUniqueId>& nccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < nccl_ids.size(); i++) {
auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
auto nccl_id = std::vector<uint8_t>(
reinterpret_cast<uint8_t*>(&nccl_ids[i]),
reinterpret_cast<uint8_t*>(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES);
store_->set(key, nccl_id);
}
} else {
for (size_t i = 0; i < nccl_ids.size(); i++) {
auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
auto ret = store_->get(key);
std::memcpy(&nccl_ids[i], ret.data(), ret.size());
}
}
}
// Create and cache the NCCL communicators, device contexts, and events
// for places_key.
void ProcessGroupNCCL::CreateNCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(),
false,
                    platform::errors::PreconditionNotMet(
                        "Not able to create/get the NCCL Communicator since "
                        "the GPU place is not known."));
std::vector<std::shared_ptr<NCCLCommManager>> nccl_comms;
nccl_comms.resize(places.size());
  // A vector is used only so the unique id can be broadcast via the store.
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(1);
auto& nccl_id = nccl_ids.front();
for (auto& place : places) {
used_place_ids_.insert(place.GetDeviceId());
}
if (rank_ == 0) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
}
BroadcastUniqueNCCLID(nccl_ids);
VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_
<< ", place: " << places_key
<< ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
dev_ctx.resize(places.size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (size_t i = 0; i < places.size(); ++i) {
platform::CUDADeviceGuard guard(places[i]);
nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
dev_ctx[i].reset(new phi::GPUContext(places[i]));
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
std::vector<EventManager> events;
events.resize(places.size());
  // Cache these objects so later sync/wait/communicate calls can reuse them.
places_to_events_.emplace(places_key, std::move(events));
places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
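// Generic collective launcher (sync_op / use_calc_stream variant): look up or
// lazily create the cached communicators for this place list, run `fn` once
// per device inside an NCCL group on either the calculation stream or the
// dedicated communication stream, and record completion events only when the
// communication stream is used.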
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType comm_type,
bool sync_op,
bool use_calc_stream) {
const auto& places = GetPlaceList(inputs);
const auto& key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
if (!use_calc_stream) {
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
}
auto task =
CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream);
platform::CUDADeviceGuard cuda_guard;
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
gpuStream_t nccl_stream;
if (use_calc_stream) {
nccl_stream =
static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]))
->stream();
} else {
nccl_stream = places_to_ctx_[key][i]->stream();
}
fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
}
}
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
gpuStream_t nccl_stream;
if (use_calc_stream) {
nccl_stream =
static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]))
->stream();
} else {
nccl_stream = places_to_ctx_[key][i]->stream();
}
memory::RecordStream(inputs[i].Holder(), nccl_stream);
}
}
  // Record stream events only when the communication stream is used.
if (!use_calc_stream) {
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
}
return task;
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
  // Construct an uninitialized device guard; the device is set per iteration.
platform::CUDADeviceGuard cuda_guard;
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
}
}
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
memory::RecordStream(inputs[i].Holder(),
places_to_ctx_[key][i]->stream());
}
}
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
template <typename Fn>
void ProcessGroupNCCL::Collective(const phi::DenseTensor* in,
phi::DenseTensor* out,
Fn fn,
CommType op_type) {
std::vector<Place> places;
places.push_back(in->place());
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  // Construct an uninitialized device guard; the device is set below.
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
cuda_guard.SetDevice(places[0]);
memory::RecordStream(in->Holder(), places_to_ctx_[key][0]->stream());
}
{
platform::NCCLGroupGuard nccl_guard;
cuda_guard.SetDevice(places[0]);
const auto& nccl_stream = places_to_ctx_[key][0]->stream();
fn(in, out, nccl_comms[0]->GetNcclComm(), nccl_stream);
}
cuda_guard.SetDevice(places[0]);
}
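// Generic point-to-point launcher: same communicator caching and stream
// selection as Collective, except that `fn` also receives the peer rank.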
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
std::vector<phi::DenseTensor>& tensors,
Fn fn,
int dst_rank,
CommType op_type,
bool sync_op,
bool use_calc_stream) {
const auto& places = GetPlaceList(tensors);
const auto& key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
if (!use_calc_stream) {
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
}
auto task =
CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream);
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
gpuStream_t nccl_stream;
if (use_calc_stream) {
nccl_stream =
static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]))
->stream();
} else {
nccl_stream = places_to_ctx_[key][i]->stream();
}
memory::RecordStream(tensors[i].Holder(), nccl_stream);
}
}
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
gpuStream_t nccl_stream;
if (use_calc_stream) {
nccl_stream =
static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(places[i]))
->stream();
} else {
nccl_stream = places_to_ctx_[key][i]->stream();
}
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
}
}
if (!use_calc_stream) {
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
}
return task;
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
std::vector<phi::DenseTensor>& tensors,
Fn fn,
int dst_rank,
CommType op_type) {
const auto places = GetPlaceList(tensors);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, tensors);
  // Construct an uninitialized device guard; the device is set per iteration.
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
memory::RecordStream(tensors[i].Holder(),
places_to_ctx_[key][i]->stream());
}
}
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
}
}
for (size_t i = 0; i < tensors.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& opts,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op),
comm,
stream);
},
CommType::ALLREDUCE,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
return platform::dynload::ncclBroadcast(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
root,
comm,
stream);
},
CommType::BROADCAST);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
return platform::dynload::ncclBroadcast(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.type()),
root,
comm,
stream);
},
CommType::BROADCAST,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
const BarrierOptions& opts) {
  // Only the single-card-per-process case is supported.
std::vector<phi::GPUPlace> places = {place_};
std::vector<phi::DenseTensor> barrierTensors;
barrierTensors.reserve(places.size());
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
auto allocator = std::unique_ptr<phi::Allocator>(
new paddle::experimental::DefaultAllocator(place));
barrierTensors.emplace_back(allocator.get(), meta);
}
auto task = ProcessGroupNCCL::AllReduce(
barrierTensors, barrierTensors, AllreduceOptions());
auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
nccl_task->barrierTensors_ = std::move(barrierTensors);
return task;
}
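// Sanity checks for point-to-point calls: the tensor list must be non-empty,
// no larger than the number of devices, and every tensor must live on a
// distinct GPU.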
void CheckTensorsInDifferentDevices(
const std::vector<phi::DenseTensor>& tensors, const size_t num_devices) {
PADDLE_ENFORCE_EQ(
tensors.size() == 0,
false,
platform::errors::InvalidArgument("Tensor list must be nonempty."));
PADDLE_ENFORCE_LE(
tensors.size(),
num_devices,
platform::errors::InvalidArgument(
"Tensor list mustn't be larger than the number of available GPUs."));
std::set<Place> used_devices;
for (const auto& t : tensors) {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()),
true,
                      platform::errors::InvalidArgument(
                          "Tensors must be CUDA dense tensors."));
const auto inserted = used_devices.insert(t.place()).second;
PADDLE_ENFORCE_EQ(inserted,
true,
platform::errors::InvalidArgument(
"Tensors must be on distinct GPU devices."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
std::vector<phi::DenseTensor>& tensors,
int dst_rank,
bool sync_op,
bool use_calc_stream) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND,
sync_op,
use_calc_stream);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
std::vector<phi::DenseTensor>& tensors,
int src_rank,
bool sync_op,
bool use_calc_stream) {
CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
auto task = PointToPoint(
tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV,
sync_op,
use_calc_stream);
return task;
}
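// Partial send: view the tensor as a flat buffer and send only the
// [offset, offset + length) slice.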
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) {
// CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
std::vector<phi::DenseTensor> shared_tensors{
flatten_tensor.Slice(offset, offset + length)};
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
phi::DenseTensor& tensors,
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
std::vector<phi::DenseTensor> shared_tensors{
flatten_tensor.Slice(offset, offset + length)};
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& input,
ncclComm_t comm,
const gpuStream_t& stream,
int dst_rank) {
return platform::dynload::ncclSend(
input.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
dst_rank,
comm,
stream);
},
dst_rank,
CommType::SEND,
sync_op,
use_calc_stream);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) {
// phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
std::vector<phi::DenseTensor> shared_tensors{
flatten_tensor.Slice(offset, offset + length)};
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
phi::DenseTensor& tensors,
int src_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
phi::DenseTensor flatten_tensor;
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
std::vector<phi::DenseTensor> shared_tensors{
flatten_tensor.Slice(offset, offset + length)};
auto task = PointToPoint(
shared_tensors,
[&](phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream,
int src_rank) {
return platform::dynload::ncclRecv(
output.data(),
output.numel(),
platform::ToNCCLDataType(output.dtype()),
src_rank,
comm,
stream);
},
src_rank,
CommType::RECV,
sync_op,
use_calc_stream);
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER,
sync_op,
use_calc_stream);
}
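// Advance a raw buffer pointer by `offset` elements of the given data type;
// FLOAT16/BFLOAT16 are stepped as 16-bit integers, which only relies on the
// element size.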
void* GetPointerByOffset(void* raw_pointer,
size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT8) {
return reinterpret_cast<void*>(reinterpret_cast<int8_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::UINT8) {
return reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::BOOL) {
return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::BFLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<uint16_t*>(raw_pointer) +
offset);
} else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "This data type is not supported by NCCL."));
}
return nullptr;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
GetPointerByOffset(input.data(), offset, input.dtype()),
output.data(),
length,
platform::ToNCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
return platform::dynload::ncclAllGather(
GetPointerByOffset(input.data(), offset, input.dtype()),
output.data(),
length,
platform::ToNCCLDataType(input.dtype()),
comm,
stream);
},
CommType::ALLGATHER,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll_Single(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
true,
platform::errors::InvalidArgument(
"The dtypes of input and output must be equal."));
std::vector<int64_t> in_dims = phi::vectorize(input.dims());
std::vector<int64_t> out_dims = phi::vectorize(output.dims());
CheckSplitSizes(&in_sizes, in_dims);
CheckSplitSizes(&out_sizes, out_dims);
size_t in_offset = 0, out_offset = 0;
size_t in_length = 0, out_length = 0;
size_t in_row_size = input.numel() / in_dims[0];
size_t out_row_size = output.numel() / out_dims[0];
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
in_length = in_sizes[i] * in_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), in_offset, input.dtype()),
in_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
in_offset += in_length;
out_length = out_sizes[i] * out_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), out_offset, input.dtype()),
out_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
out_offset += out_length;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL_SINGLE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
true,
platform::errors::InvalidArgument(
"The dtypes of input and output must be equal."));
std::vector<int64_t> in_dims = phi::vectorize(input.dims());
std::vector<int64_t> out_dims = phi::vectorize(output.dims());
CheckSplitSizes(&in_sizes, in_dims);
CheckSplitSizes(&out_sizes, out_dims);
size_t in_offset = 0, out_offset = 0;
size_t in_length = 0, out_length = 0;
size_t in_row_size = input.numel() / in_dims[0];
size_t out_row_size = output.numel() / out_dims[0];
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
in_length = in_sizes[i] * in_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), in_offset, input.dtype()),
in_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
in_offset += in_length;
out_length = out_sizes[i] * out_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), out_offset, input.dtype()),
out_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
out_offset += out_length;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL_SINGLE,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
opts.root_rank,
comm,
stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](const phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input.data(),
output.data(),
input.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
opts.root_rank,
comm,
stream));
},
CommType::REDUCE,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceScatterOptions& opts,
bool sync_op,
bool use_calc_stream) {
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
platform::CUDADeviceGuard cuda_guard;
cuda_guard.SetDevice(output.place());
memory::RecordStream(output.Holder(), stream);
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
input.data(),
output.data(),
output.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
comm,
stream));
},
CommType::REDUCE_SCATTER,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
}
},
CommType::SCATTER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts,
bool sync_op,
bool use_calc_stream) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
output.numel(),
input.numel() / size_,
              platform::errors::InvalidArgument(
                  "The output tensor's numel must equal the input tensor's "
                  "numel divided by the group size."));
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), offset, input.dtype()),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
offset += input.numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output.data(),
input.numel() / size_,
platform::ToNCCLDataType(input.dtype()),
opts.root_rank,
comm,
stream));
}
},
CommType::SCATTER,
sync_op,
use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::_ReduceScatterBase(
phi::DenseTensor& out_tensor,
phi::DenseTensor& in_tensor,
const ReduceScatterOptions& opts) {
// auto tensor = out_tensors.back();
PADDLE_ENFORCE_EQ(
out_tensor.dtype(),
in_tensor.dtype(),
      platform::errors::InvalidArgument(
          "Input and output tensors should have the same dtype."));
PADDLE_ENFORCE_EQ(
out_tensor.numel() * size_,
in_tensor.numel(),
      platform::errors::InvalidArgument(
          "The input tensor's size must equal the output tensor's size "
          "multiplied by the world size."));
auto inputs = std::vector<phi::DenseTensor>{in_tensor};
auto outputs = std::vector<phi::DenseTensor>{out_tensor};
return Collective(
inputs,
outputs,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
platform::CUDADeviceGuard cuda_guard;
cuda_guard.SetDevice(output.place());
memory::RecordStream(output.Holder(), stream);
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
input.data(),
output.data(),
output.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
comm,
stream));
},
CommType::REDUCE_SCATTER);
}
void ProcessGroupNCCL::GroupStart() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
}
void ProcessGroupNCCL::GroupEnd() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
}
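// Return the raw communicator cached for a single-place key; a collective must
// already have created the cache entry for this place.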
ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
std::vector<Place> places = {place};
const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
PADDLE_ENFORCE_NE(iter,
places_to_ncclcomm_.end(),
platform::errors::InvalidArgument(
"Cannot find nccl comm in process group."));
return iter->second[0]->GetNcclComm();
}
phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext(
const Place& place) const {
return GetDeviceContext(place, /*use_calc_stream*/ false);
}
phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext(
const Place& place, bool use_calc_stream) const {
if (use_calc_stream) {
return platform::DeviceContextPool::Instance().Get(place);
} else {
std::vector<Place> places = {place};
const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places));
PADDLE_ENFORCE_NE(iter,
places_to_ctx_.end(),
platform::errors::InvalidArgument(
"Cannot find device context in process group."));
return iter->second[0].get();
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
constexpr const char* NCCL_BACKEND_NAME = "NCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
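// A minimal usage sketch (illustrative only): it assumes `store` is an
// already-initialized std::shared_ptr<Store> shared by all ranks (e.g. a
// TCPStore), one GPU per rank indexed by `rank`, and `tensors` holding one
// GPU DenseTensor on this rank's device.
//
//   auto pg = std::make_shared<ProcessGroupNCCL>(
//       store, rank, world_size, phi::GPUPlace(rank), /*gid=*/0);
//   auto task = pg->AllReduce(tensors, tensors, AllreduceOptions());
//   task->Synchronize();  // block until the all-reduce has completed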
class ProcessGroupNCCL : public ProcessGroupStream {
public:
class NCCLTask : public ProcessGroupStream::TaskStream,
public std::enable_shared_from_this<NCCLTask> {
public:
NCCLTask(const std::vector<Place>& places,
int rank,
             CommType comm_type,
const std::vector<phi::DenseTensor>& inputs);
NCCLTask(const std::vector<Place>& places,
int rank,
CommType comm_type,
const std::vector<phi::DenseTensor>& inputs,
bool sync_op,
bool use_calc_stream);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<phi::DenseTensor>& outputs); // NOLINT
virtual ~NCCLTask();
std::vector<EventManager> control_events_;
std::vector<phi::DenseTensor> barrierTensors_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<NCCLCommManager>> ncclComms_;
std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
private:
};
ProcessGroupNCCL(const std::shared_ptr<Store>& store,
int rank,
int size,
const platform::Place& place,
int gid);
const std::string GetBackendName() const override {
return std::string(NCCL_BACKEND_NAME);
}
phi::DeviceContext* GetDeviceContext(const Place& place) const override;
phi::DeviceContext* GetDeviceContext(const Place& place,
bool use_calc_stream) const override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const AllreduceOptions& options,
bool sync_op,
bool use_calc_stream) override;
// TODO(liyurui): This API will be moved later
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Barrier(
const BarrierOptions& = BarrierOptions()) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors,
int dst_rank,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank) override;
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors,
int src_rank,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
int dst_rank,
int64_t offset,
int64_t length) override;
std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor& tensors,
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
int src_rank,
int64_t offset,
int64_t length) override;
std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor& tensors,
int src_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length) override;
std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) override;
std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceScatterOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts,
bool sync_op,
bool use_calc_stream) override;
std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) override;
static void GroupStart();
static void GroupEnd();
ncclComm_t NCCLComm(const Place& place) const;
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places,
int rank,
CommType op_type,
const std::vector<phi::DenseTensor>& inputs);
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
const std::vector<Place>& places,
int rank,
CommType op_type,
const std::vector<phi::DenseTensor>& inputs,
bool sync_op,
bool use_calc_stream);
protected:
std::shared_ptr<Store> store_;
std::shared_ptr<NCCLCommManager> nccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLCommManager>>>
places_to_ncclcomm_;
std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
places_to_ctx_;
std::set<int> used_place_ids_;
private:
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, // NOLINT
int root, // NOLINT
int server_fd);
void BroadcastUniqueNCCLID(std::vector<ncclUniqueId>& nccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType op_type);
template <typename Fn>
std::shared_ptr<ProcessGroupStream::Task> Collective(
std::vector<phi::DenseTensor>& inputs, // NOLINT
std::vector<phi::DenseTensor>& outputs, // NOLINT
Fn fn,
CommType comm_type,
bool sync_op,
bool use_calc_stream);
template <typename Fn>
void Collective(const phi::DenseTensor*,
phi::DenseTensor*,
Fn fn,
CommType op_type);
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> PointToPoint(
std::vector<phi::DenseTensor>& tensors, // NOLINT
Fn fn,
int dst_rank,
CommType op_type);
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> PointToPoint(
std::vector<phi::DenseTensor>& tensors, // NOLINT
Fn fn,
int dst_rank,
CommType op_type,
bool sync_op,
bool use_calc_stream);
void CreateNCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
void CheckSplitSizes(std::vector<int64_t>* split_sizes,
std::vector<int64_t> tensor_shape);
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
namespace paddle {
namespace distributed {
ProcessGroupStream::ProcessGroupStream(int rank,
int size,
const platform::Place& place,
int gid)
: ProcessGroup(rank, size, place, gid) {}
phi::DeviceContext* ProcessGroupStream::GetDeviceContext(
const Place& place, bool use_calc_stream) const {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support getting the device context.",
      GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
bool sync_op) {
return AllGather(input_tensors,
output_tensors,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support all_gather.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
const AllreduceOptions& options,
bool sync_op) {
return AllReduce(input_tensors,
output_tensors,
options,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
const AllreduceOptions& options,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support all_reduce.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op) {
return AllToAll(in_tensors,
out_tensors,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAll(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support alltoall.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes,
bool sync_op) {
return AllToAllSingle(in_tensors,
out_tensors,
in_sizes,
out_sizes,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support alltoall_single.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts,
bool sync_op) {
return Broadcast(in_tensors,
out_tensors,
opts,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Broadcast(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support broadcast.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts,
bool sync_op) {
return Reduce(in_tensors,
out_tensors,
opts,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support reduce.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceScatterOptions& opts,
bool sync_op) {
return ReduceScatter(in_tensors,
out_tensors,
opts,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceScatterOptions& opts,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support reduce_scatter.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts,
bool sync_op) {
return Scatter(in_tensors,
out_tensors,
opts,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support scatter.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
std::vector<phi::DenseTensor>& tensors, int dst_rank, bool sync_op) {
return Send(tensors,
dst_rank,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
std::vector<phi::DenseTensor>& tensors,
int dst_rank,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support send.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
phi::DenseTensor& tensors,
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op) {
return Send_Partial(tensors,
dst_rank,
offset,
length,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
phi::DenseTensor& tensors,
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support send_partial.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
std::vector<phi::DenseTensor>& tensors, int src_rank, bool sync_op) {
return Recv(tensors,
src_rank,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
std::vector<phi::DenseTensor>& tensors,
int src_rank,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support recv.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
phi::DenseTensor& tensors,
int src_rank,
int64_t offset,
int64_t length,
bool sync_op) {
return Recv_Partial(tensors,
src_rank,
offset,
length,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
phi::DenseTensor& tensors,
int src_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support recv_partial.", GetBackendName()));
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length,
bool sync_op) {
return AllGather_Partial(in_tensors,
out_tensors,
offset,
length,
sync_op,
/*use_calc_stream*/ false);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support all_gather_partial.", GetBackendName()));
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace paddle {
namespace distributed {
// NOTE(liyurui): Notice that some backends use `stream` as an abstraction of a
// hardware resource. We provide this base class to allow users to put
// communications on the calculation stream. In some scenarios, we found this
// saves the time spent switching streams. An illustrative usage sketch is
// given after the class definition below.
class ProcessGroupStream : public ProcessGroup {
public:
class TaskStream : public ProcessGroup::Task {
public:
// TODO(liyurui): This constructor is temporary here for compatible reason,
// will be deleted soon.
TaskStream(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type)
: Task(rank, inputs, comm_type) {}
TaskStream(int rank,
const std::vector<phi::DenseTensor>& inputs,
CommType comm_type,
bool sync_op,
bool use_calc_stream)
: Task(rank, inputs, comm_type, sync_op),
use_calc_stream_(use_calc_stream) {}
virtual ~TaskStream() = default;
protected:
bool UseCalcStream() const { return use_calc_stream_; }
private:
bool use_calc_stream_{false};
};
ProcessGroupStream(int rank, int size, const platform::Place& place, int gid);
virtual ~ProcessGroupStream() = default;
virtual phi::DeviceContext* GetDeviceContext(const Place& place,
bool use_calc_stream) const;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
const AllreduceOptions& options,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& input_tensors, // NOLINT
std::vector<phi::DenseTensor>& output_tensors, // NOLINT
const AllreduceOptions& options,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
std::vector<int64_t>& in_sizes, // NOLINT
std::vector<int64_t>& out_sizes, // NOLINT
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
std::vector<int64_t>& in_sizes, // NOLINT
std::vector<int64_t>& out_sizes, // NOLINT
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const BroadcastOptions& opts,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ReduceOptions& opts,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ReduceOptions& opts,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ReduceScatterOptions& opts,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ReduceScatterOptions& opts,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ScatterOptions& opts,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
const ScatterOptions& opts,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, // NOLINT
int dst_rank,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& tensors, // NOLINT
int dst_rank,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor& tensors, // NOLINT
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
phi::DenseTensor& tensors, // NOLINT
int dst_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, // NOLINT
int src_rank,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& tensors, // NOLINT
int src_rank,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor& tensors, // NOLINT
int src_rank,
int64_t offset,
int64_t length,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
phi::DenseTensor& tensors, // NOLINT
int src_rank,
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream);
std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
int64_t offset,
int64_t length,
bool sync_op) override;
virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
int64_t offset,
int64_t length,
bool sync_op,
bool use_calc_stream);
};
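// Illustrative sketch (not part of the original file): how a caller could use
// the interface above to queue an allreduce directly on the calculation
// stream. The helper name `ExampleAllReduceOnCalcStream` is hypothetical and
// only documents the meaning of the sync_op / use_calc_stream flags.
inline std::shared_ptr<ProcessGroup::Task> ExampleAllReduceOnCalcStream(
    ProcessGroupStream* pg,
    std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  // sync_op = false: the caller decides when to wait on the returned task.
  // use_calc_stream = true: the collective reuses the calculation stream,
  // so no stream switch is needed.
  return pg->AllReduce(tensors, tensors, opts,
                       /*sync_op=*/false,
                       /*use_calc_stream=*/true);
}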
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
namespace paddle {
namespace distributed {
// TODO(shenliang03): To support AVG for reduce
enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT };
struct AllreduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
struct BroadcastOptions {
int source_rank = 0;
int source_root = 0;
};
struct BarrierOptions {
std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
struct ReduceScatterOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace paddle {
namespace distributed {
template <typename DeviceContext, typename T>
struct ConcatDenseTensor {
void operator()(const DeviceContext *context,
const std::vector<phi::DenseTensor> &in,
phi::DenseTensor *out,
int axis = 0) {
phi::funcs::ConcatFunctor<DeviceContext, T> concat_functor;
concat_functor(*context, in, axis, out);
}
};
template <typename DeviceContext, typename T>
struct SplitDenseTensor {
void operator()(const DeviceContext *context,
const phi::DenseTensor &in,
std::vector<phi::DenseTensor *> *out,
int axis = 0) {
std::vector<const phi::DenseTensor *> shape_refer;
shape_refer.reserve(out->size());
for (auto *p_tensor : *out) {
shape_refer.emplace_back(p_tensor);
}
phi::funcs::SplitFunctor<DeviceContext, T> split_functor;
split_functor(*context, in, shape_refer, axis, out);
}
};
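// Usage sketch (illustrative only; tensors, sizes and the device context are
// assumed to be set up elsewhere): fuse two dense tensors into one buffer and
// split the buffer back again, e.g. around a fused collective.
//
//   const phi::CPUContext *ctx = ...;                // any device context
//   std::vector<phi::DenseTensor> parts = {t0, t1};  // t0: [2], t1: [3]
//   phi::DenseTensor fused;                          // preallocated as [5]
//   ConcatDenseTensor<phi::CPUContext, float>()(ctx, parts, &fused);
//   std::vector<phi::DenseTensor *> outs = {&t0, &t1};
//   SplitDenseTensor<phi::CPUContext, float>()(ctx, fused, &outs);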
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <typename T>
struct ConcatDenseTensor<platform::CustomDeviceContext, T> {
void operator()(const platform::CustomDeviceContext *context,
const std::vector<phi::DenseTensor> &in,
phi::DenseTensor *out,
int axis = 0) {
auto *out_data = out->data<T>();
auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace());
size_t offset = 0;
for (const auto &tensor : in) {
const auto *in_data = tensor.data<T>();
auto sz = tensor.numel() * sizeof(T);
device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr);
offset += sz;
}
}
};
template <typename T>
struct SplitDenseTensor<platform::CustomDeviceContext, T> {
void operator()(const platform::CustomDeviceContext *context,
const phi::DenseTensor &in,
std::vector<phi::DenseTensor *> *out,
int axis = 0) {
auto *in_data = in.data<T>();
auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace());
size_t offset = 0;
for (auto *p_tensor : *out) {
auto *out_data = p_tensor->data<T>();
auto sz = p_tensor->numel() * sizeof(T);
device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr);
offset += sz;
}
}
};
#endif
template <typename DeviceContext>
void ConcatDenseTensorWithType(const DeviceContext *dev_ctx,
const std::vector<phi::DenseTensor> &t_list,
phi::DenseTensor *p_out,
phi::DataType type) {
switch (type) {
case phi::DataType::BOOL:
ConcatDenseTensor<DeviceContext, bool>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::UINT8:
ConcatDenseTensor<DeviceContext, uint8_t>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::INT8:
ConcatDenseTensor<DeviceContext, int8_t>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::INT32:
ConcatDenseTensor<DeviceContext, int32_t>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::INT64:
ConcatDenseTensor<DeviceContext, int64_t>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::FLOAT16:
ConcatDenseTensor<DeviceContext, platform::float16>()(
dev_ctx, t_list, p_out);
break;
case phi::DataType::FLOAT32:
ConcatDenseTensor<DeviceContext, float>()(dev_ctx, t_list, p_out);
break;
case phi::DataType::FLOAT64:
ConcatDenseTensor<DeviceContext, double>()(dev_ctx, t_list, p_out);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors.", type));
}
}
template <typename DeviceContext>
void SplitDenseTensorWithType(const DeviceContext *dev_ctx,
const phi::DenseTensor &t_in,
std::vector<phi::DenseTensor *> *p_list,
phi::DataType type) {
switch (type) {
case phi::DataType::BOOL:
SplitDenseTensor<DeviceContext, bool>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::UINT8:
SplitDenseTensor<DeviceContext, uint8_t>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::INT8:
SplitDenseTensor<DeviceContext, int8_t>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::INT32:
SplitDenseTensor<DeviceContext, int32_t>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::INT64:
SplitDenseTensor<DeviceContext, int64_t>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::FLOAT16:
SplitDenseTensor<DeviceContext, platform::float16>()(
dev_ctx, t_in, p_list);
break;
case phi::DataType::FLOAT32:
SplitDenseTensor<DeviceContext, float>()(dev_ctx, t_in, p_list);
break;
case phi::DataType::FLOAT64:
SplitDenseTensor<DeviceContext, double>()(dev_ctx, t_in, p_list);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors.", type));
}
}
void ConcatTensor(const phi::DeviceContext *dev_ctx,
const std::vector<phi::DenseTensor> &tensor_list,
const experimental::Tensor *tensor) {
auto *dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl()).get();
const auto &place = dev_ctx->GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ConcatDenseTensorWithType(static_cast<const phi::GPUContext *>(dev_ctx),
tensor_list,
dense_tensor,
tensor->dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat tensors since it's not compiled with GPU "
        "support. Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
ConcatDenseTensorWithType(
static_cast<const platform::CustomDeviceContext *>(dev_ctx),
tensor_list,
dense_tensor,
tensor->dtype());
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat tensor since it's not compiled with "
"CUSTOM_DEVICE, please recompile or reinstall Paddle with "
"CUSTOM_DEVICE support."));
#endif
} else if (platform::is_cpu_place(place)) {
ConcatDenseTensorWithType(static_cast<const phi::CPUContext *>(dev_ctx),
tensor_list,
dense_tensor,
tensor->dtype());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Concat tensor not supported on place (%s)", place));
}
}
void SplitTensor(const phi::DeviceContext *dev_ctx,
const phi::DenseTensor &tensor,
const std::vector<experimental::Tensor> *tensor_list) {
std::vector<phi::DenseTensor *> dense_list;
for (auto &tensor : *tensor_list) {
auto p_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl()).get();
dense_list.emplace_back(p_tensor);
}
const auto &place = dev_ctx->GetPlace();
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
SplitDenseTensorWithType(static_cast<const phi::GPUContext *>(dev_ctx),
tensor,
&dense_list,
tensor.dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split tensor since it's not compiled with GPU "
        "support. Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
SplitDenseTensorWithType(
static_cast<const platform::CustomDeviceContext *>(dev_ctx),
tensor,
&dense_list,
tensor.dtype());
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split tensor since it's not compiled with CUSTOM_DEVICE, "
"please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
} else if (platform::is_cpu_place(place)) {
SplitDenseTensorWithType(static_cast<const phi::CPUContext *>(dev_ctx),
tensor,
&dense_list,
tensor.dtype());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split tensor not supported on place (%s)", place));
}
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
namespace paddle {
namespace distributed {
static Backend TransToBackend(platform::Place place) {
static const std::map<phi::AllocationType, Backend> type_backend = {
{phi::AllocationType::GPU, Backend::GPU},
{phi::AllocationType::CPU, Backend::CPU},
};
phi::AllocationType type = place.GetType();
auto it = type_backend.find(type);
PADDLE_ENFORCE_EQ(it != type_backend.end(),
true,
platform::errors::InvalidArgument(
"Place type (%s) is not supported. ", place));
return it->second;
}
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor> tensors,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices) {
PADDLE_ENFORCE_EQ(
tensors.size(),
is_sparse_gradient.size(),
platform::errors::PreconditionNotMet(
"tensors len must be equal to is_sparse_gradient len, but "
"[%lu] != [%lu]",
tensors.size(),
is_sparse_gradient.size()));
auto check_perm = [](const std::vector<int64_t> &x) -> bool {
size_t len = x.size();
std::vector<size_t> cnt(len, 0);
for (size_t i = 0; i < len; ++i) {
if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
return false;
}
cnt[x[i]]++;
}
return true;
};
PADDLE_ENFORCE_EQ(true,
check_perm(tensor_indices),
platform::errors::PreconditionNotMet(
"tensor_indices must be a permutation from 0 to %lu",
tensor_indices.size()));
// the return vector
std::vector<std::vector<size_t>> res;
// Key: the var type
// Value: should use which index in group_size_limits for group size limit
std::map<experimental::DataType, size_t> group_limit_index;
// Key: the var type
// Value: <the var index in input tensors, total numel in this group>
std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
next_group;
for (size_t i = 0; i < tensors.size(); ++i) {
const auto &var = tensors[i];
size_t tensor_real_index = i;
if (!tensor_indices.empty()) {
tensor_real_index = tensor_indices[i];
}
if (is_sparse_gradient[tensor_real_index]) {
      // keep each sparse var in its own group
res.push_back({tensor_real_index});
continue;
}
const auto &var_dtype = var.dtype();
VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
auto &group_info = next_group[var_dtype];
int64_t var_size = -1;
if (var.is_dense_tensor()) {
var_size =
std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
} else {
VLOG(3) << "var " << var.name()
<< " is not tensor or selected_rows, so skip it";
continue;
}
group_info.first.push_back(tensor_real_index);
group_info.second += experimental::SizeOf(var_dtype) * var_size;
// group_info.second += framework::SizeOfType(var_dtype) * var_size;
if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
// means it is the first var of var_dtype
group_limit_index[var_dtype] = 0;
}
auto &cur_limit_index = group_limit_index[var_dtype];
if (group_info.second >= group_size_limits[cur_limit_index]) {
// exceed group capacity and create a new group
res.emplace_back(std::move(group_info.first));
group_info = std::pair<std::vector<size_t>, size_t>();
cur_limit_index =
(std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
}
}
// add the final groups
for (auto &e : next_group) {
auto &group_info = e.second;
if (!group_info.first.empty()) {
res.emplace_back(std::move(group_info.first));
}
}
for (const auto &group_index : res) {
PADDLE_ENFORCE_NE(
group_index.empty(),
true,
platform::errors::PreconditionNotMet(
"AssignGroupBySize construct empty group, please check."));
}
if (tensor_indices.empty()) {
std::sort(res.begin(),
res.end(),
[](const std::vector<size_t> &x, const std::vector<size_t> &y) {
return x.front() < y.front();
});
}
return res;
}
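// Worked example (illustrative sizes, not from the original source): with
//   tensors            = [fp32 4MB, fp32 20MB, fp32 10MB, fp16 2MB]
//   is_sparse_gradient = [false, false, false, false]
//   group_size_limits  = {25MB}
//   tensor_indices     = {}
// the fp32 tensors accumulate until the 25MB limit is crossed, so indices
// {0, 1, 2} form one group (the tensor that crosses the limit is still
// included), and the fp16 tensor forms its own group {3} because grouping is
// done per dtype. The result is {{0, 1, 2}, {3}}. A sparse gradient would
// always get a singleton group of its own.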
template <typename DeviceContext, typename T>
struct ConcatTensorsForAllReduce {
void operator()(const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents) {
operators::math::ConcatFunctor<DeviceContext, T> concat_functor_;
concat_functor_(
context,
dense_tensors_,
0,
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get());
}
};
template <typename DeviceContext, typename T>
struct SplitTensorsForAllReduce {
void operator()(const DeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors) {
auto *in =
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get();
std::vector<phi::DenseTensor *> outs;
std::vector<const phi::DenseTensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<DeviceContext, T> split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
};
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// note(wangran16): A temporary solution for all backends.
template <typename T>
struct ConcatTensorsForAllReduce<platform::CustomDeviceContext, T> {
void operator()(const platform::CustomDeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents) {
phi::DeviceGuard guard(context.GetPlace());
auto *out =
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get();
uint8_t *out_data = reinterpret_cast<uint8_t *>(out->data<T>());
auto *device = phi::DeviceManager::GetDeviceWithPlace(context.GetPlace());
size_t offset = 0;
for (const auto &tensor : dense_tensors_) {
const uint8_t *in_data =
reinterpret_cast<const uint8_t *>(tensor.data<T>());
auto sz = tensor.numel() * sizeof(T);
device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr);
offset += sz;
}
}
};
template <typename T>
struct SplitTensorsForAllReduce<platform::CustomDeviceContext, T> {
void operator()(const platform::CustomDeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors) {
auto *in =
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get();
uint8_t *in_data = reinterpret_cast<uint8_t *>(in->data<T>());
auto *device = phi::DeviceManager::GetDeviceWithPlace(context.GetPlace());
size_t offset = 0;
for (auto &tensor : *p_dense_tensors) {
uint8_t *out_data = reinterpret_cast<uint8_t *>(tensor.data<T>());
auto sz = tensor.numel() * sizeof(T);
device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr);
offset += sz;
}
}
};
#endif
// context is used to select the stream for concat
template <typename DeviceContext>
static void ConcatTensorsWithType(
const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents,
phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
ConcatTensorsForAllReduce<DeviceContext, platform::float16>()(
context, dense_tensors_, p_dense_contents);
break;
case phi::DataType::FLOAT32:
ConcatTensorsForAllReduce<DeviceContext, float>()(
context, dense_tensors_, p_dense_contents);
break;
case phi::DataType::FLOAT64:
ConcatTensorsForAllReduce<DeviceContext, double>()(
context, dense_tensors_, p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
type));
}
}
// context is used to select the stream for split
template <typename DeviceContext>
static void SplitTensorsWithType(const DeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors,
phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
SplitTensorsForAllReduce<DeviceContext, platform::float16>()(
context, p_dense_contents, p_dense_tensors);
break;
case phi::DataType::FLOAT32:
SplitTensorsForAllReduce<DeviceContext, float>()(
context, p_dense_contents, p_dense_tensors);
break;
case phi::DataType::FLOAT64:
SplitTensorsForAllReduce<DeviceContext, double>()(
context, p_dense_contents, p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
type));
}
}
void EagerGroup::ConcatTensors(const platform::Place &place) {
dense_contents_ =
paddle::experimental::empty(IntArray({all_length_}), dtype_, place);
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with NCCL. "
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto *default_ctx = static_cast<platform::CustomDeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with "
        "CUSTOM_DEVICE. "
        "Please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Concat grad tensor not supported on place (%s)", place));
}
}
void EagerGroup::SplitTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split grad tensor since it's not compiled with NCCL. "
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto *default_ctx = static_cast<platform::CustomDeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split grad tensor since it's not compiled with "
        "CUSTOM_DEVICE. "
        "Please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", place));
}
}
EagerReducer::EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits,
bool find_unused_parameters)
: tensors_(tensors),
group_indices_(group_indices),
is_sparse_gradient_(is_sparse_gradient),
process_group_(process_group),
group_size_limits_(group_size_limits),
find_unused_vars_each_step_(find_unused_parameters) {
VLOG(3) << "Start construct the Reducer ...";
nranks_ = process_group_->GetSize();
// initialize groups
InitializeGroups(group_indices);
for (size_t global_var_index = 0; global_var_index < tensors_.size();
++global_var_index) {
auto tensor = tensors_[global_var_index];
auto reduce_hook = [=](void) -> void {
this->AddDistHook(global_var_index);
};
const auto &grad_node = GetGradNodeFromTensor(&tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
        paddle::platform::errors::Fatal("Detected NULL grad_node. "
                                        "A leaf tensor should hold a grad_node "
                                        "of type GradNodeAccumulation."));
const auto &accumulation_grad_node =
std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
accumulation_grad_node->RegisterReduceHook(
std::make_shared<egr::CppVoidHook>(reduce_hook));
gradnode_index_map_[grad_node.get()] = global_var_index;
}
vars_marked_ready_.resize(tensors_.size(), false);
local_used_vars_.resize(tensors_.size(), 0);
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
IntArray({static_cast<int32_t>(tensors_.size())}),
DataType::INT32,
inner_place_);
}
}
std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
Tensor *tensor) {
auto *autograd_meta = tensor->get_autograd_meta();
const auto &grad_node =
static_cast<egr::AutogradMeta *>(autograd_meta)->GetMutableGradNode();
return grad_node;
}
void EagerReducer::InitializeGroups(
const std::vector<std::vector<size_t>> &group_indices) {
VLOG(3) << "Start initialize groups ..";
// clear the group
groups_.clear();
groups_.reserve(group_indices.size());
variable_locators_.clear();
variable_locators_.resize(tensors_.size());
auto group_nums = group_indices.size();
for (size_t group_index = 0; group_index < group_nums; ++group_index) {
const auto &tensor_indices_ = group_indices[group_index];
PADDLE_ENFORCE_GT(
tensor_indices_.size(),
0,
platform::errors::PreconditionNotMet(
"The number of group[%d]'s elements is 0.", group_index));
EagerGroup group;
    // It's only used to check whether the group is sparse or dense
auto first_var = tensors_[tensor_indices_.front()];
if (tensor_indices_.size() == 1 &&
is_sparse_gradient_[tensor_indices_.front()]) {
// process the sparse gradient. one sparse, one group
group.dtype_ = first_var.dtype();
group.is_sparse_ = true;
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
}
    // map tensors to this group by TensorLocator
size_t inside_group_index = 0;
for (const auto var_index : tensor_indices_) {
TensorLocator tensor_locator;
tensor_locator.group_index = group_index;
tensor_locator.inside_group_index = inside_group_index++;
variable_locators_[var_index] = tensor_locator;
}
group.tensor_indices_ = std::move(tensor_indices_);
groups_.emplace_back(std::move(group));
VLOG(3) << "The Group[" << group_index << "]:" << groups_.back();
}
}
void EagerReducer::InitializeDenseGroups(
const std::vector<size_t> &tensor_indices_, EagerGroup *p_group) {
VLOG(3) << "InitializeDenseGroups.";
int64_t all_length = 0;
for (size_t index = 0; index < tensor_indices_.size(); ++index) {
auto tensor_index = tensor_indices_[index];
auto &tensor = tensors_[tensor_index];
auto &tensor_name = tensor.name();
PADDLE_ENFORCE_EQ(is_sparse_gradient_[tensor_index],
false,
platform::errors::PreconditionNotMet(
"Tensor %s's GRAD must be Tensor, but received "
"GRAD is SelectedRows",
tensor_name));
PADDLE_ENFORCE_EQ(tensor.initialized(),
true,
platform::errors::PreconditionNotMet(
"Tensor %s is not initialized.", tensor_name));
const auto size = tensor.numel();
PADDLE_ENFORCE_GT(
size,
0,
platform::errors::PreconditionNotMet(
"The number of tensor %s's elements is 0.", tensor_name));
all_length += size;
p_group->length_.push_back(size);
// for concat operator
p_group->origin_shapes_.push_back(IntArray(tensor.shape()));
p_group->dense_tensors_.push_back(phi::DenseTensor());
const auto &dtype = tensor.dtype();
const auto &inner_place = tensor.impl()->place();
if (index > 0) {
PADDLE_ENFORCE_EQ(dtype,
p_group->dtype_,
platform::errors::PreconditionNotMet(
"Tensor %s has unexpected dtype.", tensor_name));
} else {
p_group->dtype_ = dtype;
inner_place_ = inner_place;
}
}
p_group->all_length_ = all_length;
}
void EagerReducer::TraverseBackwardGraph(const std::vector<Tensor> &outputs) {
std::queue<egr::GradNodeBase *> queue;
std::set<egr::GradNodeBase *> visited;
for (const auto &output : outputs) {
auto *auto_grad_meta =
static_cast<egr::AutogradMeta *>(output.get_autograd_meta());
if (!auto_grad_meta) continue;
auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
auto_grad_meta->StopGradient()) {
continue;
}
egr::GradNodeBase *grad_node = shared_grad_node.get();
queue.emplace(grad_node);
}
while (!queue.empty()) {
egr::GradNodeBase *node = queue.front();
queue.pop();
const paddle::small_vector<std::vector<egr::GradSlotMeta>,
egr::kSlotSmallVectorSize> &metas =
node->OutputMeta();
for (size_t i = 0; i < metas.size(); i++) {
for (size_t j = 0; j < metas[i].size(); j++) {
const egr::Edge &edge = metas[i][j].GetEdge();
auto next_node_shared = edge.GetMutableGradNode();
if (!next_node_shared || !next_node_shared.get()) {
continue;
}
auto *next_node = next_node_shared.get();
const bool was_inserted = visited.insert(next_node).second;
if (was_inserted) {
queue.emplace(next_node);
}
}
}
}
for (const auto &it : gradnode_index_map_) {
if (visited.count(it.first) == 0) {
unused_vars_.push_back(it.second);
VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
<< "Tensor " << tensors_[it.second].name() << " at index "
<< it.second << " is marked as unused.";
}
}
}
void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
VLOG(3) << "after forward, then reset count for backward.";
grad_need_hooks_ = true;
next_group_ = 0;
std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) {
group.pending_ = group.tensor_indices_.size();
group.sparse_contents_ = Tensor();
});
// reinitialize vars_marked_ready_ for next iteration
vars_marked_ready_.clear();
vars_marked_ready_.resize(tensors_.size(), false);
PADDLE_ENFORCE_EQ(
groups_need_finalize_,
false,
platform::errors::PreconditionNotMet(
"A serious error has occurred here. Please "
"set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have "
"set, There may be several reasons for this error: "
"1) Please note that all forward outputs derived from the module "
"parameters must participate in the calculation of losses and "
"subsequent gradient calculations. If not, the wrapper will hang, "
"waiting for autograd to generate gradients for these parameters. "
"you can use detach or stop_gradient to make the unused parameters "
"detached from the autograd graph. "
"2) Used multiple forwards and one backward. You may be able to wrap "
"multiple forwards in a model."));
// The first var to trigger the unused parameter
has_marked_unused_vars_ = false;
if (find_unused_vars_once_ || find_unused_vars_each_step_) {
unused_vars_.clear();
TraverseBackwardGraph(outputs);
// only check once in first step
find_unused_vars_once_ = false;
}
if (find_unused_vars_each_step_ && unused_vars_.empty()) {
LOG_FIRST_N(WARNING, 1)
<< "All parameters are involved in the backward pass. "
"It is recommended to set find_unused_parameters to False "
"to improve performance. However, if unused parameters "
"appear in subsequent iterative training, then an error "
"will occur. Please make it clear that in the subsequent "
"training, there will be no parameters that are not used "
"in the backward pass, and then set find_unused_parameters";
}
if (unused_vars_.size() == tensors_.size()) {
LOG_FIRST_N(WARNING, 1)
<< "There is no parameter in the device involved "
"in the backward calculation. If there are "
"parameters on other devices involved in the "
"backward, then a serious error will occur here.";
}
}
void EagerReducer::AddDistHook(size_t var_index) {
PADDLE_ENFORCE_LT(var_index,
variable_locators_.size(),
platform::errors::OutOfRange(
"Out of bounds variable index. it must be less"
"than %d, but it is %d",
variable_locators_.size(),
var_index));
// gradient synchronization is not required when grad_need_hooks_ is false.
if (!grad_need_hooks_) {
return;
}
VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
<< "@Grad] arrived and triggered disthook";
local_used_vars_[var_index] = 1;
if (!has_marked_unused_vars_) {
has_marked_unused_vars_ = true;
for (const auto unused_index : unused_vars_) {
MarkVarReady(unused_index, false);
}
}
MarkVarReady(var_index, true);
}
void EagerReducer::MarkVarReady(const size_t var_index,
const bool is_used_var) {
VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name()
<< "] is marked ready.";
  // It is an error if this var has already been marked ready.
if (vars_marked_ready_[var_index]) {
auto error_info = string::Sprintf(
"Error happened, when parameter[%d][%s] has been ready before. "
"Please set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have set, "
"there may be several reasons for this error: "
"1) In multiple reentrant backward phase, some parameters are reused."
"2) Using model parameters outside of forward function. Please "
"make sure that model parameters are not shared in concurrent "
"forward-backward passes.",
var_index,
tensors_[var_index].name());
PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
false,
platform::errors::PreconditionNotMet(error_info));
    error_info +=
        "3) Unused parameter retrieval is incorrect. "
        "The return value of forward will be used to retrieve "
        "the unused parameters of the entire model. The "
        "gradients of unused parameters will not be synchronized "
        "between multiple cards. However, if the unused "
        "parameters participate in the backward calculation "
        "again at a later time (e.g. after the forward function, "
        "the loss calculation uses the unused "
        "parameters of the forward and triggers backward), "
        "their gradients will be wrong.";
PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
true,
platform::errors::PreconditionNotMet(error_info));
} else {
vars_marked_ready_[var_index] = true;
}
groups_need_finalize_ = true;
const auto &var_locator = variable_locators_[var_index];
const auto group_index = var_locator.group_index;
const auto inside_group_index = var_locator.inside_group_index;
auto &group = groups_[group_index];
auto &group_tensor = group.dense_tensors_[inside_group_index];
const auto length = group.length_[inside_group_index];
if (!group.is_sparse_) {
if (is_used_var) {
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor =
static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
group_tensor
.ShareDataWith(*(
std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
.Resize({grad_tensor.numel()});
} else {
// TODO(shenliang03): maybe save the memory by avoiding tensor
// construction
if (!group_tensor.initialized()) {
group_tensor.Resize({static_cast<int64_t>(length)});
group_tensor.mutable_data(inner_place_, group.dtype_);
}
if (HasGrad(var_index)) {
VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad";
auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]);
group_tensor
.ShareDataWith(*(std::dynamic_pointer_cast<phi::DenseTensor>(
grad_tensor->impl())))
.Resize({length});
} else {
VLOG(3) << "Tensor[" << tensors_[var_index].name()
<< "] doesn't have grad";
auto *dev_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
group_tensor.Resize({static_cast<int64_t>(length)});
phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
}
}
} else {
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
// process sparse group
PADDLE_ENFORCE_EQ(
HasGrad(var_index),
true,
platform::errors::PreconditionNotMet(
"The sparse parameter[%d][%s] should have gradient. "
"Currently, DataParallel does not support sparse "
"parameters without generating gradients during training. "
"For example, if is_sparese=True is used in Embedding, "
"the current step of this parameter cannot generate gradient "
"because of stop_gradient/detatch, where error will occur.",
var_index,
tensors_[var_index].name()));
// need to check tensor type
PADDLE_ENFORCE_EQ(
grad_tensor.is_selected_rows(),
true,
platform::errors::PreconditionNotMet(
"The sparse parameter[%d][%s] must have a selectedrows gradient. "
"Before forward pass, the parameter type is inferred to be "
"SelectedRows, but after backward pass, its actual type becomes "
"LodTensor. It is currently not supported by DataParallel. "
"For example, if sparse embedding is used, and the weight of "
"embedding is shared with subsequent dense parameters, then "
"the parameter gradient of the embedding will be converted "
"to dense parameters.",
var_index,
tensors_[var_index].name()));
group.sparse_contents_.set_impl(grad_tensor.impl());
}
if (--group.pending_ == 0) {
// can start allreduce
MarkGroupReady(group_index);
}
if (next_group_ == groups_.size()) {
FinalizeBackward();
}
}
void EagerReducer::MarkGroupReady(size_t group_index) {
VLOG(3) << "Group[" << group_index << "] is ready";
PADDLE_ENFORCE_GE(
group_index,
next_group_,
platform::errors::PreconditionNotMet(
"The index of the incoming group must be greater "
"than or equal to the previously synchronized group index, "
"expect it to greater than or equal to %d, but got %d.",
next_group_,
group_index));
if (group_index > next_group_) {
VLOG(3) << "It will adjust the order of group in next batch automatically";
return;
}
for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
++next_group_) {
UNUSED auto &group = groups_[next_group_];
if (group.is_sparse_) {
AllReduceSparse(&group, next_group_);
} else {
FusedAllReduceSchedule(&group, next_group_);
}
}
}
bool EagerReducer::HasGrad(size_t var_index) {
auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]);
if (grad && grad->initialized()) {
return true;
} else {
return false;
}
}
void EagerReducer::ProcessUnusedDenseVars() {
// The calculation stream must be used here to
// avoid conflicts with communication.
VLOG(3) << "Local used vars : "
<< string::join_strings(local_used_vars_, ',');
const auto *dev_ctx =
platform::DeviceContextPool::Instance().Get(inner_place_);
auto *global_used_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
.get();
framework::TensorFromVector<int32_t>(
local_used_vars_, *dev_ctx, global_used_tensor);
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {global_used_vars_};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllReduce(in_out, in_out, opts)->Synchronize();
framework::TensorToVector<int>(
*global_used_tensor, *dev_ctx, &local_used_vars_);
dev_ctx->Wait();
  // Sync the compute stream to get the global used-var information,
  // which may affect performance.
VLOG(3) << "Global used vars : "
<< string::join_strings(local_used_vars_, ',');
for (const auto var_index : unused_vars_) {
const bool global_unused = (local_used_vars_[var_index] == 0);
// global used but local unused, set grad
VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
<< "Var [" << var_index << "] [" << tensors_[var_index].name()
<< "] global_unused: " << global_unused
<< " has grad: " << HasGrad(var_index);
if (!global_unused) {
VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
<< process_group_->GetRank() << "]";
const auto &var_locator = variable_locators_[var_index];
const auto group_index = var_locator.group_index;
const auto &group = groups_[group_index];
const auto inside_group_index = var_locator.inside_group_index;
auto &src_tensor = group.dense_tensors_[inside_group_index];
      // Sparse gradients are skipped here: find_unused_parameters is not
      // supported for sparse parameters.
if (group.is_sparse_) {
continue;
}
      // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
      // gradient accumulation can continue normally after clear_gradients(),
      // especially in cases involving complex control flow.
std::static_pointer_cast<egr::GradNodeAccumulation>(
GetGradNodeFromTensor(&tensors_[var_index]))
->SetFakeEmpty(false);
Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));
auto dest_var_base = tensors_[var_index];
auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
grad_tensor->copy_(grad_value, inner_place_, true);
grad_tensor->reshape(dest_var_base.shape());
}
}
}
void EagerReducer::FinalizeBackward() {
groups_need_finalize_ = false;
grad_need_hooks_ = false;
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.task->Synchronize();
}
}
for (auto &group : groups_) {
if (!group.is_sparse_) {
group.SplitTensors(inner_place_);
group.dense_contents_.reset();
}
}
if (find_unused_vars_each_step_) {
ProcessUnusedDenseVars();
local_used_vars_.clear();
local_used_vars_.resize(tensors_.size(), 0);
VLOG(3) << "ProcessUnusedDenseVars is finished.";
}
VLOG(3) << "In the batch, Reducer is finished.";
}
void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
const int curr_group_index) {
// The overall timeline: concat > div_nranks > allreduce > split
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";
// concat tensors
group->ConcatTensors(inner_place_);
// div nranks
paddle::experimental::scale_(
group->dense_contents_, 1.0 / nranks_, 0.0, false);
// all_reduce
std::vector<Tensor> reduce_tensors = {group->dense_contents_};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
group->task = process_group_->AllReduce(in_out, in_out, opts);
// split in FinalizeBackward()
}
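// Worked example (illustrative, 2 ranks): a group holding the local gradients
// g0 = [2, 4] and g1 = [6] is concatenated to [2, 4, 6], scaled by
// 1 / nranks = 0.5 to [1, 2, 3], and all-reduced with SUM against the other
// rank's scaled buffer, which yields the average of the per-rank gradients.
// The fused buffer is split back into g0 and g1 later in FinalizeBackward().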
void EagerReducer::AllReduceSparse(EagerGroup *group,
const int curr_group_index) {
// div nranks
Tensor sparse_tensor(group->sparse_contents_);
paddle::experimental::scale_(sparse_tensor, 1.0 / nranks_, 0.0, false);
VLOG(3) << "sparse_group [" << curr_group_index << "] start allreduce.";
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
if (platform::is_gpu_place(inner_place_)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with NCCL. "
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_custom_place(inner_place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
dev_ctx = static_cast<platform::CustomDeviceContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with "
        "CUSTOM_DEVICE. "
        "Please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
} else if (platform::is_cpu_place(inner_place_)) {
dev_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", inner_place_));
}
auto src = std::dynamic_pointer_cast<phi::SelectedRows>(
group->sparse_contents_.impl());
const auto &src_rows = src->rows();
const auto &rank_ = process_group_->GetRank();
const auto &size_ = process_group_->GetSize();
framework::Vector<int64_t> rows_num_vector(size_);
rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());
Tensor rows_num_tensor = paddle::experimental::empty(
IntArray({static_cast<int64_t>(size_)}), DataType::INT64, inner_place_);
auto *rows_num_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(rows_num_tensor.impl()).get();
framework::TensorFromVector<int64_t>(
rows_num_vector, *dev_ctx, rows_num_dense_tensor);
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {rows_num_tensor};
std::vector<phi::DenseTensor> in_out;
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllReduce(in_out, in_out, opts)->Synchronize();
framework::TensorToVector<int64_t>(
*rows_num_dense_tensor, *dev_ctx, &rows_num_vector);
dev_ctx->Wait();
const auto *cpu_rows_num_ptr = rows_num_vector.data();
auto rows_num = std::accumulate(
cpu_rows_num_ptr, cpu_rows_num_ptr + size_, static_cast<int64_t>(0));
VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',')
<< ", total rows number: " << rows_num
<< ", height: " << src->height();
dev_ctx->Wait();
Tensor src_value_tensor(std::make_shared<phi::DenseTensor>(src->value()));
std::vector<int64_t> dst_shape = src_value_tensor.shape();
if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) {
return row == cpu_rows_num_ptr[0];
})) {
    // During sparse communication, every card holds the same number of rows,
    // so allgather is used instead of broadcast to speed up the allreduce.
    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";
Tensor dst_rows_tensor =
paddle::experimental::empty(IntArray({static_cast<int64_t>(rows_num)}),
DataType::INT64,
inner_place_);
Tensor src_rows_tensor = paddle::experimental::empty(
IntArray({static_cast<int64_t>((*src).rows().size())}),
DataType::INT64,
inner_place_);
auto *src_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(src_rows_tensor.impl())
.get();
framework::TensorFromVector<int64_t>(
(*src).rows(), *dev_ctx, src_rows_dense_tensor);
std::vector<Tensor> src_rows_tensors = {src_rows_tensor};
std::vector<Tensor> dst_rows_tensors = {dst_rows_tensor};
std::vector<phi::DenseTensor> in;
std::vector<phi::DenseTensor> out;
for (auto &t : src_rows_tensors) {
in.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
for (auto &t : dst_rows_tensors) {
out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllGather(in, out)->Synchronize();
framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
auto *dst_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
.get();
framework::TensorToVector<int64_t>(
*dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
dev_ctx->Wait();
dst_shape[dst_shape.size() - 2] = rows_num;
auto dst_dense_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
paddle::experimental::full(
IntArray(dst_shape), 0, src_value_tensor.dtype(), inner_place_)
.impl());
auto dst =
std::make_shared<phi::SelectedRows>(dst_rows_vector, (*src).height());
*(dst->mutable_value()) = *dst_dense_tensor;
Tensor dst_value_tensor(std::make_shared<phi::DenseTensor>(dst->value()));
std::vector<Tensor> src_value_tensors = {src_value_tensor};
std::vector<Tensor> dst_value_tensors = {dst_value_tensor};
std::vector<phi::DenseTensor> src_dense;
std::vector<phi::DenseTensor> dst_dense;
for (auto &t : src_value_tensors) {
src_dense.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
for (auto &t : dst_value_tensors) {
dst_dense.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
process_group_->AllGather(src_dense, dst_dense)->Synchronize();
src->set_rows(dst_rows_vector);
*(src->mutable_value()) =
*(std::dynamic_pointer_cast<phi::DenseTensor>(dst_value_tensor.impl()));
} else {
std::vector<Tensor> rows_tensors;
std::vector<Tensor> values_tensors;
for (int i = 0; i < size_; ++i) {
std::vector<int64_t> value_tensor_shape = {
cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]};
Tensor rows_tensor = paddle::experimental::full(
IntArray({static_cast<int64_t>(cpu_rows_num_ptr[i])}),
0,
DataType::INT64,
inner_place_);
Tensor values_tensor = paddle::experimental::full(
IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_);
std::vector<phi::DenseTensor> rows_dense_vector;
std::vector<phi::DenseTensor> values_dense_vector;
if (i == rank_) {
auto *rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl())
.get();
framework::TensorFromVector<int64_t>(
src_rows, *dev_ctx, rows_dense_tensor);
values_tensor.set_impl(
std::make_shared<phi::DenseTensor>(src->value()));
}
rows_dense_vector.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl()));
values_dense_vector.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(values_tensor.impl()));
auto b_opts = BroadcastOptions();
b_opts.source_rank = i;
process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts);
process_group_
->Broadcast(values_dense_vector, values_dense_vector, b_opts)
->Wait();
rows_tensors.push_back(rows_tensor);
values_tensors.push_back(values_tensor);
}
Tensor dst_rows_tensor =
paddle::experimental::concat(rows_tensors, phi::Scalar(0));
framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
auto *dst_rows_dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
.get();
framework::TensorToVector<int64_t>(
*dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
src->set_rows(dst_rows_vector);
Tensor dst_values_tensor =
paddle::experimental::concat(values_tensors, phi::Scalar(0));
*(src->mutable_value()) = *(
std::dynamic_pointer_cast<phi::DenseTensor>(dst_values_tensor.impl()));
}
}
std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
const auto &tensors_ = group.tensor_indices_;
out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size()
<< "\n";
auto begin = tensors_.begin();
auto end = tensors_.end();
out << "[";
for (int i = 0; begin != end && i < 100; ++i, ++begin) {
if (i > 0) out << ' ';
out << *begin;
}
if (begin != end) {
out << " ...";
}
out << "]\n";
return out;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/string/string_helper.h"
namespace paddle {
namespace distributed {
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
using IntArray =
paddle::experimental::IntArrayBase<paddle::experimental::Tensor>;
using Backend = paddle::experimental::Backend;
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor>,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices = {});
class EagerGroup {
public:
Tensor dense_contents_;
Tensor sparse_contents_;
bool is_sparse_ = false;
// for concat kernel
std::vector<phi::DenseTensor> dense_tensors_;
std::vector<int64_t> length_;
int64_t all_length_{0};
std::vector<IntArray> origin_shapes_;
// Global indices of participating tensors in the group
std::vector<size_t> tensor_indices_;
// Number of params that haven't been ready. When it is 0, it means
// the group is ready.
size_t pending_ = -1;
  // data type shared by all tensors in the group
phi::DataType dtype_;
// help to sync
std::shared_ptr<ProcessGroup::Task> task;
  // the place is used to pick the device context (and stream) for concat
  void ConcatTensors(const platform::Place &);
  // the place is used to pick the device context (and stream) for split
  void SplitTensors(const platform::Place &);
friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
struct TensorLocator {
// record the index in groups_
size_t group_index;
size_t inside_group_index;
};
class EagerReducer {
public:
explicit EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits,
bool find_unused_parameters);
virtual ~EagerReducer() {}
std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);
void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
EagerGroup *p_group);
void PrepareForBackward(const std::vector<Tensor> &outputs);
void AddDistHook(size_t var_index);
void MarkVarReady(const size_t var_index, const bool is_used_var);
void MarkGroupReady(const size_t group_index);
void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
void AllReduceSparse(EagerGroup *group, const int curr_group_index);
void FinalizeBackward();
void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
void ProcessUnusedDenseVars();
bool HasGrad(size_t var_index);
private:
std::vector<Tensor> tensors_;
std::vector<std::vector<size_t>> group_indices_;
std::vector<bool> is_sparse_gradient_;
std::shared_ptr<distributed::ProcessGroup> process_group_;
std::vector<size_t> group_size_limits_;
std::vector<EagerGroup> groups_;
std::vector<TensorLocator> variable_locators_;
platform::Place inner_place_;
size_t next_group_ = 0;
int64_t nranks_ = -1;
bool grad_need_hooks_{false};
std::vector<bool> vars_marked_ready_;
std::vector<int32_t> local_used_vars_;
// The following variables help to track unused vars
std::vector<size_t> unused_vars_;
std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
bool has_marked_unused_vars_{false};
bool find_unused_vars_each_step_{false};
bool find_unused_vars_once_{true};
bool groups_need_finalize_{false};
Tensor global_used_vars_;
};
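// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header): it wires
// the declarations above together. Eager_AssignGroupBySize partitions the
// trainable tensors into fusion groups under the given byte limits, and the
// resulting group indices are handed to EagerReducer together with a
// ProcessGroup. The 1 MiB / 25 MiB limits and the helper name are assumptions
// made for this sketch, not values required by the header.
inline std::shared_ptr<EagerReducer> MakeEagerReducerSketch(
    const std::vector<Tensor> &params,
    const std::shared_ptr<ProcessGroup> &process_group) {
  std::vector<bool> is_sparse(params.size(), false);
  // e.g. cap the first group at 1 MiB and later groups at 25 MiB
  std::vector<size_t> group_size_limits = {1 << 20, 25 << 20};
  auto group_indices =
      Eager_AssignGroupBySize(params, is_sparse, group_size_limits);
  return std::make_shared<EagerReducer>(params,
                                        group_indices,
                                        is_sparse,
                                        process_group,
                                        group_size_limits,
                                        /*find_unused_parameters=*/false);
}
// ---------------------------------------------------------------------------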
} // namespace distributed
} // namespace paddle
cc_library(
afs_wrapper
SRCS afs_warpper.cc
DEPS fs ps_framework_proto)
#set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/common/afs_warpper.h"
#include "paddle/fluid/framework/io/fs.h"
namespace paddle {
namespace distributed {
// AfsClient impl
int AfsClient::initialize(const FsClientParameter& fs_client_param) {
// temporarily implemented with hdfs-client
return initialize(fs_client_param.hadoop_bin(),
fs_client_param.uri(),
fs_client_param.user(),
fs_client_param.passwd(),
fs_client_param.buffer_size());
}
int AfsClient::initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& user,
const std::string& passwd,
int buffer_size_param) {
return initialize(
hadoop_bin,
uri,
paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()),
buffer_size_param);
}
int AfsClient::initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& ugi,
int buffer_size_param) {
// temporarily implemented with hdfs-client
size_t buffer_size = 1L << 25; // 32MB
if (buffer_size_param > static_cast<int>(buffer_size)) {
buffer_size = buffer_size_param;
}
paddle::framework::hdfs_set_buffer_size(buffer_size);
paddle::framework::hdfs_set_command(paddle::string::format_string(
"2>>./hdfs_err.log %s fs -Dfs.default.name=%s -Dhadoop.job.ugi=%s "
"-Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=300000",
hadoop_bin.c_str(),
uri.c_str(),
ugi.c_str()));
return 0;
}
// open a file for reading ('r') or writing ('w')
std::shared_ptr<FsReadChannel> AfsClient::open_r(const FsChannelConfig& config,
uint32_t buffer_size,
int* err_no) {
std::shared_ptr<FsReadChannel> channel =
std::make_shared<FsReadChannel>(buffer_size);
std::shared_ptr<FILE> fp =
paddle::framework::fs_open_read(config.path, err_no, config.deconverter);
channel->open(fp, config);
return channel;
}
std::shared_ptr<FsWriteChannel> AfsClient::open_w(const FsChannelConfig& config,
uint32_t buffer_size,
int* err_no) {
std::shared_ptr<FsWriteChannel> channel =
std::make_shared<FsWriteChannel>(buffer_size);
std::shared_ptr<FILE> fp =
paddle::framework::fs_open_write(config.path, err_no, config.converter);
channel->open(fp, config);
return channel;
}
// remove the file(s) at path; path may be a glob pattern, such as 'part-000-*'
void AfsClient::remove(const std::string& path) {
return paddle::framework::fs_remove(path);
}
void AfsClient::remove_dir(const std::string& dir) {
return paddle::framework::fs_remove(dir);
}
// list files under path; path may be a directory with a glob pattern
std::vector<std::string> AfsClient::list(const std::string& path) {
return paddle::framework::fs_list(path);
}
// whether the path exists
bool AfsClient::exist(const std::string& dir) {
return paddle::framework::fs_exists(dir);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/the_one_ps.pb.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
struct FsDataConverter {
std::string converter;
std::string deconverter;
};
struct FsChannelConfig {
std::string path; // path of file
std::string converter; // data converter
std::string deconverter;
};
class FsReadChannel {
public:
FsReadChannel() : _buffer_size(0) {}
explicit FsReadChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
virtual ~FsReadChannel() {}
FsReadChannel(FsReadChannel&&) = delete;
FsReadChannel(const FsReadChannel&) = delete;
int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
_file = fp;
return 0;
}
inline int close() {
_file.reset();
return 0;
}
// reads one '\n'-terminated line into line_data (without the trailing
// newline); returns 0 on success and -1 (wrapped to uint32_t) at end of file
inline uint32_t read_line(std::string& line_data) { // NOLINT
line_data.clear();
char buffer = '\0';
size_t read_count = 0;
while (1 == fread(&buffer, 1, 1, _file.get()) && buffer != '\n') {
++read_count;
line_data.append(&buffer, 1);
}
if (read_count == 0 && buffer != '\n') {
return -1;
}
return 0;
}
private:
uint32_t _buffer_size;
FsChannelConfig _config;
std::shared_ptr<FILE> _file;
};
class FsWriteChannel {
public:
FsWriteChannel() : _buffer_size(0) {}
explicit FsWriteChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
virtual ~FsWriteChannel() {}
FsWriteChannel(FsWriteChannel&&) = delete;
FsWriteChannel(const FsWriteChannel&) = delete;
int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
_file = fp;
// the buffer has already been set in fs.cc
// if (_buffer_size != 0) {
// _buffer = std::shared_ptr<char>(new char[_buffer_size]);
// CHECK(0 == setvbuf(&*_file, _buffer.get(), _IOFBF, _buffer_size));
//}
return 0;
}
inline void flush() { return; }
inline int close() {
flush();
_file.reset();
return 0;
}
// writes `size` bytes followed by '\n'; returns 0 on success and -1 (wrapped
// to uint32_t) on a short write
inline uint32_t write_line(const char* data, uint32_t size) {
size_t write_count = fwrite_unlocked(data, 1, size, _file.get());
if (write_count != size) {
return -1;
}
write_count = fwrite_unlocked("\n", 1, 1, _file.get());
if (write_count != 1) {
return -1;
}
return 0;
}
inline uint32_t write_line(const std::string& data) {
return write_line(data.c_str(), data.size());
}
private:
uint32_t _buffer_size;
FsChannelConfig _config;
std::shared_ptr<FILE> _file;
std::shared_ptr<char> _buffer;
};
class AfsClient {
public:
AfsClient() {}
virtual ~AfsClient() {}
AfsClient(AfsClient&&) = delete;
AfsClient(const AfsClient&) = delete;
int initialize(const FsClientParameter& fs_client_param);
int initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& user,
const std::string& passwd,
int buffer_size_param = (1L << 25));
int initialize(const std::string& hadoop_bin,
const std::string& uri,
const std::string& ugi,
int buffer_size_param = (1L << 25));
// open a file for reading ('r') or writing ('w')
std::shared_ptr<FsReadChannel> open_r(const FsChannelConfig& config,
uint32_t buffer_size = 0,
int* err_no = nullptr);
std::shared_ptr<FsWriteChannel> open_w(const FsChannelConfig& config,
uint32_t buffer_size = 0,
int* err_no = nullptr);
// remove the file(s) at path; path may be a glob pattern, such as 'part-000-*'
void remove(const std::string& path);
void remove_dir(const std::string& dir);
// list files under path; path may be a directory with a glob pattern
std::vector<std::string> list(const std::string& path);
// whether the path exists
bool exist(const std::string& dir);
};
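// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header): writes a
// line through AfsClient and reads it back. The hadoop binary, URI, ugi and
// path strings are placeholders, not values prescribed by this header.
inline void AfsClientRoundTripSketch() {
  AfsClient client;
  client.initialize("/usr/local/hadoop/bin/hadoop",  // placeholder hadoop_bin
                    "hdfs://localhost:9000",         // placeholder uri
                    "user,passwd",                   // placeholder ugi
                    1 << 25);
  FsChannelConfig config;
  config.path = "/tmp/afs_demo/part-000";  // placeholder path
  int err_no = 0;
  // write one line, then read the file back line by line
  auto writer = client.open_w(config, /*buffer_size=*/0, &err_no);
  writer->write_line("hello afs");
  writer->close();
  auto reader = client.open_r(config, /*buffer_size=*/0, &err_no);
  std::string line;
  std::vector<std::string> lines;
  while (reader->read_line(line) == 0) {  // read_line returns 0 per line
    lines.push_back(line);
  }
  reader->close();
}
// ---------------------------------------------------------------------------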
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <algorithm>  // std::max
#include <cstdlib>    // free, posix_memalign
#include <new>        // placement new
#include <utility>    // std::forward
namespace paddle {
namespace distributed {
// Fast allocation and deallocation of objects by allocating them in chunks.
template <class T>
class ChunkAllocator {
public:
explicit ChunkAllocator(size_t chunk_size = 64) {
CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T)));
_chunk_size = chunk_size;
_chunks = NULL;
_free_nodes = NULL;
_counter = 0;
}
ChunkAllocator(const ChunkAllocator&) = delete;
~ChunkAllocator() {
while (_chunks != NULL) {
Chunk* x = _chunks;
_chunks = _chunks->next;
free(x);
}
}
template <class... ARGS>
T* acquire(ARGS&&... args) {
if (_free_nodes == NULL) {
create_new_chunk();
}
T* x = (T*)(void*)_free_nodes; // NOLINT
_free_nodes = _free_nodes->next;
new (x) T(std::forward<ARGS>(args)...);
_counter++;
return x;
}
void release(T* x) {
x->~T();
Node* node = (Node*)(void*)x; // NOLINT
node->next = _free_nodes;
_free_nodes = node;
_counter--;
}
size_t size() const { return _counter; }
private:
struct alignas(T) Node {
union {
Node* next;
char data[sizeof(T)];
};
};
struct Chunk {
Chunk* next;
Node nodes[];
};
size_t _chunk_size; // how many elements fit in one chunk
Chunk* _chunks; // singly-linked list of allocated chunks
Node* _free_nodes; // singly-linked free list of nodes
size_t _counter; // how many elements are currently acquired
void create_new_chunk() {
Chunk* chunk = nullptr;
// check the allocation result so a failed posix_memalign cannot leave
// `chunk` uninitialized before it is linked into the chunk list
CHECK_EQ(posix_memalign(reinterpret_cast<void**>(&chunk),
                        std::max<size_t>(sizeof(void*), alignof(Chunk)),
                        sizeof(Chunk) + sizeof(Node) * _chunk_size),
         0);
chunk->next = _chunks;
_chunks = chunk;
for (size_t i = 0; i < _chunk_size; i++) {
Node* node = &chunk->nodes[i];
node->next = _free_nodes;
_free_nodes = node;
}
}
};
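// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header): objects
// are acquired from and released back to the allocator; chunk memory is only
// returned to the system when the allocator itself is destroyed. The demo
// payload type below is a placeholder invented for the sketch.
struct ChunkAllocatorDemoItem {
  int key;
  double value;
  ChunkAllocatorDemoItem(int k, double v) : key(k), value(v) {}
};
inline void ChunkAllocatorSketch() {
  ChunkAllocator<ChunkAllocatorDemoItem> allocator(/*chunk_size=*/64);
  // acquire() placement-news the object inside a pre-allocated chunk node
  ChunkAllocatorDemoItem* item = allocator.acquire(/*k=*/1, /*v=*/2.5);
  CHECK_EQ(allocator.size(), static_cast<size_t>(1));
  // release() runs the destructor and returns the node to the free list
  allocator.release(item);
  CHECK_EQ(allocator.size(), static_cast<size_t>(0));
}
// ---------------------------------------------------------------------------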
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_map>
#include "butil/time.h"
#include "bvar/latency_recorder.h"
#include "glog/logging.h"
namespace paddle {
namespace distributed {
struct CostProfilerNode {
std::shared_ptr<bvar::LatencyRecorder> recorder;
};
class CostProfiler {
public:
~CostProfiler() {}
static CostProfiler& instance() {
static CostProfiler profiler;
return profiler;
}
void register_profiler(const std::string& label) {
if (_cost_profiler_map.find(label) != _cost_profiler_map.end()) {
return;
}
auto profiler_node = std::make_shared<CostProfilerNode>();
profiler_node->recorder.reset(
new bvar::LatencyRecorder("cost_profiler", label));
_cost_profiler_map[label] = profiler_node;
}
CostProfilerNode* profiler(const std::string& label) {
auto itr = _cost_profiler_map.find(label);
if (itr != _cost_profiler_map.end()) {
return itr->second.get();
}
return NULL;
}
private:
CostProfiler() {}
std::unordered_map<std::string, std::shared_ptr<CostProfilerNode>>
_cost_profiler_map;
};
class CostTimer {
public:
explicit CostTimer(const std::string& label) {
_label = label;
auto& profiler = CostProfiler::instance();
_profiler_node = profiler.profiler(label);
// if the label is not registered in the profiler, log the cost instead
_is_print_cost = _profiler_node == NULL;
_start_time_ms = butil::gettimeofday_ms();
}
explicit CostTimer(CostProfilerNode& profiler_node) { // NOLINT
_is_print_cost = false;
_profiler_node = &profiler_node;
_start_time_ms = butil::gettimeofday_ms();
}
~CostTimer() {
if (_is_print_cost) {
VLOG(3) << "CostTimer label:" << _label
<< ", cost:" << butil::gettimeofday_ms() - _start_time_ms << "ms";
} else {
*(_profiler_node->recorder) << butil::gettimeofday_ms() - _start_time_ms;
}
}
private:
std::string _label;
bool _is_print_cost;
uint64_t _start_time_ms;
CostProfilerNode* _profiler_node;
};
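// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header): a timer
// for a registered label is recorded into its bvar::LatencyRecorder, while an
// unregistered label falls back to VLOG output when the timer leaves scope.
// The label strings are placeholders invented for the sketch.
inline void CostTimerSketch() {
  CostProfiler::instance().register_profiler("demo_registered_op");
  {
    CostTimer recorded_timer("demo_registered_op");  // goes to the recorder
    // ... timed work ...
  }
  {
    CostTimer logged_timer("demo_unregistered_op");  // falls back to VLOG(3)
    // ... timed work ...
  }
}
// ---------------------------------------------------------------------------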
} // namespace distributed
} // namespace paddle