OpenDAS / Paddle / Commits / f0ef3442

Commit f0ef3442, authored Apr 26, 2023 by yuguo960516yuguo

    2.3.2-dtk-22.10.1

Parent: ad08b8ce
Pipeline #227 failed with stages in 0 seconds
Showing 20 changed files with 6385 additions and 0 deletions (+6385, -0). The commit changes 274 files in total; only the first 20 diffs are reproduced below.
paddle/fluid/distributed/collective/ProcessGroupGloo.h      +167   -0
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc     +276   -0
paddle/fluid/distributed/collective/ProcessGroupHCCL.h      +135   -0
paddle/fluid/distributed/collective/ProcessGroupHeter.cc    +393   -0
paddle/fluid/distributed/collective/ProcessGroupHeter.h     +144   -0
paddle/fluid/distributed/collective/ProcessGroupMPI.cc      +466   -0
paddle/fluid/distributed/collective/ProcessGroupMPI.h       +211   -0
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc     +1615  -0
paddle/fluid/distributed/collective/ProcessGroupNCCL.h      +348   -0
paddle/fluid/distributed/collective/ProcessGroupStream.cc   +315   -0
paddle/fluid/distributed/collective/ProcessGroupStream.h    +231   -0
paddle/fluid/distributed/collective/Types.h                 +53    -0
paddle/fluid/distributed/collective/Utils.h                 +253   -0
paddle/fluid/distributed/collective/reducer.cc              +1178  -0
paddle/fluid/distributed/collective/reducer.h               +145   -0
paddle/fluid/distributed/common/CMakeLists.txt              +6     -0
paddle/fluid/distributed/common/afs_warpper.cc              +100   -0
paddle/fluid/distributed/common/afs_warpper.h               +161   -0
paddle/fluid/distributed/common/chunk_allocator.h           +95    -0
paddle/fluid/distributed/common/cost_timer.h                +93    -0
paddle/fluid/distributed/collective/ProcessGroupGloo.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <future>
#include <mutex>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif

#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"

constexpr const char* GLOO_BACKEND_NAME = "GLOO";

namespace paddle {
namespace distributed {

class ProcessGroupGloo : public ProcessGroup {
 public:
  class GlooTask : public ProcessGroup::Task,
                   public std::enable_shared_from_this<GlooTask> {
   public:
    explicit GlooTask(int rank,
                      const std::vector<phi::DenseTensor>& input_tensors,
                      CommType comm_type);

    ~GlooTask() = default;

    virtual void Run() = 0;

    bool Wait(std::chrono::milliseconds timeout) override { return true; }

    bool IsCompleted() override { return true; }

    void Synchronize() override {}

   protected:
    friend class ProcessGroupGloo;
  };

  class GlooStore : public ::gloo::rendezvous::Store {
   public:
    explicit GlooStore(
        const std::shared_ptr<paddle::distributed::Store>& store)
        : _store(store) {}

    ~GlooStore() = default;

    std::vector<char> get(const std::string& key) override {
      VLOG(3) << "GlooStore::get";
      auto value = _store->get(key);
      return std::vector<char>(value.begin(), value.end());
    }

    void wait(const std::vector<std::string>& keys) override {
      VLOG(3) << "GlooStore::wait";
      for (auto& key : keys) {
        _store->wait(key);
      }
    }

    void set(const std::string& key, const std::vector<char>& value) override {
      VLOG(3) << "GlooStore::set";
      std::vector<uint8_t> tmp(value.begin(), value.end());
      _store->set(key, tmp);
    }

    void wait(const std::vector<std::string>& keys,
              const std::chrono::milliseconds& timeout) override {
      VLOG(3) << "GlooStore::wait";
      for (auto& key : keys) {
        _store->wait(key);
      }
      // wait(keys);
    }

   protected:
    std::shared_ptr<paddle::distributed::Store> _store;
  };

  class GlooOptions {
   public:
    GlooOptions() = default;
    ~GlooOptions() = default;
    static std::shared_ptr<GlooOptions> create() {
      return std::make_shared<GlooOptions>();
    }
    std::shared_ptr<::gloo::transport::Device> device;
  };

  explicit ProcessGroupGloo(
      const std::shared_ptr<paddle::distributed::Store>& store,
      int rank,
      int world_size,
      const platform::Place& place,
      int gid,
      std::shared_ptr<GlooOptions> options);

  ~ProcessGroupGloo() = default;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& inputs,
      std::vector<phi::DenseTensor>& outputs,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& inputs,
      std::vector<phi::DenseTensor>& outputs,
      const AllreduceOptions& opts = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& inputs,
      std::vector<phi::DenseTensor>& outputs,
      const AllreduceOptions& opts,
      bool sync_op) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions&) override;

  std::shared_ptr<::gloo::Context> get_context() { return _context; }

  uint64_t next_tag() { return _tag++; }

  const std::string GetBackendName() const override {
    return GLOO_BACKEND_NAME;
  }

  // Helper functions for Gloo.
  static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
      const std::string& hostname);
  static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
      const std::string& ifname);
  static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();

 protected:
  uint32_t _tag;
  std::shared_ptr<gloo::rendezvous::Context> _context;
  std::shared_ptr<::gloo::rendezvous::Store> _store;
};

}  // namespace distributed
}  // namespace paddle
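Note (illustration only, not part of the commit): the header above only declares the Gloo backend; construction follows the same pattern this commit uses later in ProcessGroupHeter.cc (create GlooOptions, pick a default transport device, then build the group). The sketch below assumes an already-initialized paddle::distributed::Store and a build with PADDLE_WITH_GLOO; the helper name make_gloo_group is hypothetical.

// Illustrative sketch, mirroring the construction pattern in
// ProcessGroupHeter.cc; `make_gloo_group` is a hypothetical helper.
#include <memory>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/platform/place.h"

std::shared_ptr<paddle::distributed::ProcessGroupGloo> make_gloo_group(
    const std::shared_ptr<paddle::distributed::Store>& store,  // assumed ready
    int rank,
    int world_size,
    int gid) {
  using paddle::distributed::ProcessGroupGloo;
  auto opts = ProcessGroupGloo::GlooOptions::create();
  // Use the default TCP transport device declared in the header above.
  opts->device = ProcessGroupGloo::createDefaultDevice();
  return std::make_shared<ProcessGroupGloo>(
      store, rank, world_size, paddle::platform::CPUPlace(), gid, opts);
}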
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"

#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"

DECLARE_bool(hccl_blocking_wait);
// DECLARE_bool(use_stream_safe_npu_allocator);

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

void SyncDefaultStream(
    const std::vector<Place>& places,
    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
  for (size_t i = 0; i < places.size(); ++i) {
    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places[i]));
    hcclEvents[i].Record(*dev_ctx[i]);
    hcclEvents[i].Block(*default_ctx);
  }
}

std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
    std::vector<Place> places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs) {
  return std::make_shared<ProcessGroupHCCL::HCCLTask>(
      places, rank, comm_type, inputs);
}

ProcessGroupHCCL::HCCLTask::HCCLTask(
    const std::vector<Place>& places,
    int rank,
    CommType CommType,
    const std::vector<phi::DenseTensor>& inputs)
    : Task(rank, inputs, CommType), places_(places) {
  control_events_.resize(places.size());
  hcclComms_.resize(places.size());
}

ProcessGroupHCCL::HCCLTask::~HCCLTask() {}

void ProcessGroupHCCL::HCCLTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}

void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
  for (size_t i = 0; i < places_.size(); ++i) {
    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places_[i]));
    platform::NPUStreamWaitEvent(default_ctx->stream(),
                                 control_events_[i].GetRawNPUEvent());
  }
}

bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
  for (size_t i = 0; i < places_.size(); ++i) {
    if (!control_events_[i].Query()) {
      return false;
    }
  }
  return true;
}

// TODO(sandyhouse): Add timeout for wait, now timeout unused
bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
  SynchronizeStreams();
  // NOTE(sandyhouse): It will block host for sync
  while (!IsCompleted()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
  }
  return true;
}

// Same as Wait
void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }

ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
                                   int rank,
                                   int size,
                                   const platform::Place& place,
                                   int gid)
    : ProcessGroup(rank, size, place, gid), store_(store) {
  platform::SetNPUDeviceId(place_.device);
}

void ProcessGroupHCCL::BroadcastUniqueHCCLID(
    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
  if (rank_ == 0) {
    for (size_t i = 0; i < hccl_ids.size(); i++) {
      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
      auto hccl_id = std::vector<uint8_t>(
          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
      store_->set(key, hccl_id);
    }
  } else {
    for (size_t i = 0; i < hccl_ids.size(); i++) {
      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
      auto ret = store_->get(key);
      std::memcpy(&hccl_ids[i], ret.data(), ret.size());
    }
  }
}

// create HCCLManager cache for places_key
void ProcessGroupHCCL::CreateHCCLManagerCache(
    const std::string& places_key, const std::vector<Place>& places) {
  PADDLE_ENFORCE_EQ(places_key.empty(),
                    false,
                    platform::errors::PreconditionNotMet(
                        "Not able to create/get the HCCL Communicator since "
                        "the NPU place are not known"));

  std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
  hccl_comms.resize(places.size());

  // using vector just for broadcast
  std::vector<HcclRootInfo> hccl_ids;
  hccl_ids.resize(1);
  auto& hccl_id = hccl_ids.front();

  if (rank_ == 0) {
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
  }
  BroadcastUniqueHCCLID(hccl_ids);

  VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
          << ", place: " << places_key
          << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);

  std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
  dev_ctx.resize(places.size());

  std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
  for (size_t i = 0; i < places.size(); ++i) {
    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
    hccl_comms[i] = HCCLCommManager::Create(
        GetSize(), GetRank(), &hccl_id, comms.get() + i);
    dev_ctx[i].reset(new NPUDeviceContext(places[i]));
  }

  std::vector<NPUEventManager> events;
  events.resize(places.size());

  // These caches will be useful to process sync/wait/communicate
  places_to_events_.emplace(places_key, std::move(events));
  places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType op_type) {
  const auto places = GetPlaceList(inputs);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
      CreateHCCLManagerCache(key, places);
    }
  }

  auto& hccl_comms = places_to_hcclcomm_[key];

  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);

  auto task = CreateTask(places, rank_, op_type, inputs);

  for (size_t i = 0; i < inputs.size(); ++i) {
    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
    fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
  }

  for (size_t i = 0; i < inputs.size(); ++i) {
    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
    task->control_events_[i].Record(*places_to_ctx_[key][i]);
  }
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const AllreduceOptions& opts) {
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          HcclComm comm,
          const aclrtStream& stream) {
        return platform::dynload::HcclAllReduce(
            input.data(),
            output.data(),
            input.numel(),
            platform::ToHCCLDataType(input.dtype()),
            ToHCCLRedType(opts.reduce_op),
            comm,
            stream);
      },
      CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const BroadcastOptions& opts) {
  // PADDLE_ENFORCE_EQ(
  //     CheckTensorsInNPUPlace(tensors), true,
  //     platform::errors::InvalidArgument("All inputs should be in
  //     CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          HcclComm comm,
          const aclrtStream& stream) {
        int root = opts.source_rank * in_tensors.size() + opts.source_root;
        if (rank_ == root) {
          return platform::dynload::HcclBroadcast(
              input.data(),
              input.numel(),
              platform::ToHCCLDataType(input.dtype()),
              root,
              comm,
              stream);
        } else {
          return platform::dynload::HcclBroadcast(
              output.data(),
              output.numel(),
              platform::ToHCCLDataType(output.dtype()),
              root,
              comm,
              stream);
        }
      },
      CommType::BROADCAST);
}

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/ProcessGroupHCCL.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"

constexpr const char* HCCL_BACKEND_NAME = "HCCL";

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;
using NPUStream = platform::stream::NPUStream;
using NPUDeviceContext = paddle::platform::NPUDeviceContext;

class ProcessGroupHCCL : public ProcessGroup {
 public:
  class HCCLTask : public ProcessGroup::Task,
                   public std::enable_shared_from_this<HCCLTask> {
   public:
    HCCLTask(const std::vector<Place>& places,
             int rank,
             CommType CommType,
             const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();

    void SynchronizeStreams();

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize();

    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT

    virtual ~HCCLTask();

    std::vector<NPUEventManager> control_events_;

   protected:
    std::vector<Place> places_;
    std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;

   private:
  };

  ProcessGroupHCCL(const std::shared_ptr<Store>& store,
                   int rank,
                   int size,
                   const platform::Place& place,
                   int gid);

  const std::string GetBackendName() const override {
    return std::string(HCCL_BACKEND_NAME);
  }

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

 protected:
  virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType opType,
      const std::vector<phi::DenseTensor>& inputs);

  std::shared_ptr<Store> store_;
  std::shared_ptr<HCCLCommManager> hccl_comm_;
  std::mutex mutex_;
  std::unordered_map<std::string,
                     std::vector<std::shared_ptr<HCCLCommManager>>>
      places_to_hcclcomm_;
  std::unordered_map<std::string, std::vector<NPUEventManager>>
      places_to_events_;
  std::unordered_map<std::string,
                     std::vector<std::unique_ptr<NPUDeviceContext>>>
      places_to_ctx_;
  std::set<int> used_place_ids_;

 private:
  void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids,  // NOLINT
                   int root,
                   int server_fd);

  void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids);  // NOLINT

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  void CreateHCCLManagerCache(const std::string& places_key,
                              const std::vector<Place>& places);
};

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/ProcessGroupHeter.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"

#include <chrono>

#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;
int ProcessGroupHeter::send_count = 0;
int ProcessGroupHeter::recv_count = 0;

std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
    int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {
  return std::make_shared<ProcessGroupHeter::HeterTask>(
      rank, comm_type, inputs);
}

ProcessGroupHeter::HeterTask::HeterTask(
    int rank, CommType CommType, const std::vector<phi::DenseTensor>& inputs)
    : Task(rank, inputs, CommType) {}

ProcessGroupHeter::HeterTask::~HeterTask() {}

bool ProcessGroupHeter::HeterTask::IsCompleted() { return true; }

// TODO(sheniang03): Add timeout for wait, now timeout unused
bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
  return true;
}

ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
                                     int rank,
                                     int size,
                                     const platform::Place& place,
                                     int gid,
                                     int local_rank,
                                     int local_size,
                                     int gloo_rank,
                                     int gloo_size,
                                     bool with_switch,
                                     std::string switch_endpoint,
                                     int src_rank,
                                     int dst_rank)
    : ProcessGroup(rank, size, place, gid),
      store_(store),
      local_rank_(local_rank),
      local_size_(local_size),
      gloo_rank_(gloo_rank),
      gloo_size_(gloo_size),
      with_switch_(with_switch),
      switch_endpoint_(switch_endpoint),
      src_rank_(src_rank),
      dst_rank_(dst_rank) {
  return;
#ifdef PADDLE_WITH_CUSTOM
  if (paddle::platform::is_custom_place(place_)) {
    inner_pg_ = std::make_shared<ProcessGroupCustom>(
        store, local_rank, local_size, place_, IGNORE_ID);
  } else {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    inner_pg_ = std::make_shared<ProcessGroupNCCL>(
        store, local_rank, local_size, place_, IGNORE_ID);
#elif defined(PADDLE_WITH_ASCEND_CL)
    inner_pg_ = std::make_shared<ProcessGroupHCCL>(
        store, local_rank, local_size, place_, IGNORE_ID);
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "ProcessGroupHeter only supports NCCL, RCCL and HCCL now."));
#endif
#ifdef PADDLE_WITH_CUSTOM
  }
#endif
  if (local_rank_ == 0 && !with_switch_) {
    auto opts = ProcessGroupGloo::GlooOptions::create();
    opts->device = ProcessGroupGloo::createDefaultDevice();
    inter_pg_ = std::make_shared<ProcessGroupGloo>(
        store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts);
  }
}

template <typename T>
static void _do_add(T* dst, T* src, size_t size) {
  for (size_t i = 0; i < size; i++) {
    *dst += *src;
    dst++;
    src++;
  }
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const AllreduceOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif

  // Step1: do allreduce in inner cluster
  auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts);
  task->Wait();

  // Step2: copy tensors to CPU
  if (local_rank_ == 0) {
    std::vector<phi::DenseTensor> cpu_tensors;
    cpu_tensors.reserve(in_tensors.size());
    phi::DenseTensor cpu_tensor;
    for (size_t i = 0; i < in_tensors.size(); i++) {
      auto gpu_tensor = in_tensors[i];
      cpu_tensor.Resize(gpu_tensor.dims());
      framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
      cpu_tensors.push_back(cpu_tensor);
    }
    // Step3: do inter cluster allreduce
    if (with_switch_) {
      if (local_rank_ == 0) {
        HeterClient* client_ =
            HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
        auto dense_cpu_tensor = cpu_tensors[0];
        std::vector<int64_t> send_size;
        send_size.push_back(dense_cpu_tensor.numel());
        int ret = client_->Send(
            gid_,
            {dense_cpu_tensor.name()},
            send_size,
            dense_cpu_tensor.data(),
            dense_cpu_tensor.numel() *
                framework::DataTypeSize(dense_cpu_tensor.dtype()));
        PADDLE_ENFORCE_EQ(ret,
                          0,
                          platform::errors::PreconditionNotMet(
                              "Send to the switch module error."));
        phi::DenseTensor cpu_tensor2;
        cpu_tensor2.AllocateFrom(
            std::make_unique<paddle::experimental::DefaultAllocator>(
                paddle::platform::CPUPlace())
                .get(),
            dense_cpu_tensor.dtype(),
            dense_cpu_tensor.numel());
        ret = client_->Recv(
            gid_,
            {dense_cpu_tensor.name()},
            cpu_tensor2.data(),
            cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
        PADDLE_ENFORCE_EQ(ret,
                          0,
                          platform::errors::PreconditionNotMet(
                              "Recv from the switch module error."));

        switch (dense_cpu_tensor.dtype()) {
          case DataType::FLOAT32:
            _do_add<float>(reinterpret_cast<float*>(dense_cpu_tensor.data()),
                           reinterpret_cast<float*>(cpu_tensor2.data()),
                           dense_cpu_tensor.numel());
            break;
          case DataType::FLOAT64:
            _do_add<double>(reinterpret_cast<double*>(dense_cpu_tensor.data()),
                            reinterpret_cast<double*>(cpu_tensor2.data()),
                            dense_cpu_tensor.numel());
            break;
          case DataType::INT32:
            _do_add<int>(reinterpret_cast<int*>(dense_cpu_tensor.data()),
                         reinterpret_cast<int*>(cpu_tensor2.data()),
                         dense_cpu_tensor.numel());
            break;
          default:
            PADDLE_THROW(platform::errors::PreconditionNotMet(
                "Unsupported data type (%s) to do add.",
                framework::DataType2String(dense_cpu_tensor.dtype())));
        }
      }
    } else {
      auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts);
      gloo_task->Wait();
    }
    // Step4: copy cpu tensors to gpu
    // copy cpu tensors to gpu
    for (size_t i = 0; i < in_tensors.size(); i++) {
      auto gpu_tensor = out_tensors[i];
      auto cpu_tensor = cpu_tensors[i];
      framework::TensorCopySync(cpu_tensor, cpu_tensor.place(), &gpu_tensor);
    }
  }

  // Step5: broadcast among inner cluster
  auto b_opts = BroadcastOptions();
  b_opts.source_rank = 0;
  auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
  broadcast_task->Wait();
  return CreateTask(rank_, CommType::ALLREDUCE, in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
#endif

  // Step1: do broadcast in inner cluster
  auto b_opts = BroadcastOptions();
  b_opts.source_rank = 0;
  inner_pg_->Broadcast(in_tensors, out_tensors, b_opts);

  if (local_rank_ == 0) {
    std::vector<phi::DenseTensor> cpu_tensors;
    cpu_tensors.reserve(in_tensors.size());
    for (size_t i = 0; i < in_tensors.size(); i++) {
      auto gpu_tensor = in_tensors[i];
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(gpu_tensor.dims());
      framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
      cpu_tensors.push_back(cpu_tensor);
    }
    if (with_switch_) {
      if (local_rank_ == 0) {
        HeterClient* client_ =
            HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
        auto dense_cpu_tensor = cpu_tensors[0];
        if (gloo_rank_ == 0) {
          std::vector<int64_t> send_size;
          send_size.push_back(dense_cpu_tensor.numel());
          int ret = client_->Send(
              gid_,
              {dense_cpu_tensor.name()},
              send_size,
              dense_cpu_tensor.data(),
              dense_cpu_tensor.numel() *
                  framework::DataTypeSize(dense_cpu_tensor.dtype()));
          PADDLE_ENFORCE_EQ(ret,
                            0,
                            platform::errors::PreconditionNotMet(
                                "Send to the switch module error."));
        } else {
          int ret = client_->Recv(
              gid_,
              {dense_cpu_tensor.name()},
              dense_cpu_tensor.data(),
              dense_cpu_tensor.numel() *
                  framework::DataTypeSize(dense_cpu_tensor.dtype()));
          PADDLE_ENFORCE_EQ(ret,
                            0,
                            platform::errors::PreconditionNotMet(
                                "Receive from the switch module error."));
        }
      }
    } else {
      auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts);
      gloo_task->Wait();
    }
    for (size_t i = 0; i < in_tensors.size(); i++) {
      auto gpu_tensor = out_tensors[i];
      auto cpu_tensor = cpu_tensors[i];
      framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
    }
  }
  auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts);
  broadcast_task->Wait();
  return CreateTask(rank_, CommType::BROADCAST, in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
    std::vector<phi::DenseTensor>& in_tensors, int peer) {
  PADDLE_ENFORCE_EQ(
      in_tensors.size(),
      1,
      platform::errors::PreconditionNotMet(
          "For each send operation, there can only be one tensor to send."));
  // Copy Tensor to cpu
  auto start = std::chrono::high_resolution_clock::now();
  phi::DenseTensor cpu_tensor;
  auto& gpu_tensor = in_tensors[0];
  framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
  PADDLE_ENFORCE_EQ(with_switch_,
                    true,
                    platform::errors::PreconditionNotMet(
                        "Gloo does not support the send operation."));
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
          << ") from gpu to cpu for send " << std::setw(9)
          << " is: " << diff.count() << " s" << std::endl;

  // Send to switch
  HeterClient* client_ =
      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
  int64_t tensor_size =
      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
  std::vector<int64_t> send_size;
  send_size.push_back(tensor_size);
  auto id = src_rank_ * 10000 + dst_rank_;
  std::string tensor_name = std::to_string(gid_) + "_id_" +
                            std::to_string(id) + std::string("_") +
                            std::to_string(send_count++);
  VLOG(2) << "tensor_name:" << tensor_name;
  int ret = client_->Send(
      gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size);
  PADDLE_ENFORCE_EQ(
      ret,
      0,
      platform::errors::PreconditionNotMet("Send to the switch module error."));
  return CreateTask(rank_, CommType::SEND, in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
    std::vector<phi::DenseTensor>& out_tensors, int peer) {
  PADDLE_ENFORCE_EQ(
      out_tensors.size(),
      1,
      platform::errors::PreconditionNotMet(
          "For each rece operation, there can only be one tensor to receive."));

  // Copy Tensor to cpu
  phi::DenseTensor cpu_tensor;
  auto& gpu_tensor = out_tensors[0];
  cpu_tensor.Resize(gpu_tensor.dims());
  cpu_tensor.set_layout(gpu_tensor.layout());
  cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());

  PADDLE_ENFORCE_EQ(with_switch_,
                    true,
                    platform::errors::PreconditionNotMet(
                        "Gloo does not support the send operation."));
  // recv from switch
  HeterClient* client_ =
      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
  auto id = src_rank_ * 10000 + dst_rank_;
  std::string tensor_name = std::to_string(gid_) + "_id_" +
                            std::to_string(id) + std::string("_") +
                            std::to_string(recv_count++);
  VLOG(2) << "tensor_name: " << tensor_name;
  auto start = std::chrono::high_resolution_clock::now();
  int ret = client_->Recv(
      gid_,
      {tensor_name},
      cpu_tensor.data(),
      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
  PADDLE_ENFORCE_EQ(ret,
                    0,
                    platform::errors::PreconditionNotMet(
                        "receive to the switch module error."));
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = end - start;
  double goodput = cpu_tensor.numel() *
                   framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
  VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl;
  start = std::chrono::high_resolution_clock::now();
  framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
  end = std::chrono::high_resolution_clock::now();
  diff = end - start;
  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
          << ") from cpu to gpu for recv " << std::setw(9)
          << " is: " << diff.count() << " s" << std::endl;
  return CreateTask(rank_, CommType::RECV, out_tensors);
}

}  // namespace distributed
}  // namespace paddle
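A note on the point-to-point naming scheme above (illustration only, not part of the commit): Send and Recv derive the same tensor key from the group id, the encoded (src, dst) pair, and a monotonically increasing counter, which is how the switch module matches each Send with the corresponding Recv. A minimal sketch of the key construction, assuming both ends agree on gid, src_rank, dst_rank, and the running counter; the helper name heter_p2p_key is hypothetical.

// Illustrative sketch of the key used by ProcessGroupHeter::Send/Recv.
#include <string>

std::string heter_p2p_key(int gid, int src_rank, int dst_rank, int counter) {
  auto id = src_rank * 10000 + dst_rank;  // encodes the (src, dst) pair
  return std::to_string(gid) + "_id_" + std::to_string(id) + "_" +
         std::to_string(counter);
}
// e.g. heter_p2p_key(1, 2, 3, 0) == "1_id_20003_0"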
paddle/fluid/distributed/collective/ProcessGroupHeter.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif

#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif

#if defined(PADDLE_WITH_CUSTOM_DEVICE)
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#endif

#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||        \
     defined(PADDLE_WITH_ASCEND_CL))
#include "paddle/fluid/distributed/ps/service/heter_client.h"
#endif

#include "paddle/fluid/distributed/collective/Common.h"

constexpr const char* HETER_BACKEND_NAME = "HETER_BACKEND";

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;

class ProcessGroupHeter : public ProcessGroup {
 public:
  class HeterTask : public ProcessGroup::Task,
                    public std::enable_shared_from_this<HeterTask> {
   public:
    HeterTask(int rank,
              CommType CommType,
              const std::vector<phi::DenseTensor>&);

    bool IsCompleted();

    void SynchronizeStreams() {}

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize() {}

    virtual ~HeterTask();
  };

  ProcessGroupHeter(const std::shared_ptr<Store>& store,
                    int rank,
                    int size,
                    const platform::Place& place,
                    int gid,
                    int local_rank,
                    int local_size,
                    int gloo_rank,
                    int gloo_size,
                    bool with_switch,
                    std::string switch_endpoints,
                    int src_rank,
                    int dst_rank);

  const std::string GetBackendName() const override {
    return std::string(HETER_BACKEND_NAME);
  }

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>&,
      std::vector<phi::DenseTensor>&,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>&,
      std::vector<phi::DenseTensor>&,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& in_tensors, int peer) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& out_tensors, int peer) override;

 protected:
  virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
      int rank, CommType opType, const std::vector<phi::DenseTensor>& inputs);

 private:
  std::shared_ptr<Store> store_;
  std::shared_ptr<ProcessGroup> inner_pg_;
  std::shared_ptr<ProcessGroupGloo> inter_pg_;

  int local_rank_;
  int local_size_;
  int gloo_rank_;
  int gloo_size_;
  bool with_switch_;
  std::string switch_endpoint_;
  int src_rank_;
  int dst_rank_;
  static int send_count;
  static int recv_count;
};

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/ProcessGroupMPI.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"

#include <chrono>

#include "paddle/fluid/distributed/collective/Common.h"

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

std::map<phi::DataType, MPI_Datatype> mpiDatatype = {
    {phi::DataType::INT8, MPI_CHAR},
    {phi::DataType::UINT8, MPI_UNSIGNED_CHAR},
    {phi::DataType::FLOAT32, MPI_FLOAT},
    {phi::DataType::FLOAT64, MPI_DOUBLE},
    {phi::DataType::INT32, MPI_INT},
    {phi::DataType::INT64, MPI_LONG}};

void ProcessGroupMPI::MPITask::FinishMPITaskError(std::exception_ptr eptr) {
  Finish(eptr);
}

void ProcessGroupMPI::MPITask::FinishMPITask() { Finish(); }

ProcessGroupMPI::MPIAsyncTask::MPIAsyncTask(
    MPI_Request request, const std::vector<phi::DenseTensor>& inputs)
    : ProcessGroup::Task(-1, inputs, CommType::UNKNOWN), request_(request) {
  memset(&status_, 0, sizeof(status_));
}

ProcessGroupMPI::MPIAsyncTask::~MPIAsyncTask() {
  if (request_ != MPI_REQUEST_NULL) {
    std::cerr << " Task has not completed, try to destruct async mpi task, "
              << "exit the program." << std::endl;
    std::terminate();
  }
}

bool ProcessGroupMPI::MPIAsyncTask::IsCompleted() {
  if (request_ == MPI_REQUEST_NULL) {
    return true;
  }

  std::unique_lock<std::mutex> lock(pg_global_mutex);
  int flag = 0;
  MPI_CHECK(MPI_Test(&request_, &flag, &status_));
  if (request_ != MPI_REQUEST_NULL) {
    return false;
  }

  if (status_.MPI_ERROR != MPI_SUCCESS) {
    AppearException();
  }

  return true;
}

bool ProcessGroupMPI::MPIAsyncTask::Wait(std::chrono::milliseconds timeout) {
  if (request_ == MPI_REQUEST_NULL) {
    return true;
  }

  std::unique_lock<std::mutex> lock(pg_global_mutex);
  MPI_CHECK(MPI_Wait(&request_, &status_));

  if (status_.MPI_ERROR != MPI_SUCCESS) {
    AppearException();
    std::rethrow_exception(exception_);
    return false;
  }

  return true;
}

void ProcessGroupMPI::MPIAsyncTask::AppearException() {
  std::array<char, MPI_MAX_ERROR_STRING> buf;
  int len = buf.size();
  MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len));
  exception_ = std::make_exception_ptr(
      std::runtime_error(std::string(buf.data(), len)));
}

void ProcessGroupMPI::MPIAsyncTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {
  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}

int ProcessGroupMPI::mpi_thread_support = 0;
std::mutex ProcessGroupMPI::pg_global_mutex;
std::once_flag ProcessGroupMPI::onceFlag;

void ProcessGroupMPI::ExitMPI() {
  std::unique_lock<std::mutex> lock(pg_global_mutex);
  MPI_CHECK(MPI_Finalize());
}

void ProcessGroupMPI::InitOneTimeMPI() {
  std::call_once(onceFlag, []() {
    MPI_CHECK(MPI_Init_thread(
        nullptr, nullptr, MPI_THREAD_SERIALIZED, &mpi_thread_support));
    PADDLE_ENFORCE_EQ(
        mpi_thread_support < MPI_THREAD_SERIALIZED,
        false,
        platform::errors::InvalidArgument("MPI supports the number of threads "
                                          "less than MPI_THREAD_SERIALIZED. "));

    std::atexit(ProcessGroupMPI::ExitMPI);
  });
}

std::shared_ptr<ProcessGroupMPI> ProcessGroupMPI::CreateProcessGroupMPI(
    const std::vector<int>& ranks, int gid) {
  InitOneTimeMPI();

  MPI_Comm groupComm = MPI_COMM_WORLD;
  int rank = -1;
  int size = -1;

  {
    std::lock_guard<std::mutex> lock(pg_global_mutex);

    if (!ranks.empty()) {
      MPI_Group worldGroup;
      MPI_Group ranksGroup;
      MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
      MPI_CHECK(
          MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup));

      constexpr int maxRetries = 3;
      bool create_success = false;
      MPI_Barrier(MPI_COMM_WORLD);
      for (auto i = 0; i < maxRetries; i++) {
        if (MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)) {
          create_success = true;
          break;
        }
      }
      MPI_CHECK(create_success);

      MPI_CHECK(MPI_Group_free(&worldGroup));
      MPI_CHECK(MPI_Group_free(&ranksGroup));
    }

    if (groupComm != MPI_COMM_NULL) {
      MPI_CHECK(MPI_Comm_rank(groupComm, &rank));
      MPI_CHECK(MPI_Comm_size(groupComm, &size));

      PADDLE_ENFORCE_EQ(
          rank < 0 || size < 0,
          false,
          platform::errors::InvalidArgument("get world_size or rank failed!"));
    }
  }

  if (groupComm == MPI_COMM_NULL) {
    return std::shared_ptr<ProcessGroupMPI>();
  }

  VLOG(3) << "MPI Group Create Success! rank = " << rank << " size = " << size
          << " group_id = " << gid;

  return std::make_shared<ProcessGroupMPI>(rank, size, groupComm, gid);
}

ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pg_comm, int gid)
    : ProcessGroup(rank, size, gid), stop_(false), pg_comm(pg_comm) {
  PADDLE_ENFORCE_EQ(
      pg_comm == MPI_COMM_NULL,
      false,
      platform::errors::InvalidArgument("Error! mpi comm is MPI_COMM_NULL!"));

  worker_thread = std::thread(&ProcessGroupMPI::workLoop, this);
}

ProcessGroupMPI::~ProcessGroupMPI() {
  std::unique_lock<std::mutex> lock(pg_mutex);
  queue_consume.wait(lock, [&] { return queue_.empty(); });
  stop_ = true;
  lock.unlock();
  queue_produce.notify_all();

  worker_thread.join();
}

void ProcessGroupMPI::workLoop() {
  std::unique_lock<std::mutex> lock(pg_mutex);

  while (!stop_) {
    if (queue_.empty()) {
      queue_produce.wait(lock);
      continue;
    }

    auto taskTuple = std::move(queue_.front());

    queue_.pop_front();

    auto& taskEntry = std::get<0>(taskTuple);
    auto& task = std::get<1>(taskTuple);

    lock.unlock();
    queue_consume.notify_one();

    try {
      taskEntry->run_(taskEntry);
      task->FinishMPITask();
    } catch (...) {
      task->FinishMPITaskError(std::current_exception());
    }

    lock.lock();
  }
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Enqueue(
    std::unique_ptr<TaskEntry> entry,
    const std::vector<phi::DenseTensor>& inputs) {
  auto task = std::make_shared<MPITask>(entry->dst_, inputs);
  std::unique_lock<std::mutex> lock(pg_mutex);
  queue_.push_back(std::make_tuple(std::move(entry), task));
  lock.unlock();
  queue_produce.notify_one();
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts) {
  mpi::CheckValidInputs(in_tensors);
  const auto places = GetPlaceList(in_tensors);

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [opts, this](std::unique_ptr<TaskEntry>& entry) {
        auto data = (entry->src_)[0];
        std::unique_lock<std::mutex> lock(pg_global_mutex);
        const auto root = opts.source_rank + opts.source_root;
        MPI_CHECK(MPI_Bcast(data.data(),
                            data.numel(),
                            mpiDatatype.at(data.dtype()),
                            root,
                            pg_comm));
      };

  auto entry = std::make_unique<TaskEntry>(
      &in_tensors, &out_tensors, std::move(runFunc));
  return Enqueue(std::move(entry), in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const AllreduceOptions& opts) {
  mpi::CheckValidInputs(in_tensors);

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [opts, this](std::unique_ptr<TaskEntry>& entry) {
        auto data = (entry->src_)[0];
        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE,
                                data.data(),
                                data.numel(),
                                mpiDatatype.at(data.dtype()),
                                mpi::ToMPIType(opts.reduce_op),
                                pg_comm));
      };

  auto entry = std::make_unique<TaskEntry>(
      &in_tensors, &out_tensors, std::move(runFunc));
  return Enqueue(std::move(entry), in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Barrier(
    const BarrierOptions& opts) {
  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [this](std::unique_ptr<TaskEntry>& entry) {
        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Barrier(pg_comm));
      };

  auto entry =
      std::make_unique<TaskEntry>(nullptr, nullptr, std::move(runFunc));
  return Enqueue(std::move(entry), std::vector<phi::DenseTensor>{});
}

// NOTE: MPI_send tag set gid_
std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Send(
    std::vector<phi::DenseTensor>& tensors, int dst_rank) {
  mpi::CheckValidInputs(tensors);

  auto& tensor = tensors[0];
  MPI_Request request = MPI_REQUEST_NULL;

  {
    std::unique_lock<std::mutex> lock(pg_global_mutex);
    MPI_CHECK(MPI_Isend(tensor.data(),
                        tensor.numel(),
                        mpiDatatype.at(tensor.dtype()),
                        dst_rank,
                        this->gid_,
                        pg_comm,
                        &request));
  }

  return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Recv(
    std::vector<phi::DenseTensor>& tensors, int src_rank) {
  mpi::CheckValidInputs(tensors);

  auto& tensor = tensors[0];
  MPI_Request request = MPI_REQUEST_NULL;

  {
    std::unique_lock<std::mutex> lock(pg_global_mutex);
    MPI_CHECK(MPI_Irecv(tensor.data(),
                        tensor.numel(),
                        mpiDatatype.at(tensor.dtype()),
                        src_rank,
                        this->gid_,
                        pg_comm,
                        &request));
  }

  return std::make_shared<ProcessGroupMPI::MPIAsyncTask>(request, tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  mpi::CheckValidInputs(in_tensors);

  PADDLE_ENFORCE_EQ(out_tensors.size() == 1,
                    true,
                    platform::errors::InvalidArgument(
                        "MPI only support a single tensor op."));

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [this](std::unique_ptr<TaskEntry>& entry) {
        auto data = (entry->src_)[0];
        std::vector<phi::DenseTensor> dst = entry->dst_;
        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Allgather(data.data(),
                                data.numel(),
                                mpiDatatype.at(data.dtype()),
                                dst[0].data(),
                                data.numel(),
                                mpiDatatype.at(data.dtype()),
                                pg_comm));
      };

  auto entry = std::make_unique<TaskEntry>(
      &in_tensors, &out_tensors, std::move(runFunc));
  return Enqueue(std::move(entry), in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  mpi::CheckValidInputs(in_tensors);
  mpi::CheckValidInputs(out_tensors);

  PADDLE_ENFORCE_EQ(in_tensors[0].numel() == out_tensors[0].numel() &&
                        in_tensors[0].dtype() == out_tensors[0].dtype(),
                    true,
                    platform::errors::InvalidArgument(
                        "MPI AlltoAll: input and output are not equal in "
                        "size or data type."));

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [this](std::unique_ptr<TaskEntry>& entry) {
        auto srcdata = (entry->src_)[0];
        auto dstdata = (entry->dst_)[0];
        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Alltoall(srcdata.data(),
                               srcdata.numel() / size_,
                               mpiDatatype.at(srcdata.dtype()),
                               dstdata.data(),
                               dstdata.numel() / size_,
                               mpiDatatype.at(dstdata.dtype()),
                               pg_comm));
      };

  auto entry = std::make_unique<TaskEntry>(
      &in_tensors, &out_tensors, std::move(runFunc));
  return Enqueue(std::move(entry), in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Reduce(
    std::vector<phi::DenseTensor>& tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts) {
  mpi::CheckValidInputs(tensors);

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [opts, this](std::unique_ptr<TaskEntry>& entry) {
        auto data = (entry->src_)[0];
        auto dataPtr = (entry->src_)[0].data();
        void* sendbuf = (rank_ == opts.root_rank) ? MPI_IN_PLACE : dataPtr;
        void* recvbuf = (rank_ == opts.root_rank) ? dataPtr : nullptr;

        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Reduce(sendbuf,
                             recvbuf,
                             data.numel(),
                             mpiDatatype.at(data.dtype()),
                             mpi::ToMPIType(opts.reduce_op),
                             opts.root_rank,
                             pg_comm));
      };

  auto entry =
      std::make_unique<TaskEntry>(&tensors, &tensors, std::move(runFunc));
  return Enqueue(std::move(entry), tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupMPI::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  mpi::CheckValidInputs(in_tensors);

  std::function<void(std::unique_ptr<TaskEntry>&)> runFunc =
      [opts, this](std::unique_ptr<TaskEntry>& entry) {
        auto data = (entry->dst_)[0];
        void* sendbuf = nullptr;

        if (rank_ == opts.root_rank) {
          std::vector<phi::DenseTensor>& inputData = entry->src_;
          sendbuf = inputData[0].data();
        }

        std::unique_lock<std::mutex> lock(pg_global_mutex);
        MPI_CHECK(MPI_Scatter(sendbuf,
                              data.numel(),
                              mpiDatatype.at(data.dtype()),
                              data.data(),
                              data.numel(),
                              mpiDatatype.at(data.dtype()),
                              opts.root_rank,
                              pg_comm));
      };

  if (rank_ == opts.root_rank) {
    auto entry = std::make_unique<TaskEntry>(
        &in_tensors, &out_tensors, std::move(runFunc));
    return Enqueue(std::move(entry), in_tensors);
  } else {
    auto entry =
        std::make_unique<TaskEntry>(nullptr, &out_tensors, std::move(runFunc));
    return Enqueue(std::move(entry), in_tensors);
  }
}

}  // namespace distributed
}  // namespace paddle
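Usage illustration (not part of the commit): the MPI backend above is created through the static factory, and every collective returns a Task whose Wait() blocks until the queued MPI call has run on the worker thread. A minimal sketch, assuming a build with PADDLE_WITH_MPI and caller-prepared DenseTensors; AllreduceOptions and ReduceOp are taken from the accompanying Types.h (added in this commit but not shown above), and the helper name allreduce_sum_blocking is hypothetical.

// Illustrative sketch of driving ProcessGroupMPI; not part of the commit.
#include <memory>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroupMPI.h"

void allreduce_sum_blocking(std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  using paddle::distributed::AllreduceOptions;
  using paddle::distributed::ProcessGroupMPI;
  using paddle::distributed::ReduceOp;

  // An empty rank list selects every rank in MPI_COMM_WORLD; group id 0.
  auto pg = ProcessGroupMPI::CreateProcessGroupMPI({}, 0);
  if (!pg) return;  // this rank is not part of the created communicator

  AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;  // assumed from Types.h in this commit

  // The collective is queued on the worker thread; Wait() blocks until done.
  auto task = pg->AllReduce(tensors, tensors, opts);
  task->Wait();
}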
paddle/fluid/distributed/collective/ProcessGroupMPI.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include <condition_variable>
#include <deque>
#include <exception>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_MPI)
#include "paddle/fluid/distributed/collective/MPITools.h"
#endif
constexpr const char* MPI_BACKEND_NAME = "MPI";

namespace paddle {
namespace distributed {

struct TaskEntry {
  explicit TaskEntry(std::vector<phi::DenseTensor>* src_ptr,
                     std::vector<phi::DenseTensor>* dst_ptr,
                     std::function<void(std::unique_ptr<TaskEntry>&)> run)
      : dst_(dst_ptr ? *dst_ptr : std::vector<phi::DenseTensor>()),
        run_(std::move(run)) {
    if (src_ptr) {
      src_ = *src_ptr;
    }
  }

  TaskEntry(const TaskEntry&) = delete;
  TaskEntry& operator=(const TaskEntry&) = delete;

  std::vector<phi::DenseTensor> src_;
  std::vector<phi::DenseTensor> dst_;
  int* srcRank_ = nullptr;
  std::function<void(std::unique_ptr<TaskEntry>&)> run_;
};

class ProcessGroupMPI : public ProcessGroup {
 public:
  class MPITask : public ProcessGroup::Task {
   public:
    explicit MPITask(std::vector<phi::DenseTensor> outputTensors,
                     const std::vector<phi::DenseTensor>& inputTensors)
        : ProcessGroup::Task(-1, inputTensors, CommType::UNKNOWN),
          outputs_(std::move(outputTensors)) {}

    void Synchronize() { Wait(); }

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout) {
      std::unique_lock<std::mutex> lock(mutex_);
      if (timeout == kWaitTimeout) {
        // This waits without a timeout.
        cv_.wait(lock, [&] { return is_completed_; });
      } else {
        // Waits for the user-provided timeout.
        cv_.wait_for(lock, timeout, [&] { return is_completed_; });
        PADDLE_ENFORCE_EQ(
            is_completed_,
            true,
            platform::errors::InvalidArgument("MPI operation timeout! "));
      }
      if (exception_) {
        std::rethrow_exception(exception_);
      }
      return true;
    }

   protected:
    friend class ProcessGroupMPI;

   private:
    // about mpi
    void Finish(std::exception_ptr exception = nullptr) {
      is_completed_ = true;
      exception_ = exception;
      cv_.notify_all();
    }
    void FinishMPITask();
    void FinishMPITaskError(std::exception_ptr eptr);

    std::vector<phi::DenseTensor> outputs_;
    std::condition_variable cv_;
    std::exception_ptr exception_;
  };

 public:
  class MPIAsyncTask : public ProcessGroup::Task {
   public:
    MPIAsyncTask(MPI_Request request,
                 const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();

    void Synchronize() {}

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT

    virtual ~MPIAsyncTask();

   protected:
    void AppearException();

   private:
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
    MPI_Request request_;
    MPI_Status status_;
    std::exception_ptr exception_;
  };

  ProcessGroupMPI(int rank, int size, MPI_Comm pgComm, int gid);

  virtual ~ProcessGroupMPI();

  const std::string GetBackendName() const override {
    return std::string(MPI_BACKEND_NAME);
  }

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors, int dst_rank) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors, int src_rank) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in,
      std::vector<phi::DenseTensor>& out) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions&) override;

  static std::shared_ptr<ProcessGroupMPI> CreateProcessGroupMPI(
      const std::vector<int>& ranks, int gid);

 protected:
  void workLoop();

  std::shared_ptr<ProcessGroup::Task> Enqueue(
      std::unique_ptr<TaskEntry> entry,
      const std::vector<phi::DenseTensor>& inputs);

 private:
  bool stop_{false};
  std::mutex pg_mutex;
  std::thread worker_thread;
  std::deque<std::tuple<std::unique_ptr<TaskEntry>, std::shared_ptr<MPITask>>>
      queue_;
  std::condition_variable queue_produce;
  std::condition_variable queue_consume;

  static void InitOneTimeMPI();
  static void ExitMPI();
  static std::once_flag onceFlag;
  static std::mutex pg_global_mutex;
  static int mpi_thread_support;

  MPI_Comm pg_comm;
};

}  // namespace distributed
}  // namespace paddle
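ProcessGroupMPI.h above declares a single worker thread (`worker_thread`), a deque of pending `TaskEntry` objects (`queue_`), and a pair of condition variables, which together serialize all MPI work onto one thread. The snippet below is a stripped-down, standalone sketch of that produce/consume pattern using only the standard library; the names (`Worker`, `Enqueue`, `Loop`) are illustrative and not Paddle's classes.

// Standalone sketch of the enqueue/worker-loop pattern declared above.
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>

class Worker {
 public:
  Worker() : thread_([this] { Loop(); }) {}
  ~Worker() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      stop_ = true;
    }
    produce_.notify_all();
    thread_.join();
  }

  // Producer side: queue a job and wake the worker (cf. Enqueue()).
  void Enqueue(std::function<void()> job) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push_back(std::move(job));
    }
    produce_.notify_one();
  }

 private:
  // Consumer side: pop jobs one by one and run them (cf. workLoop()).
  void Loop() {
    std::unique_lock<std::mutex> lock(mutex_);
    while (!stop_) {
      produce_.wait(lock, [this] { return stop_ || !queue_.empty(); });
      if (stop_) break;
      auto job = std::move(queue_.front());
      queue_.pop_front();
      lock.unlock();
      job();  // all communication work runs on this single thread
      lock.lock();
    }
  }

  bool stop_ = false;
  std::mutex mutex_;
  std::condition_variable produce_;
  std::deque<std::function<void()>> queue_;
  std::thread thread_;
};

int main() {
  Worker w;
  w.Enqueue([] { /* an MPI call would run here */ });
}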
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

void SyncDefaultStream(
    const std::vector<Place>& places,
    std::vector<EventManager>& ncclEvents,                        // NOLINT
    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {     // NOLINT
  for (size_t i = 0; i < places.size(); ++i) {
    auto* default_ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(places[i]));
    ncclEvents[i].Record(*default_ctx);
    ncclEvents[i].Block(*dev_ctx[i]);
  }
}

std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
    std::vector<Place> places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs) {
  return std::make_shared<ProcessGroupNCCL::NCCLTask>(
      places, rank, comm_type, inputs);
}

std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
    const std::vector<Place>& places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs,
    bool is_sync,
    bool use_calc_stream) {
  return std::make_shared<ProcessGroupNCCL::NCCLTask>(
      places, rank, comm_type, inputs, is_sync, use_calc_stream);
}

ProcessGroupNCCL::NCCLTask::NCCLTask(
    const std::vector<Place>& places,
    int rank,
    CommType CommType,
    const std::vector<phi::DenseTensor>& inputs)
    : TaskStream(rank, inputs, CommType), places_(places) {
  control_events_.resize(places.size());
  ncclComms_.resize(places.size());
}

ProcessGroupNCCL::NCCLTask::NCCLTask(
    const std::vector<Place>& places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs,
    bool sync_op,
    bool use_calc_stream)
    : TaskStream(rank, inputs, comm_type, sync_op, use_calc_stream),
      places_(places) {
  control_events_.resize(places.size());
  ncclComms_.resize(places.size());
}

ProcessGroupNCCL::NCCLTask::~NCCLTask() {}

void ProcessGroupNCCL::NCCLTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}

void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
  for (size_t i = 0; i < places_.size(); ++i) {
    auto* default_ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(places_[i]));
    default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
  }
}

bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
  for (size_t i = 0; i < places_.size(); ++i) {
    if (!control_events_[i].Query()) {
      return false;
    }
  }
  return true;
}

void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>* split_sizes,
                                       std::vector<int64_t> tensor_shape) {
  int64_t len_size = (*split_sizes).size();
  if (len_size == 0) {
    PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0,
                      true,
                      platform::errors::InvalidArgument(
                          "Tensor's dim[0] must be divisible by group size "
                          "when split_sizes not given."));
    (*split_sizes)
        .insert((*split_sizes).end(),
                size_,
                static_cast<int64_t>(tensor_shape[0] / size_));
  } else {
    PADDLE_ENFORCE_EQ(
        len_size == size_,
        true,
        platform::errors::InvalidArgument(
            "The length of split_sizes must be equal to group size."));
    auto sum_size = std::accumulate(
        (*split_sizes).begin(), (*split_sizes).end(), static_cast<int64_t>(0));
    PADDLE_ENFORCE_EQ(
        sum_size == tensor_shape[0],
        true,
        platform::errors::InvalidArgument(
            "The sum of split_sizes must be equal to tensor's dim[0]."));
  }
}

// TODO(sheniang03): Add timeout for wait, now timeout unused
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
  // Warning here when use calc stream but also invoke waiting explicitly.
  if (UseCalcStream()) {
    VLOG(3) << "Warning: The communication is on calc stream, wait here is "
               "useless.";
    return true;
  }

  SynchronizeStreams();
  if (FLAGS_nccl_blocking_wait) {
    // NOTE(shenliang03): It will block host for sync
    while (!IsCompleted()) {
      std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
    }
  }

  if (!barrierTensors_.empty()) {
    // If we use the work to do barrier, we should block cpu
    for (auto& place : places_) {
      platform::CUDADeviceGuard gpuGuard(place);
#ifdef PADDLE_WITH_CUDA
      PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
      PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
    }
  }
  return true;
}

// Same as Wait
void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }

ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                                   int rank,
                                   int size,
                                   const platform::Place& place,
                                   int gid)
    : ProcessGroupStream(rank, size, place, gid), store_(store) {
  platform::SetDeviceId(place_.device);
}
void ProcessGroupNCCL::BroadcastUniqueNCCLID(
    std::vector<ncclUniqueId>& nccl_ids) {  // NOLINT
  if (rank_ == 0) {
    for (size_t i = 0; i < nccl_ids.size(); i++) {
      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
                 std::to_string(i);
      auto nccl_id = std::vector<uint8_t>(
          reinterpret_cast<uint8_t*>(&nccl_ids[i]),
          reinterpret_cast<uint8_t*>(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES);
      store_->set(key, nccl_id);
    }
  } else {
    for (size_t i = 0; i < nccl_ids.size(); i++) {
      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
                 std::to_string(i);
      auto ret = store_->get(key);
      std::memcpy(&nccl_ids[i], ret.data(), ret.size());
    }
  }
}

// create NCCLManager cache for places_key
void ProcessGroupNCCL::CreateNCCLManagerCache(
    const std::string& places_key, const std::vector<Place>& places) {
  PADDLE_ENFORCE_EQ(places_key.empty(),
                    false,
                    platform::errors::PreconditionNotMet(
                        "Not able to create/get the NCCL Communicator since "
                        "the GPU place are not known"));

  std::vector<std::shared_ptr<NCCLCommManager>> nccl_comms;
  nccl_comms.resize(places.size());

  // using vector just for broadcast
  std::vector<ncclUniqueId> nccl_ids;
  nccl_ids.resize(1);
  auto& nccl_id = nccl_ids.front();

  for (auto& place : places) {
    used_place_ids_.insert(place.GetDeviceId());
  }

  if (rank_ == 0) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
  }
  BroadcastUniqueNCCLID(nccl_ids);

  VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_
          << ", place: " << places_key
          << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);

  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
  dev_ctx.resize(places.size());

  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());

  for (size_t i = 0; i < places.size(); ++i) {
    platform::CUDADeviceGuard guard(places[i]);

    nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
    dev_ctx[i].reset(new phi::GPUContext(places[i]));
  }

  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());

  std::vector<EventManager> events;
  events.resize(places.size());

  // These caches will be useful to process sync/wait/communicate
  places_to_events_.emplace(places_key, std::move(events));
  places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms));
  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType comm_type,
    bool sync_op,
    bool use_calc_stream) {
  const auto& places = GetPlaceList(inputs);
  const auto& key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  if (!use_calc_stream) {
    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  }

  auto task =
      CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream);

  platform::CUDADeviceGuard cuda_guard;

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);

      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream = static_cast<phi::GPUContext*>(
                          platform::DeviceContextPool::Instance().Get(places[i]))
                          ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }

      fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
    }
  }

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);

      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream = static_cast<phi::GPUContext*>(
                          platform::DeviceContextPool::Instance().Get(places[i]))
                          ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }

      memory::RecordStream(inputs[i].Holder(), nccl_stream);
    }
  }

  // Adding stream event dependency only when use comm stream
  if (!use_calc_stream) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      task->control_events_[i].Record(*places_to_ctx_[key][i]);
    }
  }

  return task;
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType op_type) {
  const auto places = GetPlaceList(inputs);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);

  auto task = CreateTask(places, rank_, op_type, inputs);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
      fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
    }
  }

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      memory::RecordStream(inputs[i].Holder(), places_to_ctx_[key][i]->stream());
    }
  }

  for (size_t i = 0; i < inputs.size(); ++i) {
    cuda_guard.SetDevice(places[i]);
    task->control_events_[i].Record(*places_to_ctx_[key][i]);
  }
  return task;
}

template <typename Fn>
void ProcessGroupNCCL::Collective(const phi::DenseTensor* in,
                                  phi::DenseTensor* out,
                                  Fn fn,
                                  CommType op_type) {
  std::vector<Place> places;
  places.push_back(in->place());
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    cuda_guard.SetDevice(places[0]);
    memory::RecordStream(in->Holder(), places_to_ctx_[key][0]->stream());
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    cuda_guard.SetDevice(places[0]);
    const auto& nccl_stream = places_to_ctx_[key][0]->stream();
    fn(in, out, nccl_comms[0]->GetNcclComm(), nccl_stream);
  }

  cuda_guard.SetDevice(places[0]);
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
    std::vector<phi::DenseTensor>& tensors,
    Fn fn,
    int dst_rank,
    CommType op_type,
    bool sync_op,
    bool use_calc_stream) {
  const auto& places = GetPlaceList(tensors);
  const auto& key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  if (!use_calc_stream) {
    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  }

  auto task =
      CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream);

  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream = static_cast<phi::GPUContext*>(
                          platform::DeviceContextPool::Instance().Get(places[i]))
                          ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
      memory::RecordStream(tensors[i].Holder(), nccl_stream);
    }
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream = static_cast<phi::GPUContext*>(
                          platform::DeviceContextPool::Instance().Get(places[i]))
                          ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }

  if (!use_calc_stream) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      task->control_events_[i].Record(*places_to_ctx_[key][i]);
    }
  }

  return task;
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
    std::vector<phi::DenseTensor>& tensors,
    Fn fn,
    int dst_rank,
    CommType op_type) {
  const auto places = GetPlaceList(tensors);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);

  auto task = CreateTask(places, rank_, op_type, tensors);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      memory::RecordStream(tensors[i].Holder(),
                           places_to_ctx_[key][i]->stream());
    }
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }

  for (size_t i = 0; i < tensors.size(); ++i) {
    cuda_guard.SetDevice(places[i]);
    task->control_events_[i].Record(*places_to_ctx_[key][i]);
  }
  return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const AllreduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllReduce(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.type()),
            ToNCCLRedType(opts.reduce_op), comm, stream);
      },
      CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const AllreduceOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllReduce(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.type()),
            ToNCCLRedType(opts.reduce_op), comm, stream);
      },
      CommType::ALLREDUCE,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        const auto root =
            opts.source_rank * in_tensors.size() + opts.source_root;
        return platform::dynload::ncclBroadcast(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.type()), root, comm, stream);
      },
      CommType::BROADCAST);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        const auto root =
            opts.source_rank * in_tensors.size() + opts.source_root;
        return platform::dynload::ncclBroadcast(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.type()), root, comm, stream);
      },
      CommType::BROADCAST,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
    const BarrierOptions& opts) {
  // Only support single card single process
  std::vector<phi::GPUPlace> places = {place_};

  std::vector<phi::DenseTensor> barrierTensors;
  barrierTensors.reserve(places.size());

  platform::CUDADeviceGuard gpuGuard;
  for (auto& place : places) {
    gpuGuard.SetDeviceIndex(place.GetDeviceId());
    phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
    auto allocator = std::unique_ptr<phi::Allocator>(
        new paddle::experimental::DefaultAllocator(place));
    barrierTensors.emplace_back(allocator.get(), meta);
  }
  auto task = ProcessGroupNCCL::AllReduce(
      barrierTensors, barrierTensors, AllreduceOptions());
  auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
  nccl_task->barrierTensors_ = std::move(barrierTensors);
  return task;
}

void CheckTensorsInDifferentDevices(
    const std::vector<phi::DenseTensor>& tensors, const size_t num_devices) {
  PADDLE_ENFORCE_EQ(
      tensors.size() == 0,
      false,
      platform::errors::InvalidArgument("Tensor list must be nonempty."));
  PADDLE_ENFORCE_LE(
      tensors.size(),
      num_devices,
      platform::errors::InvalidArgument(
          "Tensor list mustn't be larger than the number of available GPUs."));

  std::set<Place> used_devices;

  for (const auto& t : tensors) {
    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()),
                      true,
                      platform::errors::InvalidArgument(
                          "Tensors must be CUDA and dense tensor."));

    const auto inserted = used_devices.insert(t.place()).second;
    PADDLE_ENFORCE_EQ(inserted,
                      true,
                      platform::errors::InvalidArgument(
                          "Tensors must be on distinct GPU devices."));
  }
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
    std::vector<phi::DenseTensor>& tensors, int dst_rank) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
      },
      dst_rank,
      CommType::SEND);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
    std::vector<phi::DenseTensor>& tensors,
    int dst_rank,
    bool sync_op,
    bool use_calc_stream) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
      },
      dst_rank,
      CommType::SEND,
      sync_op,
      use_calc_stream);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
    std::vector<phi::DenseTensor>& tensors, int src_rank) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(), output.numel(),
            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
      },
      src_rank,
      CommType::RECV);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
    std::vector<phi::DenseTensor>& tensors,
    int src_rank,
    bool sync_op,
    bool use_calc_stream) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(), output.numel(),
            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
      },
      src_rank,
      CommType::RECV,
      sync_op,
      use_calc_stream);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
    phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) {
  // CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
      },
      dst_rank,
      CommType::SEND);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
      },
      dst_rank,
      CommType::SEND,
      sync_op,
      use_calc_stream);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
    phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) {
  // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);

  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(), output.numel(),
            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
      },
      src_rank,
      CommType::RECV);
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(), output.numel(),
            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
      },
      src_rank,
      CommType::RECV,
      sync_op,
      use_calc_stream);
  return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllGather(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), comm, stream);
      },
      CommType::ALLGATHER);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllGather(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()), comm, stream);
      },
      CommType::ALLGATHER,
      sync_op,
      use_calc_stream);
}

void* GetPointerByOffset(void* raw_pointer,
                         size_t offset,
                         experimental::DataType type) {
  if (type == experimental::DataType::FLOAT32) {
    return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT64) {
    return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT16) {
    return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT32) {
    return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT64) {
    return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT8) {
    return reinterpret_cast<void*>(reinterpret_cast<int8_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::UINT8) {
    return reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::BOOL) {
    return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::BFLOAT16) {
    return reinterpret_cast<void*>(reinterpret_cast<uint16_t*>(raw_pointer) +
                                   offset);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "This datatype in nccl is not supported."));
  }
  return nullptr;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllGather(
            GetPointerByOffset(input.data(), offset, input.dtype()),
            output.data(), length,
            platform::ToNCCLDataType(input.dtype()), comm, stream);
      },
      CommType::ALLGATHER);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        return platform::dynload::ncclAllGather(
            GetPointerByOffset(input.data(), offset, input.dtype()),
            output.data(), length,
            platform::ToNCCLDataType(input.dtype()), comm, stream);
      },
      CommType::ALLGATHER,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        size_t offset = 0;
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          offset += input.numel() / size_;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        size_t offset = 0;
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          offset += input.numel() / size_;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll_Single(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    std::vector<int64_t>& in_sizes,
    std::vector<int64_t>& out_sizes) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
                          true,
                          platform::errors::InvalidArgument(
                              "The dtypes of input and output must be equal."));

        std::vector<int64_t> in_dims = phi::vectorize(input.dims());
        std::vector<int64_t> out_dims = phi::vectorize(output.dims());
        CheckSplitSizes(&in_sizes, in_dims);
        CheckSplitSizes(&out_sizes, out_dims);

        size_t in_offset = 0, out_offset = 0;
        size_t in_length = 0, out_length = 0;
        size_t in_row_size = input.numel() / in_dims[0];
        size_t out_row_size = output.numel() / out_dims[0];

        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          in_length = in_sizes[i] * in_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), in_offset, input.dtype()),
              in_length,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          in_offset += in_length;

          out_length = out_sizes[i] * out_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), out_offset, input.dtype()),
              out_length,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          out_offset += out_length;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL_SINGLE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAllSingle(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    std::vector<int64_t>& in_sizes,
    std::vector<int64_t>& out_sizes,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
                          true,
                          platform::errors::InvalidArgument(
                              "The dtypes of input and output must be equal."));

        std::vector<int64_t> in_dims = phi::vectorize(input.dims());
        std::vector<int64_t> out_dims = phi::vectorize(output.dims());
        CheckSplitSizes(&in_sizes, in_dims);
        CheckSplitSizes(&out_sizes, out_dims);

        size_t in_offset = 0, out_offset = 0;
        size_t in_length = 0, out_length = 0;
        size_t in_row_size = input.numel() / in_dims[0];
        size_t out_row_size = output.numel() / out_dims[0];

        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          in_length = in_sizes[i] * in_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), in_offset, input.dtype()),
              in_length,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          in_offset += in_length;

          out_length = out_sizes[i] * out_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), out_offset, input.dtype()),
              out_length,
              platform::ToNCCLDataType(input.dtype()), i, comm, stream));
          out_offset += out_length;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL_SINGLE,
      sync_op,
      use_calc_stream);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()),
            ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
      },
      CommType::REDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](const phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
            input.data(), output.data(), input.numel(),
            platform::ToNCCLDataType(input.dtype()),
            ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
      },
      CommType::REDUCE,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::ReduceScatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceScatterOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        if (FLAGS_use_stream_safe_cuda_allocator) {
          platform::CUDADeviceGuard cuda_guard;
          cuda_guard.SetDevice(output.place());
          memory::RecordStream(output.Holder(), stream);
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
            input.data(), output.data(), output.numel(),
            platform::ToNCCLDataType(input.dtype()),
            ToNCCLRedType(opts.reduce_op), comm, stream));
      },
      CommType::REDUCE_SCATTER,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        size_t offset = 0;
        if (rank_ == opts.root_rank) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
          for (auto i = 0; i < size_; i++) {
            PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
                GetPointerByOffset(input.data(), offset, input.dtype()),
                input.numel() / size_,
                platform::ToNCCLDataType(input.dtype()), i, comm, stream));
            offset += input.numel() / size_;
          }
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(), input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm,
              stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
        } else {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(), input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm,
              stream));
        }
      },
      CommType::SCATTER);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_EQ(
            output.numel(),
            input.numel() / size_,
            platform::errors::InvalidArgument(
                "Input and output tensors should have the same shape."));
        size_t offset = 0;
        if (rank_ == opts.root_rank) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
          for (auto i = 0; i < size_; i++) {
            PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
                GetPointerByOffset(input.data(), offset, input.dtype()),
                input.numel() / size_,
                platform::ToNCCLDataType(input.dtype()), i, comm, stream));
            offset += input.numel() / size_;
          }
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(), input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm,
              stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
        } else {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(), input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm,
              stream));
        }
      },
      CommType::SCATTER,
      sync_op,
      use_calc_stream);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::_ReduceScatterBase(
    phi::DenseTensor& out_tensor,
    phi::DenseTensor& in_tensor,
    const ReduceScatterOptions& opts) {
  // auto tensor = out_tensors.back();
  PADDLE_ENFORCE_EQ(
      out_tensor.dtype(),
      in_tensor.dtype(),
      platform::errors::InvalidArgument(
          "Input tensor and output tensor should be same dtype."));

  PADDLE_ENFORCE_EQ(
      out_tensor.numel() * size_,
      in_tensor.numel(),
      platform::errors::InvalidArgument("input tensor must be the same size as "
                                        "output tensor size times world_size"));

  auto inputs = std::vector<phi::DenseTensor>{in_tensor};
  auto outputs = std::vector<phi::DenseTensor>{out_tensor};

  return Collective(
      inputs,
      outputs,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        if (FLAGS_use_stream_safe_cuda_allocator) {
          platform::CUDADeviceGuard cuda_guard;
          cuda_guard.SetDevice(output.place());
          memory::RecordStream(output.Holder(), stream);
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
            input.data(), output.data(), output.numel(),
            platform::ToNCCLDataType(input.dtype()),
            ToNCCLRedType(opts.reduce_op), comm, stream));
      },
      CommType::REDUCE_SCATTER);
}

void ProcessGroupNCCL::GroupStart() {
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
}

void ProcessGroupNCCL::GroupEnd() {
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
}

ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
  std::vector<Place> places = {place};
  const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
  PADDLE_ENFORCE_NE(iter,
                    places_to_ncclcomm_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find nccl comm in process group."));
  return iter->second[0]->GetNcclComm();
}

phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext(
    const Place& place) const {
  return GetDeviceContext(place, /*use_calc_stream*/ false);
}

phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext(
    const Place& place, bool use_calc_stream) const {
  if (use_calc_stream) {
    return platform::DeviceContextPool::Instance().Get(place);
  } else {
    std::vector<Place> places = {place};
    const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places));
    PADDLE_ENFORCE_NE(iter,
                      places_to_ctx_.end(),
                      platform::errors::InvalidArgument(
                          "Cannot find device context in process group."));
    return iter->second[0].get();
  }
}

}  // namespace distributed
}  // namespace paddle
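In the AllToAll and Scatter paths above, GetPointerByOffset advances a raw pointer by `offset` elements of the tensor's dtype, and the loop then moves `offset` forward by `numel() / size_` per peer. The snippet below is a small host-side sketch of that per-rank chunking arithmetic in plain C++ (no NCCL involved); the buffer and sizes are illustrative only.

// Host-side sketch of the per-rank offset arithmetic used by AllToAll above.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::size_t world_size = 4;
  const std::size_t numel = 16;                  // total elements in the tensor
  const std::size_t chunk = numel / world_size;  // elements exchanged per peer

  std::vector<float> buffer(numel, 0.f);
  std::size_t offset = 0;
  for (std::size_t peer = 0; peer < world_size; ++peer) {
    // GetPointerByOffset(buffer.data(), offset, FLOAT32) amounts to this:
    float* chunk_ptr = buffer.data() + offset;
    std::cout << "peer " << peer << " gets elements [" << offset << ", "
              << offset + chunk << ") at " << static_cast<void*>(chunk_ptr)
              << "\n";
    offset += chunk;  // mirrors `offset += input.numel() / size_`
  }
  return 0;
}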
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
constexpr const char* NCCL_BACKEND_NAME = "NCCL";

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;

class ProcessGroupNCCL : public ProcessGroupStream {
 public:
  class NCCLTask : public ProcessGroupStream::TaskStream,
                   public std::enable_shared_from_this<NCCLTask> {
   public:
    NCCLTask(const std::vector<Place>& places,
             int rank,
             CommType CommType,
             const std::vector<phi::DenseTensor>& inputs);

    NCCLTask(const std::vector<Place>& places,
             int rank,
             CommType comm_type,
             const std::vector<phi::DenseTensor>& inputs,
             bool sync_op,
             bool use_calc_stream);

    bool IsCompleted();

    void SynchronizeStreams();

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize();

    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT

    virtual ~NCCLTask();

    std::vector<EventManager> control_events_;
    std::vector<phi::DenseTensor> barrierTensors_;

   protected:
    std::vector<Place> places_;
    std::vector<std::shared_ptr<NCCLCommManager>> ncclComms_;
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;

   private:
  };

  ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                   int rank,
                   int size,
                   const platform::Place& place,
                   int gid);

  const std::string GetBackendName() const override {
    return std::string(NCCL_BACKEND_NAME);
  }

  phi::DeviceContext* GetDeviceContext(const Place& place) const override;

  phi::DeviceContext* GetDeviceContext(const Place& place,
                                       bool use_calc_stream) const override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const AllreduceOptions& options,
      bool sync_op,
      bool use_calc_stream) override;

  // TODO(liyurui): This API will be moved later
  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors, int dst_rank) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,
      int dst_rank,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors, int src_rank) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,
      int src_rank,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
                                                   int dst_rank,
                                                   int64_t offset,
                                                   int64_t length) override;

  std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,
      int dst_rank,
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
                                                   int src_rank,
                                                   int64_t offset,
                                                   int64_t length) override;

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,
      int src_rank,
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      int64_t offset,
      int64_t length) override;

  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
      std::vector<phi::DenseTensor>& in,
      std::vector<phi::DenseTensor>& out,
      std::vector<int64_t>& in_sizes,
      std::vector<int64_t>& out_sizes) override;

  std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      std::vector<int64_t>& in_sizes,
      std::vector<int64_t>& out_sizes,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceScatterOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions& opts,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
      phi::DenseTensor&,  // NOLINT
      phi::DenseTensor&,  // NOLINT
      const ReduceScatterOptions&) override;

  static void GroupStart();

  static void GroupEnd();

  ncclComm_t NCCLComm(const Place& place) const;

 protected:
  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType op_type,
      const std::vector<phi::DenseTensor>& inputs);

  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      const std::vector<Place>& places,
      int rank,
      CommType op_type,
      const std::vector<phi::DenseTensor>& inputs,
      bool sync_op,
      bool use_calc_stream);

 protected:
  std::shared_ptr<Store> store_;
  std::shared_ptr<NCCLCommManager> nccl_comm_;
  std::mutex mutex_;
  std::unordered_map<std::string,
                     std::vector<std::shared_ptr<NCCLCommManager>>>
      places_to_ncclcomm_;

  std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;

  std::unordered_map<std::string,
                     std::vector<std::unique_ptr<phi::GPUContext>>>
      places_to_ctx_;

  std::set<int> used_place_ids_;

 private:
  void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids,  // NOLINT
                   int root,                             // NOLINT
                   int server_fd);

  void BroadcastUniqueNCCLID(std::vector<ncclUniqueId>& nccl_ids);  // NOLINT

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  template <typename Fn>
  std::shared_ptr<ProcessGroupStream::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType comm_type,
      bool sync_op,
      bool use_calc_stream);

  template <typename Fn>
  void Collective(const phi::DenseTensor*,
                  phi::DenseTensor*,
                  Fn fn,
                  CommType op_type);

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> PointToPoint(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      Fn fn,
      int dst_rank,
      CommType op_type);

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> PointToPoint(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      Fn fn,
      int dst_rank,
      CommType op_type,
      bool sync_op,
      bool use_calc_stream);

  void CreateNCCLManagerCache(const std::string& places_key,
                              const std::vector<Place>& places);

  void CheckSplitSizes(std::vector<int64_t>* split_sizes,
                       std::vector<int64_t> tensor_shape);
};

}  // namespace distributed
}  // namespace paddle
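The header above keeps one communicator list per unique set of places (`places_to_ncclcomm_`), keyed by a string, alongside per-place events and device contexts. Below is a rough, stand-alone sketch of that caching idea; the key format, the `FakeComm` type, and the `KeyFromDeviceIds` helper are invented stand-ins for illustration only, not Paddle APIs.

#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

struct FakeComm {};  // stand-in for a per-device communicator manager

// Build a lookup key from the device ids participating in a collective.
static std::string KeyFromDeviceIds(const std::vector<int>& device_ids) {
  std::ostringstream oss;
  for (int id : device_ids) {
    oss << id << ",";  // e.g. "0,1,"
  }
  return oss.str();
}

int main() {
  std::unordered_map<std::string, std::vector<std::shared_ptr<FakeComm>>> cache;
  std::vector<int> devices = {0, 1};
  const std::string key = KeyFromDeviceIds(devices);
  if (cache.find(key) == cache.end()) {
    // First collective on this device set: create and cache the comms.
    cache[key] = {std::make_shared<FakeComm>(), std::make_shared<FakeComm>()};
  }
  // Later collectives on the same device set reuse cache[key] directly.
  return 0;
}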
paddle/fluid/distributed/collective/ProcessGroupStream.cc
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
namespace paddle {
namespace distributed {

ProcessGroupStream::ProcessGroupStream(int rank,
                                       int size,
                                       const platform::Place& place,
                                       int gid)
    : ProcessGroup(rank, size, place, gid) {}

phi::DeviceContext* ProcessGroupStream::GetDeviceContext(
    const Place& place, bool use_calc_stream) const {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support get device_context.",
      GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
    bool sync_op) {
  return AllGather(input_tensors,
                   output_tensors,
                   sync_op,
                   /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do all_gather", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
    const AllreduceOptions& options,
    bool sync_op) {
  return AllReduce(input_tensors,
                   output_tensors,
                   options,
                   sync_op,
                   /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
    const AllreduceOptions& options,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do all_reduce", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    bool sync_op) {
  return AllToAll(in_tensors,
                  out_tensors,
                  sync_op,
                  /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do alltoall", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAllSingle(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    std::vector<int64_t>& in_sizes,
    std::vector<int64_t>& out_sizes,
    bool sync_op) {
  return AllToAllSingle(in_tensors,
                        out_tensors,
                        in_sizes,
                        out_sizes,
                        sync_op,
                        /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAllSingle(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    std::vector<int64_t>& in_sizes,
    std::vector<int64_t>& out_sizes,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do alltoall_single", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts,
    bool sync_op) {
  return Broadcast(in_tensors,
                   out_tensors,
                   opts,
                   sync_op,
                   /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do broadcast", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts,
    bool sync_op) {
  return Reduce(in_tensors,
                out_tensors,
                opts,
                sync_op,
                /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do reduce", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceScatterOptions& opts,
    bool sync_op) {
  return ReduceScatter(in_tensors,
                       out_tensors,
                       opts,
                       sync_op,
                       /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceScatterOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do reduce_scatter", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts,
    bool sync_op) {
  return Scatter(in_tensors,
                 out_tensors,
                 opts,
                 sync_op,
                 /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do scatter", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
    std::vector<phi::DenseTensor>& tensors, int dst_rank, bool sync_op) {
  return Send(tensors,
              dst_rank,
              sync_op,
              /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
    std::vector<phi::DenseTensor>& tensors,
    int dst_rank,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do send", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int64_t offset,
    int64_t length,
    bool sync_op) {
  return Send_Partial(tensors,
                      dst_rank,
                      offset,
                      length,
                      sync_op,
                      /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do send_partial", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
    std::vector<phi::DenseTensor>& tensors, int src_rank, bool sync_op) {
  return Recv(tensors,
              src_rank,
              sync_op,
              /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
    std::vector<phi::DenseTensor>& tensors,
    int src_rank,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do recv", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int64_t offset,
    int64_t length,
    bool sync_op) {
  return Recv_Partial(tensors,
                      src_rank,
                      offset,
                      length,
                      sync_op,
                      /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do recv_partial", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length,
    bool sync_op) {
  return AllGather_Partial(in_tensors,
                           out_tensors,
                           offset,
                           length,
                           sync_op,
                           /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do all_gather_partial",
      GetBackendName()));
}

}  // namespace distributed
}  // namespace paddle
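Every collective in this file follows the same shape: the overload without `use_calc_stream` forwards to the full overload with `/*use_calc_stream*/ false`, and the full overload in the base class only throws until a concrete backend (such as ProcessGroupNCCL above) overrides it. The stand-alone sketch below reproduces that forwarding pattern with hypothetical class and method names; it is an illustration, not Paddle code.

#include <iostream>
#include <stdexcept>

class StreamGroupBase {
 public:
  virtual ~StreamGroupBase() = default;

  // Short entry point: always delegates with use_calc_stream = false.
  void AllGather(bool sync_op) {
    AllGather(sync_op, /*use_calc_stream=*/false);
  }

  // Full overload: the base class just reports "unsupported"; backends override it.
  virtual void AllGather(bool sync_op, bool use_calc_stream) {
    throw std::runtime_error("this backend does not support all_gather");
  }
};

class MyBackend : public StreamGroupBase {
 public:
  using StreamGroupBase::AllGather;  // keep the short overload visible
  void AllGather(bool sync_op, bool use_calc_stream) override {
    std::cout << "all_gather: sync_op=" << sync_op
              << " use_calc_stream=" << use_calc_stream << "\n";
  }
};

int main() {
  MyBackend pg;
  pg.AllGather(true);        // delegates with use_calc_stream = false
  pg.AllGather(true, true);  // caller opts into the calculation stream
  return 0;
}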
paddle/fluid/distributed/collective/ProcessGroupStream.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace paddle {
namespace distributed {

// NOTE(liyurui): Notice that some backends use `stream` as an abstract
// conception of hardware resources. We provide this base class allowing users
// to put communications on the calculation stream. In some scenarios, we found
// this will save the time of switching streams.
class ProcessGroupStream : public ProcessGroup {
 public:
  class TaskStream : public ProcessGroup::Task {
   public:
    // TODO(liyurui): This constructor is temporary here for compatible reason,
    // will be deleted soon.
    TaskStream(int rank,
               const std::vector<phi::DenseTensor>& inputs,
               CommType comm_type)
        : Task(rank, inputs, comm_type) {}

    TaskStream(int rank,
               const std::vector<phi::DenseTensor>& inputs,
               CommType comm_type,
               bool sync_op,
               bool use_calc_stream)
        : Task(rank, inputs, comm_type, sync_op),
          use_calc_stream_(use_calc_stream) {}

    virtual ~TaskStream() = default;

   protected:
    bool UseCalcStream() const { return use_calc_stream_; }

   private:
    bool use_calc_stream_{false};
  };

  ProcessGroupStream(int rank, int size, const platform::Place& place, int gid);

  virtual ~ProcessGroupStream() = default;

  virtual phi::DeviceContext* GetDeviceContext(const Place& place,
                                               bool use_calc_stream) const;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
      const AllreduceOptions& options,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
      const AllreduceOptions& options,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      std::vector<int64_t>& in_sizes,              // NOLINT
      std::vector<int64_t>& out_sizes,             // NOLINT
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      std::vector<int64_t>& in_sizes,              // NOLINT
      std::vector<int64_t>& out_sizes,             // NOLINT
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const BroadcastOptions& opts,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const BroadcastOptions& opts,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ReduceOptions& opts,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ReduceOptions& opts,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ReduceScatterOptions& opts,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ReduceScatterOptions& opts,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ScatterOptions& opts,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      const ScatterOptions& opts,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int dst_rank,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int dst_rank,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int dst_rank,
      int64_t offset,
      int64_t length,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int dst_rank,
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int src_rank,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int src_rank,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int src_rank,
      int64_t offset,
      int64_t length,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int src_rank,
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      int64_t offset,
      int64_t length,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      int64_t offset,
      int64_t length,
      bool sync_op,
      bool use_calc_stream);
};

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/Types.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
namespace paddle {
namespace distributed {

// TODO(shenliang03): To support AVG for reduce
enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT };

struct AllreduceOptions {
  ReduceOp reduce_op = ReduceOp::SUM;
};

struct BroadcastOptions {
  int source_rank = 0;
  int source_root = 0;
};

struct BarrierOptions {
  std::vector<int> place_ids;
};

struct ReduceOptions {
  ReduceOp reduce_op = ReduceOp::SUM;
  int root_rank = 0;
};

struct ScatterOptions {
  int root_rank = 0;
};

struct ReduceScatterOptions {
  ReduceOp reduce_op = ReduceOp::SUM;
};

}  // namespace distributed
}  // namespace paddle
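A small usage sketch of the option structs defined above; it assumes only that this Types.h is available on the include path and exercises nothing beyond the fields declared in it.

#include "paddle/fluid/distributed/collective/Types.h"

void ConfigureOptionsExample() {
  using paddle::distributed::AllreduceOptions;
  using paddle::distributed::ReduceOp;
  using paddle::distributed::ReduceOptions;

  AllreduceOptions allreduce_opts;           // defaults to ReduceOp::SUM
  allreduce_opts.reduce_op = ReduceOp::MAX;  // switch the reduction

  ReduceOptions reduce_opts;
  reduce_opts.reduce_op = ReduceOp::SUM;
  reduce_opts.root_rank = 0;  // rank that receives the reduced result
}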
paddle/fluid/distributed/collective/Utils.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
namespace paddle {
namespace distributed {

template <typename DeviceContext, typename T>
struct ConcatDenseTensor {
  void operator()(const DeviceContext *context,
                  const std::vector<phi::DenseTensor> &in,
                  phi::DenseTensor *out,
                  int axis = 0) {
    phi::funcs::ConcatFunctor<DeviceContext, T> concat_functor;
    concat_functor(*context, in, axis, out);
  }
};

template <typename DeviceContext, typename T>
struct SplitDenseTensor {
  void operator()(const DeviceContext *context,
                  const phi::DenseTensor &in,
                  std::vector<phi::DenseTensor *> *out,
                  int axis = 0) {
    std::vector<const phi::DenseTensor *> shape_refer;
    shape_refer.reserve(out->size());
    for (auto *p_tensor : *out) {
      shape_refer.emplace_back(p_tensor);
    }
    phi::funcs::SplitFunctor<DeviceContext, T> split_functor;
    split_functor(*context, in, shape_refer, axis, out);
  }
};

#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <typename T>
struct ConcatDenseTensor<platform::CustomDeviceContext, T> {
  void operator()(const platform::CustomDeviceContext *context,
                  const std::vector<phi::DenseTensor> &in,
                  phi::DenseTensor *out,
                  int axis = 0) {
    auto *out_data = out->data<T>();
    auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace());
    size_t offset = 0;
    for (const auto &tensor : in) {
      const auto *in_data = tensor.data<T>();
      auto sz = tensor.numel() * sizeof(T);
      device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr);
      offset += sz;
    }
  }
};

template <typename T>
struct SplitDenseTensor<platform::CustomDeviceContext, T> {
  void operator()(const platform::CustomDeviceContext *context,
                  const phi::DenseTensor &in,
                  std::vector<phi::DenseTensor *> *out,
                  int axis = 0) {
    auto *in_data = in.data<T>();
    auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace());
    size_t offset = 0;
    for (auto *p_tensor : *out) {
      auto *out_data = p_tensor->data<T>();
      auto sz = p_tensor->numel() * sizeof(T);
      device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr);
      offset += sz;
    }
  }
};
#endif

template <typename DeviceContext>
void ConcatDenseTensorWithType(const DeviceContext *dev_ctx,
                               const std::vector<phi::DenseTensor> &t_list,
                               phi::DenseTensor *p_out,
                               phi::DataType type) {
  switch (type) {
    case phi::DataType::BOOL:
      ConcatDenseTensor<DeviceContext, bool>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::UINT8:
      ConcatDenseTensor<DeviceContext, uint8_t>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::INT8:
      ConcatDenseTensor<DeviceContext, int8_t>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::INT32:
      ConcatDenseTensor<DeviceContext, int32_t>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::INT64:
      ConcatDenseTensor<DeviceContext, int64_t>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::FLOAT16:
      ConcatDenseTensor<DeviceContext, platform::float16>()(
          dev_ctx, t_list, p_out);
      break;
    case phi::DataType::FLOAT32:
      ConcatDenseTensor<DeviceContext, float>()(dev_ctx, t_list, p_out);
      break;
    case phi::DataType::FLOAT64:
      ConcatDenseTensor<DeviceContext, double>()(dev_ctx, t_list, p_out);
      break;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Data type (%s) is not supported when it concats tensors.", type));
  }
}

template <typename DeviceContext>
void SplitDenseTensorWithType(const DeviceContext *dev_ctx,
                              const phi::DenseTensor &t_in,
                              std::vector<phi::DenseTensor *> *p_list,
                              phi::DataType type) {
  switch (type) {
    case phi::DataType::BOOL:
      SplitDenseTensor<DeviceContext, bool>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::UINT8:
      SplitDenseTensor<DeviceContext, uint8_t>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::INT8:
      SplitDenseTensor<DeviceContext, int8_t>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::INT32:
      SplitDenseTensor<DeviceContext, int32_t>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::INT64:
      SplitDenseTensor<DeviceContext, int64_t>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::FLOAT16:
      SplitDenseTensor<DeviceContext, platform::float16>()(
          dev_ctx, t_in, p_list);
      break;
    case phi::DataType::FLOAT32:
      SplitDenseTensor<DeviceContext, float>()(dev_ctx, t_in, p_list);
      break;
    case phi::DataType::FLOAT64:
      SplitDenseTensor<DeviceContext, double>()(dev_ctx, t_in, p_list);
      break;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Data type (%s) is not supported when it splits tensors.", type));
  }
}

void ConcatTensor(const phi::DeviceContext *dev_ctx,
                  const std::vector<phi::DenseTensor> &tensor_list,
                  const experimental::Tensor *tensor) {
  auto *dense_tensor =
      std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl()).get();

  const auto &place = dev_ctx->GetPlace();
  if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    ConcatDenseTensorWithType(static_cast<const phi::GPUContext *>(dev_ctx),
                              tensor_list,
                              dense_tensor,
                              tensor->dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat tensor since it's not support GPU, please "
        "recompile or reinstall Paddle with GPU support."));
#endif
  } else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    ConcatDenseTensorWithType(
        static_cast<const platform::CustomDeviceContext *>(dev_ctx),
        tensor_list,
        dense_tensor,
        tensor->dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat tensor since it's not compiled with "
        "CUSTOM_DEVICE, please recompile or reinstall Paddle with "
        "CUSTOM_DEVICE support."));
#endif
  } else if (platform::is_cpu_place(place)) {
    ConcatDenseTensorWithType(static_cast<const phi::CPUContext *>(dev_ctx),
                              tensor_list,
                              dense_tensor,
                              tensor->dtype());
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Concat tensor not supported on place (%s)", place));
  }
}

void SplitTensor(const phi::DeviceContext *dev_ctx,
                 const phi::DenseTensor &tensor,
                 const std::vector<experimental::Tensor> *tensor_list) {
  std::vector<phi::DenseTensor *> dense_list;
  for (auto &tensor : *tensor_list) {
    auto p_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl()).get();
    dense_list.emplace_back(p_tensor);
  }

  const auto &place = dev_ctx->GetPlace();
  if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    SplitDenseTensorWithType(static_cast<const phi::GPUContext *>(dev_ctx),
                             tensor,
                             &dense_list,
                             tensor.dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split tensor since it's not support GPU, please "
        "recompile or reinstall Paddle with GPU support."));
#endif
  } else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    SplitDenseTensorWithType(
        static_cast<const platform::CustomDeviceContext *>(dev_ctx),
        tensor,
        &dense_list,
        tensor.dtype());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split tensor since it's not compiled with CUSTOM_DEVICE, "
        "please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
  } else if (platform::is_cpu_place(place)) {
    SplitDenseTensorWithType(static_cast<const phi::CPUContext *>(dev_ctx),
                             tensor,
                             &dense_list,
                             tensor.dtype());
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Split tensor not supported on place (%s)", place));
  }
}

}  // namespace distributed
}  // namespace paddle
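ConcatTensor and SplitTensor above dispatch twice: first on the place (GPU, custom device, CPU) to pick a device context, then on the runtime dtype to instantiate the matching ConcatDenseTensor/SplitDenseTensor specialization. The stand-alone sketch below shows only the second step, the dtype-to-template dispatch, using an invented enum and functor rather than Paddle's types.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

enum class DType { INT32, FLOAT32 };

// Templated functor: one instantiation per element type, like the
// ConcatDenseTensor<DeviceContext, T> specializations above.
template <typename T>
struct SumFunctor {
  T operator()(const std::vector<T>& v) const {
    T acc{};
    for (const T& x : v) acc += x;
    return acc;
  }
};

// Runtime dtype tag selects which instantiation actually touches the data.
void SumWithType(DType dtype) {
  switch (dtype) {
    case DType::INT32:
      std::cout << SumFunctor<int32_t>()({1, 2, 3}) << "\n";
      break;
    case DType::FLOAT32:
      std::cout << SumFunctor<float>()({1.5f, 2.5f}) << "\n";
      break;
    default:
      throw std::runtime_error("dtype not supported");
  }
}

int main() {
  SumWithType(DType::INT32);    // prints 6
  SumWithType(DType::FLOAT32);  // prints 4
  return 0;
}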
paddle/fluid/distributed/collective/reducer.cc
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"
namespace paddle {
namespace distributed {

static Backend TransToBackend(platform::Place place) {
  static const std::map<phi::AllocationType, Backend> type_backend = {
      {phi::AllocationType::GPU, Backend::GPU},
      {phi::AllocationType::CPU, Backend::CPU},
  };

  phi::AllocationType type = place.GetType();
  auto it = type_backend.find(type);
  PADDLE_ENFORCE_EQ(it != type_backend.end(),
                    true,
                    platform::errors::InvalidArgument(
                        "Place type (%s) is not supported. ", place));
  return it->second;
}

std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
    const std::vector<Tensor> tensors,
    const std::vector<bool> &is_sparse_gradient,
    const std::vector<size_t> &group_size_limits,
    const std::vector<int64_t> &tensor_indices) {
  PADDLE_ENFORCE_EQ(
      tensors.size(),
      is_sparse_gradient.size(),
      platform::errors::PreconditionNotMet(
          "tensors len must be equal to is_sparse_gradient len, but "
          "[%lu] != [%lu]",
          tensors.size(),
          is_sparse_gradient.size()));
  auto check_perm = [](const std::vector<int64_t> &x) -> bool {
    size_t len = x.size();
    std::vector<size_t> cnt(len, 0);
    for (size_t i = 0; i < len; ++i) {
      if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
        return false;
      }
      cnt[x[i]]++;
    }
    return true;
  };

  PADDLE_ENFORCE_EQ(true,
                    check_perm(tensor_indices),
                    platform::errors::PreconditionNotMet(
                        "tensor_indices must be a permutation from 0 to %lu",
                        tensor_indices.size()));

  // the return vector
  std::vector<std::vector<size_t>> res;

  // Key: the var type
  // Value: should use which index in group_size_limits for group size limit
  std::map<experimental::DataType, size_t> group_limit_index;

  // Key: the var type
  // Value: <the var index in input tensors, total numel in this group>
  std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
      next_group;

  for (size_t i = 0; i < tensors.size(); ++i) {
    const auto &var = tensors[i];

    size_t tensor_real_index = i;
    if (!tensor_indices.empty()) {
      tensor_real_index = tensor_indices[i];
    }

    if (is_sparse_gradient[tensor_real_index]) {
      // we keep sparse var a single group
      res.push_back({tensor_real_index});
      continue;
    }

    const auto &var_dtype = var.dtype();
    VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
    auto &group_info = next_group[var_dtype];

    int64_t var_size = -1;

    if (var.is_dense_tensor()) {
      var_size =
          std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
    } else {
      VLOG(3) << "var " << var.name()
              << " is not tensor or selected_rows, so skip it";
      continue;
    }

    group_info.first.push_back(tensor_real_index);
    group_info.second += experimental::SizeOf(var_dtype) * var_size;
    // group_info.second += framework::SizeOfType(var_dtype) * var_size;

    if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
      // means it is the first var of var_dtype
      group_limit_index[var_dtype] = 0;
    }
    auto &cur_limit_index = group_limit_index[var_dtype];
    if (group_info.second >= group_size_limits[cur_limit_index]) {
      // exceed group capacity and create a new group
      res.emplace_back(std::move(group_info.first));
      group_info = std::pair<std::vector<size_t>, size_t>();
      cur_limit_index =
          (std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
    }
  }

  // add the final groups
  for (auto &e : next_group) {
    auto &group_info = e.second;
    if (!group_info.first.empty()) {
      res.emplace_back(std::move(group_info.first));
    }
  }

  for (const auto &group_index : res) {
    PADDLE_ENFORCE_NE(
        group_index.empty(),
        true,
        platform::errors::PreconditionNotMet(
            "AssignGroupBySize construct empty group, please check."));
  }

  if (tensor_indices.empty()) {
    std::sort(res.begin(),
              res.end(),
              [](const std::vector<size_t> &x, const std::vector<size_t> &y) {
                return x.front() < y.front();
              });
  }

  return res;
}
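Eager_AssignGroupBySize packs gradients into fusion buckets: sparse gradients get their own single-tensor group, dense gradients of the same dtype accumulate until the running byte count reaches the current entry of group_size_limits, and any leftovers form a final group. The sketch below is a deliberately simplified, hypothetical version of that bucketing loop (single dtype, no sparse handling, one fixed limit); it illustrates the idea and is not the Paddle implementation.

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::vector<size_t>> AssignGroupsBySize(
    const std::vector<size_t>& tensor_bytes, size_t group_limit_bytes) {
  std::vector<std::vector<size_t>> groups;
  std::vector<size_t> current;
  size_t current_bytes = 0;
  for (size_t i = 0; i < tensor_bytes.size(); ++i) {
    current.push_back(i);
    current_bytes += tensor_bytes[i];
    if (current_bytes >= group_limit_bytes) {
      // Exceeded the bucket capacity: close this group and start a new one.
      groups.push_back(current);
      current.clear();
      current_bytes = 0;
    }
  }
  if (!current.empty()) groups.push_back(current);  // the final partial group
  return groups;
}

int main() {
  // Four gradients of 4, 12, 20 and 8 bytes with a 16-byte bucket limit
  // produce the groups {0, 1}, {2}, {3}.
  auto groups = AssignGroupsBySize({4, 12, 20, 8}, 16);
  for (const auto& g : groups) {
    for (size_t idx : g) std::cout << idx << ' ';
    std::cout << '\n';
  }
  return 0;
}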
template
<
typename
DeviceContext
,
typename
T
>
struct
ConcatTensorsForAllReduce
{
void
operator
()(
const
DeviceContext
&
context
,
const
std
::
vector
<
phi
::
DenseTensor
>
&
dense_tensors_
,
Tensor
*
p_dense_contents
)
{
operators
::
math
::
ConcatFunctor
<
DeviceContext
,
T
>
concat_functor_
;
concat_functor_
(
context
,
dense_tensors_
,
0
,
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
p_dense_contents
->
impl
())
.
get
());
}
};
template
<
typename
DeviceContext
,
typename
T
>
struct
SplitTensorsForAllReduce
{
void
operator
()(
const
DeviceContext
&
context
,
Tensor
*
p_dense_contents
,
std
::
vector
<
phi
::
DenseTensor
>
*
p_dense_tensors
)
{
auto
*
in
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
p_dense_contents
->
impl
())
.
get
();
std
::
vector
<
phi
::
DenseTensor
*>
outs
;
std
::
vector
<
const
phi
::
DenseTensor
*>
shape_refer
;
outs
.
reserve
(
p_dense_tensors
->
size
());
shape_refer
.
reserve
(
p_dense_tensors
->
size
());
for
(
auto
&
tensor
:
*
p_dense_tensors
)
{
outs
.
emplace_back
(
&
tensor
);
shape_refer
.
emplace_back
(
&
tensor
);
}
operators
::
math
::
SplitFunctor
<
DeviceContext
,
T
>
split_functor_
;
split_functor_
(
context
,
*
in
,
shape_refer
,
0
,
&
outs
);
}
};
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// note(wangran16): A temporary solution for all backends.
template
<
typename
T
>
struct
ConcatTensorsForAllReduce
<
platform
::
CustomDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CustomDeviceContext
&
context
,
const
std
::
vector
<
phi
::
DenseTensor
>
&
dense_tensors_
,
Tensor
*
p_dense_contents
)
{
phi
::
DeviceGuard
guard
(
context
.
GetPlace
());
auto
*
out
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
p_dense_contents
->
impl
())
.
get
();
uint8_t
*
out_data
=
reinterpret_cast
<
uint8_t
*>
(
out
->
data
<
T
>
());
auto
*
device
=
phi
::
DeviceManager
::
GetDeviceWithPlace
(
context
.
GetPlace
());
size_t
offset
=
0
;
for
(
const
auto
&
tensor
:
dense_tensors_
)
{
const
uint8_t
*
in_data
=
reinterpret_cast
<
const
uint8_t
*>
(
tensor
.
data
<
T
>
());
auto
sz
=
tensor
.
numel
()
*
sizeof
(
T
);
device
->
MemoryCopyD2D
(
out_data
+
offset
,
in_data
,
sz
,
nullptr
);
offset
+=
sz
;
}
}
};
template
<
typename
T
>
struct
SplitTensorsForAllReduce
<
platform
::
CustomDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CustomDeviceContext
&
context
,
Tensor
*
p_dense_contents
,
std
::
vector
<
phi
::
DenseTensor
>
*
p_dense_tensors
)
{
auto
*
in
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
p_dense_contents
->
impl
())
.
get
();
uint8_t
*
in_data
=
reinterpret_cast
<
uint8_t
*>
(
in
->
data
<
T
>
());
auto
*
device
=
phi
::
DeviceManager
::
GetDeviceWithPlace
(
context
.
GetPlace
());
size_t
offset
=
0
;
for
(
auto
&
tensor
:
*
p_dense_tensors
)
{
uint8_t
*
out_data
=
reinterpret_cast
<
uint8_t
*>
(
tensor
.
data
<
T
>
());
auto
sz
=
tensor
.
numel
()
*
sizeof
(
T
);
device
->
MemoryCopyD2D
(
out_data
,
in_data
+
offset
,
sz
,
nullptr
);
offset
+=
sz
;
}
}
};
#endif
// context is used to select the stream for concat
template
<
typename
DeviceContext
>
static
void
ConcatTensorsWithType
(
const
DeviceContext
&
context
,
const
std
::
vector
<
phi
::
DenseTensor
>
&
dense_tensors_
,
Tensor
*
p_dense_contents
,
phi
::
DataType
type
)
{
switch
(
type
)
{
case
phi
::
DataType
::
FLOAT16
:
ConcatTensorsForAllReduce
<
DeviceContext
,
platform
::
float16
>
()(
context
,
dense_tensors_
,
p_dense_contents
);
break
;
case
phi
::
DataType
::
FLOAT32
:
ConcatTensorsForAllReduce
<
DeviceContext
,
float
>
()(
context
,
dense_tensors_
,
p_dense_contents
);
break
;
case
phi
::
DataType
::
FLOAT64
:
ConcatTensorsForAllReduce
<
DeviceContext
,
double
>
()(
context
,
dense_tensors_
,
p_dense_contents
);
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Data type (%s) is not supported when it concats tensors for "
"allreduce."
,
type
));
}
}
// context is used to select the stream for split
template
<
typename
DeviceContext
>
static
void
SplitTensorsWithType
(
const
DeviceContext
&
context
,
Tensor
*
p_dense_contents
,
std
::
vector
<
phi
::
DenseTensor
>
*
p_dense_tensors
,
phi
::
DataType
type
)
{
switch
(
type
)
{
case
phi
::
DataType
::
FLOAT16
:
SplitTensorsForAllReduce
<
DeviceContext
,
platform
::
float16
>
()(
context
,
p_dense_contents
,
p_dense_tensors
);
break
;
case
phi
::
DataType
::
FLOAT32
:
SplitTensorsForAllReduce
<
DeviceContext
,
float
>
()(
context
,
p_dense_contents
,
p_dense_tensors
);
break
;
case
phi
::
DataType
::
FLOAT64
:
SplitTensorsForAllReduce
<
DeviceContext
,
double
>
()(
context
,
p_dense_contents
,
p_dense_tensors
);
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Data type (%s) is not supported when it splits tensors for "
"allreduce."
,
type
));
}
}
void
EagerGroup
::
ConcatTensors
(
const
platform
::
Place
&
place
)
{
dense_contents_
=
paddle
::
experimental
::
empty
(
IntArray
({
all_length_
}),
dtype_
,
place
);
if
(
platform
::
is_gpu_place
(
place
))
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto
*
default_ctx
=
static_cast
<
phi
::
GPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
ConcatTensorsWithType
(
*
default_ctx
,
dense_tensors_
,
&
dense_contents_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat grad tensors since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."
));
#endif
}
else
if
(
platform
::
is_custom_place
(
place
))
{
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto
*
default_ctx
=
static_cast
<
platform
::
CustomDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
ConcatTensorsWithType
(
*
default_ctx
,
dense_tensors_
,
&
dense_contents_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat grad tensors since it's not compiled with "
"CUSTOM_DEVICE,"
"Please recompile or reinstall Paddle with CUSTOM_DEVICE support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
auto
*
default_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
ConcatTensorsWithType
(
*
default_ctx
,
dense_tensors_
,
&
dense_contents_
,
dtype_
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Concat grad tensor not supported on place (%s)"
,
place
));
}
}
void
EagerGroup
::
SplitTensors
(
const
platform
::
Place
&
place
)
{
if
(
platform
::
is_gpu_place
(
place
))
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto
*
default_ctx
=
static_cast
<
phi
::
GPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
SplitTensorsWithType
(
*
default_ctx
,
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split grad tensor since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."
));
#endif
}
else
if
(
platform
::
is_custom_place
(
place
))
{
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto
*
default_ctx
=
static_cast
<
platform
::
CustomDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
SplitTensorsWithType
(
*
default_ctx
,
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split grad tensor since it's not compiled with "
"CUSTOM_DEVICE,"
"Please recompile or reinstall Paddle with CUSTOM_DEVICE support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
auto
*
default_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
SplitTensorsWithType
(
*
default_ctx
,
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Split grad tensor not supported on place (%s)"
,
place
));
}
}
EagerReducer
::
EagerReducer
(
const
std
::
vector
<
Tensor
>
tensors
,
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
group_indices
,
const
std
::
vector
<
bool
>
&
is_sparse_gradient
,
std
::
shared_ptr
<
distributed
::
ProcessGroup
>
process_group
,
const
std
::
vector
<
size_t
>
&
group_size_limits
,
bool
find_unused_parameters
)
:
tensors_
(
tensors
),
group_indices_
(
group_indices
),
is_sparse_gradient_
(
is_sparse_gradient
),
process_group_
(
process_group
),
group_size_limits_
(
group_size_limits
),
find_unused_vars_each_step_
(
find_unused_parameters
)
{
VLOG
(
3
)
<<
"Start construct the Reducer ..."
;
nranks_
=
process_group_
->
GetSize
();
// initialize groups
InitializeGroups
(
group_indices
);
for
(
size_t
global_var_index
=
0
;
global_var_index
<
tensors_
.
size
();
++
global_var_index
)
{
auto
tensor
=
tensors_
[
global_var_index
];
auto
reduce_hook
=
[
=
](
void
)
->
void
{
this
->
AddDistHook
(
global_var_index
);
};
const
auto
&
grad_node
=
GetGradNodeFromTensor
(
&
tensor
);
PADDLE_ENFORCE
(
grad_node
.
get
()
!=
nullptr
,
paddle
::
platform
::
errors
::
Fatal
(
"Detected NULL grad_node,"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"
));
const
auto
&
accumulation_grad_node
=
std
::
dynamic_pointer_cast
<
egr
::
GradNodeAccumulation
>
(
grad_node
);
accumulation_grad_node
->
RegisterReduceHook
(
std
::
make_shared
<
egr
::
CppVoidHook
>
(
reduce_hook
));
gradnode_index_map_
[
grad_node
.
get
()]
=
global_var_index
;
}
vars_marked_ready_
.
resize
(
tensors_
.
size
(),
false
);
local_used_vars_
.
resize
(
tensors_
.
size
(),
0
);
if
(
find_unused_vars_each_step_
)
{
global_used_vars_
=
paddle
::
experimental
::
empty
(
IntArray
({
static_cast
<
int32_t
>
(
tensors_
.
size
())}),
DataType
::
INT32
,
inner_place_
);
}
}
std
::
shared_ptr
<
egr
::
GradNodeBase
>
EagerReducer
::
GetGradNodeFromTensor
(
Tensor
*
tensor
)
{
auto
*
autograd_meta
=
tensor
->
get_autograd_meta
();
const
auto
&
grad_node
=
static_cast
<
egr
::
AutogradMeta
*>
(
autograd_meta
)
->
GetMutableGradNode
();
return
grad_node
;
}
void
EagerReducer
::
InitializeGroups
(
const
std
::
vector
<
std
::
vector
<
size_t
>>
&
group_indices
)
{
VLOG
(
3
)
<<
"Start initialize groups .."
;
// clear the group
groups_
.
clear
();
groups_
.
reserve
(
group_indices
.
size
());
variable_locators_
.
clear
();
variable_locators_
.
resize
(
tensors_
.
size
());
auto
group_nums
=
group_indices
.
size
();
for
(
size_t
group_index
=
0
;
group_index
<
group_nums
;
++
group_index
)
{
const
auto
&
tensor_indices_
=
group_indices
[
group_index
];
PADDLE_ENFORCE_GT
(
tensor_indices_
.
size
(),
0
,
platform
::
errors
::
PreconditionNotMet
(
"The number of group[%d]'s elements is 0."
,
group_index
));
EagerGroup
group
;
// It's just for check the sparse or dense
auto
first_var
=
tensors_
[
tensor_indices_
.
front
()];
if
(
tensor_indices_
.
size
()
==
1
&&
is_sparse_gradient_
[
tensor_indices_
.
front
()])
{
// process the sparse gradient. one sparse, one group
group
.
dtype_
=
first_var
.
dtype
();
group
.
is_sparse_
=
true
;
}
else
{
// process the dense gradient.
InitializeDenseGroups
(
tensor_indices_
,
&
group
);
}
// map tensors to this group by VariableLocator
size_t
inside_group_index
=
0
;
for
(
const
auto
var_index
:
tensor_indices_
)
{
TensorLocator
tensor_locator
;
tensor_locator
.
group_index
=
group_index
;
tensor_locator
.
inside_group_index
=
inside_group_index
++
;
variable_locators_
[
var_index
]
=
tensor_locator
;
}
group
.
tensor_indices_
=
std
::
move
(
tensor_indices_
);
groups_
.
emplace_back
(
std
::
move
(
group
));
VLOG
(
3
)
<<
"The Group["
<<
group_index
<<
"]:"
<<
groups_
.
back
();
}
}
void
EagerReducer
::
InitializeDenseGroups
(
const
std
::
vector
<
size_t
>
&
tensor_indices_
,
EagerGroup
*
p_group
)
{
VLOG
(
3
)
<<
"InitializeDenseGroups."
;
int64_t
all_length
=
0
;
for
(
size_t
index
=
0
;
index
<
tensor_indices_
.
size
();
++
index
)
{
auto
tensor_index
=
tensor_indices_
[
index
];
auto
&
tensor
=
tensors_
[
tensor_index
];
auto
&
tensor_name
=
tensor
.
name
();
PADDLE_ENFORCE_EQ
(
is_sparse_gradient_
[
tensor_index
],
false
,
platform
::
errors
::
PreconditionNotMet
(
"Tensor %s's GRAD must be Tensor, but received "
"GRAD is SelectedRows"
,
tensor_name
));
PADDLE_ENFORCE_EQ
(
tensor
.
initialized
(),
true
,
platform
::
errors
::
PreconditionNotMet
(
"Tensor %s is not initialized."
,
tensor_name
));
const
auto
size
=
tensor
.
numel
();
PADDLE_ENFORCE_GT
(
size
,
0
,
platform
::
errors
::
PreconditionNotMet
(
"The number of tensor %s's elements is 0."
,
tensor_name
));
all_length
+=
size
;
p_group
->
length_
.
push_back
(
size
);
// for concat operator
p_group
->
origin_shapes_
.
push_back
(
IntArray
(
tensor
.
shape
()));
p_group
->
dense_tensors_
.
push_back
(
phi
::
DenseTensor
());
const
auto
&
dtype
=
tensor
.
dtype
();
const
auto
&
inner_place
=
tensor
.
impl
()
->
place
();
if
(
index
>
0
)
{
PADDLE_ENFORCE_EQ
(
dtype
,
p_group
->
dtype_
,
platform
::
errors
::
PreconditionNotMet
(
"Tensor %s has unexpected dtype."
,
tensor_name
));
}
else
{
p_group
->
dtype_
=
dtype
;
inner_place_
=
inner_place
;
}
}
p_group
->
all_length_
=
all_length
;
}
void
EagerReducer
::
TraverseBackwardGraph
(
const
std
::
vector
<
Tensor
>
&
outputs
)
{
std
::
queue
<
egr
::
GradNodeBase
*>
queue
;
std
::
set
<
egr
::
GradNodeBase
*>
visited
;
for
(
const
auto
&
output
:
outputs
)
{
auto
*
auto_grad_meta
=
static_cast
<
egr
::
AutogradMeta
*>
(
output
.
get_autograd_meta
());
if
(
!
auto_grad_meta
)
continue
;
auto
shared_grad_node
=
auto_grad_meta
->
GetMutableGradNode
();
if
(
shared_grad_node
==
nullptr
||
shared_grad_node
.
get
()
==
nullptr
||
auto_grad_meta
->
StopGradient
())
{
continue
;
}
egr
::
GradNodeBase
*
grad_node
=
shared_grad_node
.
get
();
queue
.
emplace
(
grad_node
);
}
while
(
!
queue
.
empty
())
{
egr
::
GradNodeBase
*
node
=
queue
.
front
();
queue
.
pop
();
const
paddle
::
small_vector
<
std
::
vector
<
egr
::
GradSlotMeta
>
,
egr
::
kSlotSmallVectorSize
>
&
metas
=
node
->
OutputMeta
();
for
(
size_t
i
=
0
;
i
<
metas
.
size
();
i
++
)
{
for
(
size_t
j
=
0
;
j
<
metas
[
i
].
size
();
j
++
)
{
const
egr
::
Edge
&
edge
=
metas
[
i
][
j
].
GetEdge
();
auto
next_node_shared
=
edge
.
GetMutableGradNode
();
if
(
!
next_node_shared
||
!
next_node_shared
.
get
())
{
continue
;
}
auto
*
next_node
=
next_node_shared
.
get
();
const
bool
was_inserted
=
visited
.
insert
(
next_node
).
second
;
if
(
was_inserted
)
{
queue
.
emplace
(
next_node
);
}
}
}
}
for
(
const
auto
&
it
:
gradnode_index_map_
)
{
if
(
visited
.
count
(
it
.
first
)
==
0
)
{
unused_vars_
.
push_back
(
it
.
second
);
VLOG
(
3
)
<<
"[Rank "
<<
process_group_
->
GetRank
()
<<
"]: "
<<
"Tensor "
<<
tensors_
[
it
.
second
].
name
()
<<
" at index "
<<
it
.
second
<<
" is marked as unused."
;
}
}
}
void
EagerReducer
::
PrepareForBackward
(
const
std
::
vector
<
Tensor
>
&
outputs
)
{
VLOG
(
3
)
<<
"after forward, then reset count for backward."
;
grad_need_hooks_
=
true
;
next_group_
=
0
;
std
::
for_each
(
groups_
.
begin
(),
groups_
.
end
(),
[](
EagerGroup
&
group
)
{
group
.
pending_
=
group
.
tensor_indices_
.
size
();
group
.
sparse_contents_
=
Tensor
();
});
// reinitialize vars_marked_ready_ for next iteration
vars_marked_ready_
.
clear
();
vars_marked_ready_
.
resize
(
tensors_
.
size
(),
false
);
PADDLE_ENFORCE_EQ
(
groups_need_finalize_
,
false
,
platform
::
errors
::
PreconditionNotMet
(
"A serious error has occurred here. Please "
"set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have "
"set, There may be several reasons for this error: "
"1) Please note that all forward outputs derived from the module "
"parameters must participate in the calculation of losses and "
"subsequent gradient calculations. If not, the wrapper will hang, "
"waiting for autograd to generate gradients for these parameters. "
"you can use detach or stop_gradient to make the unused parameters "
"detached from the autograd graph. "
"2) Used multiple forwards and one backward. You may be able to wrap "
"multiple forwards in a model."
));
// The first var to trigger the unused parameter
has_marked_unused_vars_
=
false
;
if
(
find_unused_vars_once_
||
find_unused_vars_each_step_
)
{
unused_vars_
.
clear
();
TraverseBackwardGraph
(
outputs
);
// only check once in first step
find_unused_vars_once_
=
false
;
}
if
(
find_unused_vars_each_step_
&&
unused_vars_
.
empty
())
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"All parameters are involved in the backward pass. "
"It is recommended to set find_unused_parameters to False "
"to improve performance. However, if unused parameters "
"appear in subsequent iterative training, then an error "
"will occur. Please make it clear that in the subsequent "
"training, there will be no parameters that are not used "
"in the backward pass, and then set find_unused_parameters"
;
}
if
(
unused_vars_
.
size
()
==
tensors_
.
size
())
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"There is no parameter in the device involved "
"in the backward calculation. If there are "
"parameters on other devices involved in the "
"backward, then a serious error will occur here."
;
}
}
void EagerReducer::AddDistHook(size_t var_index) {
  PADDLE_ENFORCE_LT(var_index,
                    variable_locators_.size(),
                    platform::errors::OutOfRange(
                        "Out of bounds variable index. It must be less "
                        "than %d, but it is %d",
                        variable_locators_.size(),
                        var_index));

  // gradient synchronization is not required when grad_need_hooks_ is false.
  if (!grad_need_hooks_) {
    return;
  }

  VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
          << "@Grad] arrived and triggered disthook";

  local_used_vars_[var_index] = 1;

  if (!has_marked_unused_vars_) {
    has_marked_unused_vars_ = true;
    for (const auto unused_index : unused_vars_) {
      MarkVarReady(unused_index, false);
    }
  }
  MarkVarReady(var_index, true);
}
void EagerReducer::MarkVarReady(const size_t var_index,
                                const bool is_used_var) {
  VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name()
          << "] is marked ready.";

  // error happened, if the var is ready before.
  if (vars_marked_ready_[var_index]) {
    auto error_info = string::Sprintf(
        "Error happened, when parameter[%d][%s] has been ready before. "
        "Please set find_unused_parameters=True to traverse backward graph "
        "in each step to prepare reduce in advance. If you have set, "
        "there may be several reasons for this error: "
        "1) In multiple reentrant backward phase, some parameters are reused. "
        "2) Using model parameters outside of forward function. Please "
        "make sure that model parameters are not shared in concurrent "
        "forward-backward passes.",
        var_index,
        tensors_[var_index].name());

    PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
                      false,
                      platform::errors::PreconditionNotMet(error_info));

    error_info +=
        "3) Unused parameters retrieval is incorrect. "
        "The return value of forward will be used to retrieve"
        " the unused parameters of the entire model. These "
        "gradients of unused parameters will not be synchronized "
        "between multiple cards. However, if the unused "
        "parameters participate in the backward calculation "
        "again at a later time (e.g. after the forward function, "
        "the loss calculation uses the unused "
        "parameters of the forward and triggers backward), "
        "its gradient will be wrong.";

    PADDLE_ENFORCE_EQ(has_marked_unused_vars_,
                      true,
                      platform::errors::PreconditionNotMet(error_info));
  } else {
    vars_marked_ready_[var_index] = true;
  }

  groups_need_finalize_ = true;

  const auto &var_locator = variable_locators_[var_index];
  const auto group_index = var_locator.group_index;
  const auto inside_group_index = var_locator.inside_group_index;

  auto &group = groups_[group_index];
  auto &group_tensor = group.dense_tensors_[inside_group_index];
  const auto length = group.length_[inside_group_index];

  if (!group.is_sparse_) {
    if (is_used_var) {
      auto *autograd_meta = tensors_[var_index].get_autograd_meta();
      auto &grad_tensor =
          static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
      group_tensor
          .ShareDataWith(*(
              std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
          .Resize({grad_tensor.numel()});
    } else {
      // TODO(shenliang03): maybe save the memory by avoiding tensor
      // construction
      if (!group_tensor.initialized()) {
        group_tensor.Resize({static_cast<int64_t>(length)});
        group_tensor.mutable_data(inner_place_, group.dtype_);
      }
      if (HasGrad(var_index)) {
        VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad";
        auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]);
        group_tensor
            .ShareDataWith(*(std::dynamic_pointer_cast<phi::DenseTensor>(
                grad_tensor->impl())))
            .Resize({length});
      } else {
        VLOG(3) << "Tensor[" << tensors_[var_index].name()
                << "] doesn't have grad";
        auto *dev_ctx =
            platform::DeviceContextPool::Instance().Get(inner_place_);
        group_tensor.Resize({static_cast<int64_t>(length)});
        phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
      }
    }
  } else {
    auto *autograd_meta = tensors_[var_index].get_autograd_meta();
    auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();

    // process sparse group
    PADDLE_ENFORCE_EQ(
        HasGrad(var_index),
        true,
        platform::errors::PreconditionNotMet(
            "The sparse parameter[%d][%s] should have gradient. "
            "Currently, DataParallel does not support sparse "
            "parameters without generating gradients during training. "
            "For example, if is_sparse=True is used in Embedding, "
            "the current step of this parameter cannot generate gradient "
            "because of stop_gradient/detach, where error will occur.",
            var_index,
            tensors_[var_index].name()));

    // need to check tensor type
    PADDLE_ENFORCE_EQ(
        grad_tensor.is_selected_rows(),
        true,
        platform::errors::PreconditionNotMet(
            "The sparse parameter[%d][%s] must have a selectedrows gradient. "
            "Before forward pass, the parameter type is inferred to be "
            "SelectedRows, but after backward pass, its actual type becomes "
            "LodTensor. It is currently not supported by DataParallel. "
            "For example, if sparse embedding is used, and the weight of "
            "embedding is shared with subsequent dense parameters, then "
            "the parameter gradient of the embedding will be converted "
            "to dense parameters.",
            var_index,
            tensors_[var_index].name()));

    group.sparse_contents_.set_impl(grad_tensor.impl());
  }

  if (--group.pending_ == 0) {
    // can start allreduce
    MarkGroupReady(group_index);
  }

  if (next_group_ == groups_.size()) {
    FinalizeBackward();
  }
}
void EagerReducer::MarkGroupReady(size_t group_index) {
  VLOG(3) << "Group[" << group_index << "] is ready";

  PADDLE_ENFORCE_GE(
      group_index,
      next_group_,
      platform::errors::PreconditionNotMet(
          "The index of the incoming group must be greater "
          "than or equal to the previously synchronized group index, "
          "expect it to be greater than or equal to %d, but got %d.",
          next_group_,
          group_index));

  if (group_index > next_group_) {
    VLOG(3) << "It will adjust the order of group in next batch automatically";
    return;
  }

  for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
       ++next_group_) {
    UNUSED auto &group = groups_[next_group_];
    if (group.is_sparse_) {
      AllReduceSparse(&group, next_group_);
    } else {
      FusedAllReduceSchedule(&group, next_group_);
    }
  }
}
bool EagerReducer::HasGrad(size_t var_index) {
  auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]);
  if (grad && grad->initialized()) {
    return true;
  } else {
    return false;
  }
}
void EagerReducer::ProcessUnusedDenseVars() {
  // The calculation stream must be used here to
  // avoid conflicts with communication.
  VLOG(3) << "Local used vars : "
          << string::join_strings(local_used_vars_, ',');

  const auto *dev_ctx =
      platform::DeviceContextPool::Instance().Get(inner_place_);
  auto *global_used_tensor =
      std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
          .get();
  framework::TensorFromVector<int32_t>(
      local_used_vars_, *dev_ctx, global_used_tensor);

  distributed::AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  std::vector<Tensor> reduce_tensors = {global_used_vars_};
  std::vector<phi::DenseTensor> in_out;
  for (auto &t : reduce_tensors) {
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  process_group_->AllReduce(in_out, in_out, opts)->Synchronize();

  framework::TensorToVector<int>(
      *global_used_tensor, *dev_ctx, &local_used_vars_);
  dev_ctx->Wait();

  // sync compute stream to get global used var message,
  // but maybe affect speed performance
  VLOG(3) << "Global used vars : "
          << string::join_strings(local_used_vars_, ',');

  for (const auto var_index : unused_vars_) {
    const bool global_unused = (local_used_vars_[var_index] == 0);

    // global used but local unused, set grad
    VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
            << "Var [" << var_index << "] [" << tensors_[var_index].name()
            << "] global_unused: " << global_unused
            << " has grad: " << HasGrad(var_index);

    if (!global_unused) {
      VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
              << process_group_->GetRank() << "]";
      const auto &var_locator = variable_locators_[var_index];
      const auto group_index = var_locator.group_index;
      const auto &group = groups_[group_index];
      const auto inside_group_index = var_locator.inside_group_index;
      auto &src_tensor = group.dense_tensors_[inside_group_index];

      // sparse no need to check and no support find_unused_parameters
      if (group.is_sparse_) {
        continue;
      }

      // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
      // gradient accumulation can continue normally after clear_gradients(),
      // especially in cases including complex control flow.
      std::static_pointer_cast<egr::GradNodeAccumulation>(
          GetGradNodeFromTensor(&tensors_[var_index]))
          ->SetFakeEmpty(false);

      Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));

      auto dest_var_base = tensors_[var_index];
      auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
      grad_tensor->copy_(grad_value, inner_place_, true);
      grad_tensor->reshape(dest_var_base.shape());
    }
  }
}
void EagerReducer::FinalizeBackward() {
  groups_need_finalize_ = false;
  grad_need_hooks_ = false;
  for (auto &group : groups_) {
    if (!group.is_sparse_) {
      group.task->Synchronize();
    }
  }

  for (auto &group : groups_) {
    if (!group.is_sparse_) {
      group.SplitTensors(inner_place_);
      group.dense_contents_.reset();
    }
  }

  if (find_unused_vars_each_step_) {
    ProcessUnusedDenseVars();
    local_used_vars_.clear();
    local_used_vars_.resize(tensors_.size(), 0);
    VLOG(3) << "ProcessUnusedDenseVars is finished.";
  }

  VLOG(3) << "In the batch, Reducer is finished.";
}
void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
                                          const int curr_group_index) {
  // The overall timeline: concat > div_nranks > allreduce > split
  distributed::AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;

  VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";

  // concat tensors
  group->ConcatTensors(inner_place_);

  // div nranks
  paddle::experimental::scale_(
      group->dense_contents_, 1.0 / nranks_, 0.0, false);

  // all_reduce
  std::vector<Tensor> reduce_tensors = {group->dense_contents_};
  std::vector<phi::DenseTensor> in_out;
  for (auto &t : reduce_tensors) {
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  group->task = process_group_->AllReduce(in_out, in_out, opts);

  // split in FinalizeBackward()
}
void EagerReducer::AllReduceSparse(EagerGroup *group,
                                   const int curr_group_index) {
  // div nranks
  Tensor sparse_tensor(group->sparse_contents_);
  paddle::experimental::scale_(sparse_tensor, 1.0 / nranks_, 0.0, false);

  VLOG(3) << "sparse_group [" << curr_group_index << "] start allreduce.";

  auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
  if (platform::is_gpu_place(inner_place_)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    dev_ctx = static_cast<phi::GPUContext *>(
        platform::DeviceContextPool::Instance().Get(inner_place_));
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with NCCL. "
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
  } else if (platform::is_custom_place(inner_place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    dev_ctx = static_cast<platform::CustomDeviceContext *>(
        platform::DeviceContextPool::Instance().Get(inner_place_));
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with "
        "CUSTOM_DEVICE. "
        "Please recompile or reinstall Paddle with CUSTOM_DEVICE support."));
#endif
  } else if (platform::is_cpu_place(inner_place_)) {
    dev_ctx = static_cast<phi::CPUContext *>(
        platform::DeviceContextPool::Instance().Get(inner_place_));
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Split grad tensor not supported on place (%s)", inner_place_));
  }

  auto src = std::dynamic_pointer_cast<phi::SelectedRows>(
      group->sparse_contents_.impl());
  const auto &src_rows = src->rows();

  const auto &rank_ = process_group_->GetRank();
  const auto &size_ = process_group_->GetSize();

  framework::Vector<int64_t> rows_num_vector(size_);
  rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());

  Tensor rows_num_tensor = paddle::experimental::empty(
      IntArray({static_cast<int64_t>(size_)}), DataType::INT64, inner_place_);
  auto *rows_num_dense_tensor =
      std::dynamic_pointer_cast<phi::DenseTensor>(rows_num_tensor.impl()).get();
  framework::TensorFromVector<int64_t>(
      rows_num_vector, *dev_ctx, rows_num_dense_tensor);

  distributed::AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  std::vector<Tensor> reduce_tensors = {rows_num_tensor};
  std::vector<phi::DenseTensor> in_out;
  for (auto &t : reduce_tensors) {
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  process_group_->AllReduce(in_out, in_out, opts)->Synchronize();

  framework::TensorToVector<int64_t>(
      *rows_num_dense_tensor, *dev_ctx, &rows_num_vector);
  dev_ctx->Wait();

  const auto *cpu_rows_num_ptr = rows_num_vector.data();
  auto rows_num = std::accumulate(
      cpu_rows_num_ptr, cpu_rows_num_ptr + size_, static_cast<int64_t>(0));

  VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',')
          << ", total rows number: " << rows_num
          << ", height: " << src->height();

  dev_ctx->Wait();

  Tensor src_value_tensor(std::make_shared<phi::DenseTensor>(src->value()));
  std::vector<int64_t> dst_shape = src_value_tensor.shape();

  if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) {
        return row == cpu_rows_num_ptr[0];
      })) {
    // During sparse communication, the number of each card is same.
    // allgather is used to speed up the allreduce by replacing broadcast.
    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";

    Tensor dst_rows_tensor =
        paddle::experimental::empty(IntArray({static_cast<int64_t>(rows_num)}),
                                    DataType::INT64,
                                    inner_place_);
    Tensor src_rows_tensor = paddle::experimental::empty(
        IntArray({static_cast<int64_t>((*src).rows().size())}),
        DataType::INT64,
        inner_place_);
    auto *src_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(src_rows_tensor.impl())
            .get();
    framework::TensorFromVector<int64_t>(
        (*src).rows(), *dev_ctx, src_rows_dense_tensor);

    std::vector<Tensor> src_rows_tensors = {src_rows_tensor};
    std::vector<Tensor> dst_rows_tensors = {dst_rows_tensor};
    std::vector<phi::DenseTensor> in;
    std::vector<phi::DenseTensor> out;
    for (auto &t : src_rows_tensors) {
      in.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    for (auto &t : dst_rows_tensors) {
      out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    process_group_->AllGather(in, out)->Synchronize();

    framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
    auto *dst_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
            .get();
    framework::TensorToVector<int64_t>(
        *dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
    dev_ctx->Wait();

    dst_shape[dst_shape.size() - 2] = rows_num;
    auto dst_dense_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
        paddle::experimental::full(
            IntArray(dst_shape), 0, src_value_tensor.dtype(), inner_place_)
            .impl());

    auto dst =
        std::make_shared<phi::SelectedRows>(dst_rows_vector, (*src).height());
    *(dst->mutable_value()) = *dst_dense_tensor;
    Tensor dst_value_tensor(std::make_shared<phi::DenseTensor>(dst->value()));

    std::vector<Tensor> src_value_tensors = {src_value_tensor};
    std::vector<Tensor> dst_value_tensors = {dst_value_tensor};
    std::vector<phi::DenseTensor> src_dense;
    std::vector<phi::DenseTensor> dst_dense;
    for (auto &t : src_value_tensors) {
      src_dense.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    for (auto &t : dst_value_tensors) {
      dst_dense.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    process_group_->AllGather(src_dense, dst_dense)->Synchronize();

    src->set_rows(dst_rows_vector);
    *(src->mutable_value()) =
        *(std::dynamic_pointer_cast<phi::DenseTensor>(dst_value_tensor.impl()));
  } else {
    std::vector<Tensor> rows_tensors;
    std::vector<Tensor> values_tensors;

    for (int i = 0; i < size_; ++i) {
      std::vector<int64_t> value_tensor_shape = {
          cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]};
      Tensor rows_tensor = paddle::experimental::full(
          IntArray({static_cast<int64_t>(cpu_rows_num_ptr[i])}),
          0,
          DataType::INT64,
          inner_place_);
      Tensor values_tensor = paddle::experimental::full(
          IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_);
      std::vector<phi::DenseTensor> rows_dense_vector;
      std::vector<phi::DenseTensor> values_dense_vector;

      if (i == rank_) {
        auto *rows_dense_tensor =
            std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl())
                .get();
        framework::TensorFromVector<int64_t>(
            src_rows, *dev_ctx, rows_dense_tensor);
        values_tensor.set_impl(
            std::make_shared<phi::DenseTensor>(src->value()));
      }
      rows_dense_vector.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl()));
      values_dense_vector.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(values_tensor.impl()));

      auto b_opts = BroadcastOptions();
      b_opts.source_rank = i;
      process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts);
      process_group_
          ->Broadcast(values_dense_vector, values_dense_vector, b_opts)
          ->Wait();
      rows_tensors.push_back(rows_tensor);
      values_tensors.push_back(values_tensor);
    }

    Tensor dst_rows_tensor =
        paddle::experimental::concat(rows_tensors, phi::Scalar(0));
    framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
    auto *dst_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
            .get();
    framework::TensorToVector<int64_t>(
        *dst_rows_dense_tensor, *dev_ctx, &dst_rows_vector);
    src->set_rows(dst_rows_vector);

    Tensor dst_values_tensor =
        paddle::experimental::concat(values_tensors, phi::Scalar(0));
    *(src->mutable_value()) = *(
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_values_tensor.impl()));
  }
}
std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
  const auto &tensors_ = group.tensor_indices_;
  out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size()
      << "\n";
  auto begin = tensors_.begin();
  auto end = tensors_.end();
  out << "[";
  for (int i = 0; begin != end && i < 100; ++i, ++begin) {
    if (i > 0) out << ' ';
    out << *begin;
  }
  if (begin != end) {
    out << " ...";
  }
  out << "]\n";
  return out;
}

}  // namespace distributed
}  // namespace paddle
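The dense path above scales each local gradient by 1.0 / nranks_ before issuing a SUM allreduce, which is numerically the same as averaging the gradient across ranks. The standalone toy program below (an illustrative sketch, not Paddle code) spells out that equivalence with plain doubles standing in for tensors.

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Pretend these are the same gradient element as computed on 4 ranks.
  const std::vector<double> local_grads = {0.4, 0.8, 1.2, 1.6};
  const double nranks = static_cast<double>(local_grads.size());

  // Each rank scales its local gradient by 1/nranks before the SUM allreduce.
  double summed = 0.0;
  for (double g : local_grads) {
    summed += g / nranks;  // scale by 1/nranks, then SUM across ranks
  }

  // The plain average over ranks, for comparison: both print 1.
  const double mean =
      std::accumulate(local_grads.begin(), local_grads.end(), 0.0) / nranks;

  std::cout << "scaled-sum = " << summed << ", mean = " << mean << std::endl;
  return 0;
}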
paddle/fluid/distributed/collective/reducer.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/string/string_helper.h"
namespace paddle {
namespace distributed {

using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
using IntArray =
    paddle::experimental::IntArrayBase<paddle::experimental::Tensor>;
using Backend = paddle::experimental::Backend;

std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
    const std::vector<Tensor>,
    const std::vector<bool> &is_sparse_gradient,
    const std::vector<size_t> &group_size_limits,
    const std::vector<int64_t> &tensor_indices = {});

class EagerGroup {
 public:
  Tensor dense_contents_;
  Tensor sparse_contents_;
  bool is_sparse_ = false;

  // for concat kernel
  std::vector<phi::DenseTensor> dense_tensors_;
  std::vector<int64_t> length_;
  int64_t all_length_{0};
  std::vector<IntArray> origin_shapes_;

  // Global indices of participating tensors in the group
  std::vector<size_t> tensor_indices_;

  // Number of params that haven't been ready. When it is 0, it means
  // the group is ready.
  size_t pending_ = -1;

  // external message of group
  phi::DataType dtype_;

  // help to sync
  std::shared_ptr<ProcessGroup::Task> task;

  // context is used to select the stream for concat
  void ConcatTensors(const platform::Place &);

  // context is used to select the stream for split
  void SplitTensors(const platform::Place &);

  friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};

struct TensorLocator {
  // record the index in groups_
  size_t group_index;
  size_t inside_group_index;
};

class EagerReducer {
 public:
  explicit EagerReducer(
      const std::vector<Tensor> tensors,
      const std::vector<std::vector<size_t>> &group_indices,
      const std::vector<bool> &is_sparse_gradient,
      std::shared_ptr<distributed::ProcessGroup> process_group,
      const std::vector<size_t> &group_size_limits,
      bool find_unused_parameters);

  virtual ~EagerReducer() {}

  std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);

  void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
  void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
                             EagerGroup *p_group);
  void PrepareForBackward(const std::vector<Tensor> &outputs);
  void AddDistHook(size_t var_index);
  void MarkVarReady(const size_t var_index, const bool is_used_var);
  void MarkGroupReady(const size_t group_index);
  void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
  void AllReduceSparse(EagerGroup *group, const int curr_group_index);
  void FinalizeBackward();
  void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
  void ProcessUnusedDenseVars();
  bool HasGrad(size_t var_index);

 private:
  std::vector<Tensor> tensors_;
  std::vector<std::vector<size_t>> group_indices_;
  std::vector<bool> is_sparse_gradient_;
  std::shared_ptr<distributed::ProcessGroup> process_group_;
  std::vector<size_t> group_size_limits_;

  std::vector<EagerGroup> groups_;
  std::vector<TensorLocator> variable_locators_;
  platform::Place inner_place_;
  size_t next_group_ = 0;
  int64_t nranks_ = -1;

  bool grad_need_hooks_{false};

  std::vector<bool> vars_marked_ready_;
  std::vector<int32_t> local_used_vars_;

  // Following variables are to help unused vars
  std::vector<size_t> unused_vars_;
  std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
  bool has_marked_unused_vars_{false};
  bool find_unused_vars_each_step_{false};
  bool find_unused_vars_once_{true};
  bool groups_need_finalize_{false};
  Tensor global_used_vars_;
};

}  // namespace distributed
}  // namespace paddle
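For orientation, the declared entry points are usually wired together in this order: bucket the parameters with Eager_AssignGroupBySize, construct the reducer, call PrepareForBackward before each backward pass, let the gradient hooks call AddDistHook, and let FinalizeBackward run once the last group is ready. The fragment below is a minimal sketch of that wiring, assuming parameter tensors and an initialized ProcessGroup already exist (the bucket sizes are illustrative values, not Paddle defaults); in practice this is driven from the Python DataParallel wrapper rather than hand-written C++.

#include "paddle/fluid/distributed/collective/reducer.h"

namespace paddle {
namespace distributed {

std::shared_ptr<EagerReducer> BuildReducerSketch(
    const std::vector<Tensor> &params, std::shared_ptr<ProcessGroup> pg) {
  // No sparse gradients in this sketch.
  std::vector<bool> is_sparse_gradient(params.size(), false);
  // Illustrative bucket caps: first bucket 25 MB, later buckets 128 MB.
  std::vector<size_t> group_size_limits = {25 * 1024 * 1024,
                                           128 * 1024 * 1024};

  // Bucket the tensors, then hand the buckets to the reducer.
  auto group_indices =
      Eager_AssignGroupBySize(params, is_sparse_gradient, group_size_limits);
  return std::make_shared<EagerReducer>(params,
                                        group_indices,
                                        is_sparse_gradient,
                                        pg,
                                        group_size_limits,
                                        /*find_unused_parameters=*/false);
  // Per step: reducer->PrepareForBackward(outputs); run backward so the
  // gradient hooks fire AddDistHook(); FinalizeBackward() runs when the
  // last group becomes ready.
}

}  // namespace distributed
}  // namespace paddle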
paddle/fluid/distributed/common/CMakeLists.txt
0 → 100644
View file @
f0ef3442
cc_library(
  afs_wrapper
  SRCS afs_warpper.cc
  DEPS fs ps_framework_proto)

#set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
paddle/fluid/distributed/common/afs_warpper.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/common/afs_warpper.h"
#include "paddle/fluid/framework/io/fs.h"
namespace paddle {
namespace distributed {

// AfsClient impl
int AfsClient::initialize(const FsClientParameter &fs_client_param) {
  // temporarily implemented with hdfs-client
  return initialize(fs_client_param.hadoop_bin(),
                    fs_client_param.uri(),
                    fs_client_param.user(),
                    fs_client_param.passwd(),
                    fs_client_param.buffer_size());
}

int AfsClient::initialize(const std::string &hadoop_bin,
                          const std::string &uri,
                          const std::string &user,
                          const std::string &passwd,
                          int buffer_size_param) {
  return initialize(
      hadoop_bin,
      uri,
      paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()),
      buffer_size_param);
}

int AfsClient::initialize(const std::string &hadoop_bin,
                          const std::string &uri,
                          const std::string &ugi,
                          int buffer_size_param) {
  // temporarily implemented with hdfs-client
  size_t buffer_size = 1L << 25;  // 32MB
  if (buffer_size_param > static_cast<int>(buffer_size)) {
    buffer_size = buffer_size_param;
  }
  paddle::framework::hdfs_set_buffer_size(buffer_size);
  paddle::framework::hdfs_set_command(paddle::string::format_string(
      "2>>./hdfs_err.log %s fs -Dfs.default.name=%s -Dhadoop.job.ugi=%s "
      "-Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=300000",
      hadoop_bin.c_str(),
      uri.c_str(),
      ugi.c_str()));
  return 0;
}

// open file in 'w' or 'r'
std::shared_ptr<FsReadChannel> AfsClient::open_r(const FsChannelConfig &config,
                                                 uint32_t buffer_size,
                                                 int *err_no) {
  std::shared_ptr<FsReadChannel> channel =
      std::make_shared<FsReadChannel>(buffer_size);
  std::shared_ptr<FILE> fp =
      paddle::framework::fs_open_read(config.path, err_no, config.deconverter);
  channel->open(fp, config);
  return channel;
}

std::shared_ptr<FsWriteChannel> AfsClient::open_w(const FsChannelConfig &config,
                                                  uint32_t buffer_size,
                                                  int *err_no) {
  std::shared_ptr<FsWriteChannel> channel =
      std::make_shared<FsWriteChannel>(buffer_size);
  std::shared_ptr<FILE> fp =
      paddle::framework::fs_open_write(config.path, err_no, config.converter);
  channel->open(fp, config);
  return channel;
}

// remove file in path, path maybe a reg, such as 'part-000-*'
void AfsClient::remove(const std::string &path) {
  return paddle::framework::fs_remove(path);
}

void AfsClient::remove_dir(const std::string &dir) {
  return paddle::framework::fs_remove(dir);
}

// list files in path, path maybe a dir with reg
std::vector<std::string> AfsClient::list(const std::string &path) {
  return paddle::framework::fs_list(path);
}

// exist or not
bool AfsClient::exist(const std::string &dir) {
  return paddle::framework::fs_exists(dir);
}

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/common/afs_warpper.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/the_one_ps.pb.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {

struct FsDataConverter {
  std::string converter;
  std::string deconverter;
};

struct FsChannelConfig {
  std::string path;       // path of file
  std::string converter;  // data converter
  std::string deconverter;
};

class FsReadChannel {
 public:
  FsReadChannel() : _buffer_size(0) {}
  explicit FsReadChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
  virtual ~FsReadChannel() {}
  FsReadChannel(FsReadChannel &&) = delete;
  FsReadChannel(const FsReadChannel &) = delete;

  int open(std::shared_ptr<FILE> fp, const FsChannelConfig &config) {
    _file = fp;
    return 0;
  }
  inline int close() {
    _file.reset();
    return 0;
  }

  inline uint32_t read_line(std::string &line_data) {  // NOLINT
    line_data.clear();
    char buffer = '\0';
    size_t read_count = 0;
    while (1 == fread(&buffer, 1, 1, _file.get()) && buffer != '\n') {
      ++read_count;
      line_data.append(&buffer, 1);
    }
    if (read_count == 0 && buffer != '\n') {
      return -1;
    }
    return 0;
  }

 private:
  uint32_t _buffer_size;
  FsChannelConfig _config;
  std::shared_ptr<FILE> _file;
};

class FsWriteChannel {
 public:
  FsWriteChannel() : _buffer_size(0) {}
  explicit FsWriteChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
  virtual ~FsWriteChannel() {}
  FsWriteChannel(FsWriteChannel &&) = delete;
  FsWriteChannel(const FsWriteChannel &) = delete;

  int open(std::shared_ptr<FILE> fp, const FsChannelConfig &config) {
    _file = fp;

    // the buffer has set in fs.cc
    // if (_buffer_size != 0) {
    //   _buffer = std::shared_ptr<char>(new char[_buffer_size]);
    //   CHECK(0 == setvbuf(&*_file, _buffer.get(), _IOFBF, _buffer_size));
    // }
    return 0;
  }

  inline void flush() { return; }

  inline int close() {
    flush();
    _file.reset();
    return 0;
  }

  inline uint32_t write_line(const char *data, uint32_t size) {
    size_t write_count = fwrite_unlocked(data, 1, size, _file.get());
    if (write_count != size) {
      return -1;
    }
    write_count = fwrite_unlocked("\n", 1, 1, _file.get());
    if (write_count != 1) {
      return -1;
    }
    return 0;
  }

  inline uint32_t write_line(const std::string &data) {
    return write_line(data.c_str(), data.size());
  }

 private:
  uint32_t _buffer_size;
  FsChannelConfig _config;
  std::shared_ptr<FILE> _file;
  std::shared_ptr<char> _buffer;
};

class AfsClient {
 public:
  AfsClient() {}
  virtual ~AfsClient() {}
  AfsClient(AfsClient &&) = delete;
  AfsClient(const AfsClient &) = delete;

  int initialize(const FsClientParameter &fs_client_param);
  int initialize(const std::string &hadoop_bin,
                 const std::string &uri,
                 const std::string &user,
                 const std::string &passwd,
                 int buffer_size_param = (1L << 25));
  int initialize(const std::string &hadoop_bin,
                 const std::string &uri,
                 const std::string &ugi,
                 int buffer_size_param = (1L << 25));

  // open file in 'w' or 'r'
  std::shared_ptr<FsReadChannel> open_r(const FsChannelConfig &config,
                                        uint32_t buffer_size = 0,
                                        int *err_no = nullptr);
  std::shared_ptr<FsWriteChannel> open_w(const FsChannelConfig &config,
                                         uint32_t buffer_size = 0,
                                         int *err_no = nullptr);

  // remove file in path, path maybe a reg, such as 'part-000-*'
  void remove(const std::string &path);
  void remove_dir(const std::string &dir);

  // list files in path, path maybe a dir with reg
  std::vector<std::string> list(const std::string &path);

  // exist or not
  bool exist(const std::string &dir);
};

}  // namespace distributed
}  // namespace paddle
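A typical use of AfsClient is to initialize it once with the cluster settings and then stream a file line by line through a read channel. The sketch below assumes placeholder values for the hadoop binary path, URI, ugi string, and file path; it only exercises the declared API (initialize, open_r, read_line, close).

#include <iostream>
#include <string>

#include "paddle/fluid/distributed/common/afs_warpper.h"

int main() {
  paddle::distributed::AfsClient client;
  client.initialize("/usr/local/hadoop/bin/hadoop",  // hadoop_bin (placeholder)
                    "hdfs://example-cluster:9000",   // uri (placeholder)
                    "user,passwd",                   // ugi (placeholder)
                    1 << 20);                        // 1 MB buffer

  paddle::distributed::FsChannelConfig config;
  config.path = "/tmp/part-000";  // placeholder path
  config.deconverter = "";        // no decompression

  int err_no = 0;
  auto channel = client.open_r(config, 0, &err_no);

  // read_line returns 0 on success and a non-zero value at end of file.
  std::string line;
  while (channel->read_line(line) == 0) {
    std::cout << line << '\n';
  }
  channel->close();
  return 0;
}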
paddle/fluid/distributed/common/chunk_allocator.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
namespace paddle {
namespace distributed {

// Fast allocation and deallocation of objects by allocating them in chunks.
template <class T>
class ChunkAllocator {
 public:
  explicit ChunkAllocator(size_t chunk_size = 64) {
    CHECK(sizeof(Node) == std::max(sizeof(void *), sizeof(T)));
    _chunk_size = chunk_size;
    _chunks = NULL;
    _free_nodes = NULL;
    _counter = 0;
  }
  ChunkAllocator(const ChunkAllocator &) = delete;
  ~ChunkAllocator() {
    while (_chunks != NULL) {
      Chunk *x = _chunks;
      _chunks = _chunks->next;
      free(x);
    }
  }
  template <class... ARGS>
  T *acquire(ARGS &&...args) {
    if (_free_nodes == NULL) {
      create_new_chunk();
    }

    T *x = (T *)(void *)_free_nodes;  // NOLINT
    _free_nodes = _free_nodes->next;
    new (x) T(std::forward<ARGS>(args)...);
    _counter++;
    return x;
  }

  void release(T *x) {
    x->~T();
    Node *node = (Node *)(void *)x;  // NOLINT
    node->next = _free_nodes;
    _free_nodes = node;
    _counter--;
  }

  size_t size() const { return _counter; }

 private:
  struct alignas(T) Node {
    union {
      Node *next;
      char data[sizeof(T)];
    };
  };
  struct Chunk {
    Chunk *next;
    Node nodes[];
  };

  size_t _chunk_size;  // how many elements in one chunk
  Chunk *_chunks;      // a list
  Node *_free_nodes;   // a list
  size_t _counter;     // how many elements are acquired

  void create_new_chunk() {
    Chunk *chunk;
    posix_memalign(reinterpret_cast<void **>(&chunk),
                   std::max<size_t>(sizeof(void *), alignof(Chunk)),
                   sizeof(Chunk) + sizeof(Node) * _chunk_size);
    chunk->next = _chunks;
    _chunks = chunk;

    for (size_t i = 0; i < _chunk_size; i++) {
      Node *node = &chunk->nodes[i];
      node->next = _free_nodes;
      _free_nodes = node;
    }
  }
};

}  // namespace distributed
}  // namespace paddle
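The allocator hands out placement-constructed objects from a free list and only returns memory to the system when it is destroyed. A minimal usage sketch (the Slot type is just an example for illustration):

#include <iostream>

#include "paddle/fluid/distributed/common/chunk_allocator.h"

struct Slot {
  int id;
  double weight;
  Slot(int i, double w) : id(i), weight(w) {}
};

int main() {
  // Each chunk holds 64 Slot-sized nodes; memory is only freed at destruction.
  paddle::distributed::ChunkAllocator<Slot> allocator(64);

  Slot *a = allocator.acquire(1, 0.5);  // placement-new with forwarded args
  Slot *b = allocator.acquire(2, 1.25);
  std::cout << "live objects: " << allocator.size() << std::endl;  // prints 2

  allocator.release(a);  // runs ~Slot() and puts the node back on the free list
  allocator.release(b);
  std::cout << "live objects: " << allocator.size() << std::endl;  // prints 0
  return 0;
}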
paddle/fluid/distributed/common/cost_timer.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_map>
#include "butil/time.h"
#include "bvar/latency_recorder.h"
#include "glog/logging.h"
namespace paddle {
namespace distributed {

struct CostProfilerNode {
  std::shared_ptr<bvar::LatencyRecorder> recorder;
};

class CostProfiler {
 public:
  ~CostProfiler() {}
  static CostProfiler &instance() {
    static CostProfiler profiler;
    return profiler;
  }

  void register_profiler(const std::string &label) {
    if (_cost_profiler_map.find(label) != _cost_profiler_map.end()) {
      return;
    }
    auto profiler_node = std::make_shared<CostProfilerNode>();
    profiler_node->recorder.reset(
        new bvar::LatencyRecorder("cost_profiler", label));
    _cost_profiler_map[label] = profiler_node;
  }

  CostProfilerNode *profiler(const std::string &label) {
    auto itr = _cost_profiler_map.find(label);
    if (itr != _cost_profiler_map.end()) {
      return itr->second.get();
    }
    return NULL;
  }

 private:
  CostProfiler() {}
  std::unordered_map<std::string, std::shared_ptr<CostProfilerNode>>
      _cost_profiler_map;
};

class CostTimer {
 public:
  explicit CostTimer(const std::string &label) {
    _label = label;
    auto &profiler = CostProfiler::instance();
    _profiler_node = profiler.profiler(label);
    // if the label is not registered in the profiler, report the cost via log
    _is_print_cost = _profiler_node == NULL;
    _start_time_ms = butil::gettimeofday_ms();
  }
  explicit CostTimer(CostProfilerNode &profiler_node) {  // NOLINT
    _is_print_cost = false;
    _profiler_node = &profiler_node;
    _start_time_ms = butil::gettimeofday_ms();
  }
  ~CostTimer() {
    if (_is_print_cost) {
      VLOG(3) << "CostTimer label:" << _label
              << ", cost:" << butil::gettimeofday_ms() - _start_time_ms << "ms";
    } else {
      *(_profiler_node->recorder) << butil::gettimeofday_ms() - _start_time_ms;
    }
  }

 private:
  std::string _label;
  bool _is_print_cost;
  uint64_t _start_time_ms;
  CostProfilerNode *_profiler_node;
};

}  // namespace distributed
}  // namespace paddle
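CostTimer measures the lifetime of a scope: the constructor takes a timestamp and the destructor records the elapsed milliseconds, either into the registered bvar recorder or via VLOG(3) when the label is unknown. The sketch below shows that pattern; the label "sparse_pull" and the sleep are purely illustrative.

#include <chrono>
#include <thread>

#include "paddle/fluid/distributed/common/cost_timer.h"

void TimedSection() {
  // Register once so later timers feed the bvar LatencyRecorder
  // instead of only printing through VLOG(3).
  paddle::distributed::CostProfiler::instance().register_profiler(
      "sparse_pull");

  {
    paddle::distributed::CostTimer timer("sparse_pull");
    // ... the work being measured ...
    std::this_thread::sleep_for(std::chrono::milliseconds(5));
  }  // destructor records gettimeofday_ms() - start into the recorder
}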