Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Paddle
Commits
d2d32668
Commit
d2d32668
authored
Apr 26, 2023
by
yuguo960516yuguo
Browse files
2.3.0-dtk-22.04.2
parent
ad08b8ce
Pipeline
#226
failed with stages
in 0 seconds
Changes
268
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3849 additions
and
0 deletions
+3849
-0
paddle/fluid/distributed/collective/ProcessGroupHeter.h
paddle/fluid/distributed/collective/ProcessGroupHeter.h
+140
-0
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+867
-0
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+214
-0
paddle/fluid/distributed/collective/Types.h
paddle/fluid/distributed/collective/Types.h
+53
-0
paddle/fluid/distributed/collective/reducer.cc
paddle/fluid/distributed/collective/reducer.cc
+1091
-0
paddle/fluid/distributed/collective/reducer.h
paddle/fluid/distributed/collective/reducer.h
+145
-0
paddle/fluid/distributed/common/CMakeLists.txt
paddle/fluid/distributed/common/CMakeLists.txt
+6
-0
paddle/fluid/distributed/common/afs_warpper.cc
paddle/fluid/distributed/common/afs_warpper.cc
+100
-0
paddle/fluid/distributed/common/afs_warpper.h
paddle/fluid/distributed/common/afs_warpper.h
+161
-0
paddle/fluid/distributed/common/chunk_allocator.h
paddle/fluid/distributed/common/chunk_allocator.h
+95
-0
paddle/fluid/distributed/common/cost_timer.h
paddle/fluid/distributed/common/cost_timer.h
+93
-0
paddle/fluid/distributed/common/local_random.h
paddle/fluid/distributed/common/local_random.h
+66
-0
paddle/fluid/distributed/common/registerer.h
paddle/fluid/distributed/common/registerer.h
+128
-0
paddle/fluid/distributed/common/topk_calculator.h
paddle/fluid/distributed/common/topk_calculator.h
+73
-0
paddle/fluid/distributed/common/utils.h
paddle/fluid/distributed/common/utils.h
+108
-0
paddle/fluid/distributed/dataset_utils/README.md
paddle/fluid/distributed/dataset_utils/README.md
+6
-0
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+87
-0
paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
...fluid/distributed/fleet_executor/amplifier_interceptor.cc
+60
-0
paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
.../fluid/distributed/fleet_executor/amplifier_interceptor.h
+43
-0
paddle/fluid/distributed/fleet_executor/carrier.cc
paddle/fluid/distributed/fleet_executor/carrier.cc
+313
-0
No files found.
Too many changes to show.
To preserve performance only
268 of 268+
files are displayed.
Plain diff
Email patch
paddle/fluid/distributed/collective/ProcessGroupHeter.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL))
#include "paddle/fluid/distributed/ps/service/heter_client.h"
#endif
#include "paddle/fluid/distributed/collective/Common.h"
// Backend identifier string returned by ProcessGroupHeter::GetBackendName().
constexpr char const* HETER_BACKEND_NAME = "HETER_BACKEND";
namespace
paddle
{
namespace
distributed
{
// Shorthand for paddle::platform::Place inside paddle::distributed.
using Place = paddle::platform::Place;
// Process group that reports HETER_BACKEND_NAME as its backend. It declares
// AllReduce, Broadcast, Send and Recv overrides of ProcessGroup; the
// definitions live outside this header.
class ProcessGroupHeter : public ProcessGroup {
 public:
  // Task type produced by ProcessGroupHeter::CreateTask. SynchronizeStreams()
  // and Synchronize() are intentionally empty here.
  class HeterTask : public ProcessGroup::Task,
                    public std::enable_shared_from_this<HeterTask> {
   public:
    HeterTask(int rank,
              CommType comm_type,
              const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();

    void SynchronizeStreams() {}

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize() {}

    virtual ~HeterTask();
  };

  ProcessGroupHeter(const std::shared_ptr<Store>& store,
                    int rank,
                    int size,
                    const platform::Place& place,
                    int gid,
                    int local_rank,
                    int local_size,
                    int gloo_rank,
                    int gloo_size,
                    bool with_switch,
                    std::string switch_endpoints,
                    int src_rank,
                    int dst_rank);

  // Returns a copy of HETER_BACKEND_NAME.
  const std::string GetBackendName() const override {
    return HETER_BACKEND_NAME;
  }

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& opts = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& opts = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& in_tensors, int peer) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& out_tensors, int peer) override;

 protected:
  // Factory hook for the task object; virtual so subclasses can replace it.
  virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
      int rank,
      CommType comm_type,
      const std::vector<phi::DenseTensor>& inputs);

 private:
  std::shared_ptr<Store> store_;
  // NOTE(review): presumably the device-side group used inside one cluster and
  // the Gloo group used across clusters -- confirm against the .cc file.
  std::shared_ptr<ProcessGroup> inner_pg_;
  std::shared_ptr<ProcessGroupGloo> inter_pg_;

  // Values received through the constructor parameters of the same names
  // (assumed; the constructor body is not in this header).
  int local_rank_;
  int local_size_;
  int gloo_rank_;
  int gloo_size_;
  bool with_switch_;
  std::string switch_endpoint_;
  int src_rank_;
  int dst_rank_;

  // Class-wide counters; defined and updated outside this header.
  static int send_count;
  static int recv_count;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
// Boolean flags defined in another translation unit; this file reads them as
// FLAGS_nccl_blocking_wait and FLAGS_use_stream_safe_cuda_allocator.
DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);

// Sleep interval, in milliseconds, between two completion polls in
// NCCLTask::Wait when FLAGS_nccl_blocking_wait is set.
constexpr int64_t kWaitBlockTImeout = 10;
namespace
paddle
{
namespace
distributed
{
void
SyncDefaultStream
(
const
std
::
vector
<
Place
>&
places
,
std
::
vector
<
EventManager
>&
ncclEvents
,
// NOLINT
std
::
vector
<
std
::
unique_ptr
<
CUDADeviceContext
>>&
dev_ctx
)
{
// NOLINT
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
auto
*
default_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
places
[
i
]));
ncclEvents
[
i
].
Record
(
*
default_ctx
);
ncclEvents
[
i
].
Block
(
*
dev_ctx
[
i
]);
}
}
// Builds the NCCLTask that tracks one communication call issued on `places`.
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
    std::vector<Place> places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs) {
  auto task = std::make_shared<ProcessGroupNCCL::NCCLTask>(
      places, rank, comm_type, inputs);
  return task;
}
// Forwards rank, inputs and the communication type to Task, remembers the
// places, and sizes control_events_ and ncclComms_ to one slot per place.
ProcessGroupNCCL::NCCLTask::NCCLTask(
    const std::vector<Place>& places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs)
    : Task(rank, inputs, comm_type), places_(places) {
  const auto num_places = places.size();
  control_events_.resize(num_places);
  ncclComms_.resize(num_places);
}
// Nothing to release explicitly; the body is empty.
ProcessGroupNCCL::NCCLTask::~NCCLTask() {}
// Stores a copy of `outputs` (the vector itself is copied into a new
// shared_ptr-owned vector) in outputs_.
void ProcessGroupNCCL::NCCLTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
  using TensorList = std::vector<phi::DenseTensor>;
  outputs_ = std::make_shared<TensorList>(outputs);
}
void
ProcessGroupNCCL
::
NCCLTask
::
SynchronizeStreams
()
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
*
default_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
places_
[
i
]));
default_ctx
->
WaitEvent
(
control_events_
[
i
].
GetRawCudaEvent
());
}
}
// Returns true only if Query() succeeds on the control event of every place.
// Events are queried in order and the scan stops at the first one whose
// Query() returns false.
bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
  size_t done = 0;
  while (done < places_.size() && control_events_[done].Query()) {
    ++done;
  }
  return done == places_.size();
}
// Validates (or fills in) the per-rank split sizes along dim 0.
//
// - If `split_sizes` is non-empty it must have exactly size_ entries whose sum
//   equals tensor_shape[0].
// - If it is empty, tensor_shape[0] must be divisible by size_, and
//   `split_sizes` is filled with size_ copies of tensor_shape[0] / size_.
//
// Note that `split_sizes` is modified in place in the empty case.
void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>& split_sizes,
                                       std::vector<int64_t> tensor_shape) {
  int64_t len_size = split_sizes.size();

  if (len_size != 0) {
    PADDLE_ENFORCE_EQ(
        len_size == size_,
        true,
        platform::errors::InvalidArgument(
            "The length of split_sizes must be equal to group size."));
    auto sum_size = std::accumulate(
        split_sizes.begin(), split_sizes.end(), static_cast<int64_t>(0));
    PADDLE_ENFORCE_EQ(
        sum_size == tensor_shape[0],
        true,
        platform::errors::InvalidArgument(
            "The sum of split_sizes must be equal to tensor's dim[0]."));
    return;
  }

  // No explicit sizes were given: split dim 0 evenly across the group.
  PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0,
                    true,
                    platform::errors::InvalidArgument(
                        "Tensor's dim[0] must be divisible by group size "
                        "when split_sizes not given."));
  split_sizes.insert(split_sizes.end(),
                     size_,
                     static_cast<int64_t>(tensor_shape[0] / size_));
}
// Waits for this task.
//
// The default device contexts are first made to wait on the task's control
// events. If FLAGS_nccl_blocking_wait is set, the host then polls
// IsCompleted(), sleeping kWaitBlockTImeout milliseconds between polls. If the
// task carries barrier tensors, every place is additionally synchronized with
// a device-wide synchronize call. Always returns true.
//
// TODO(shenliang03): Add timeout for wait, now timeout unused
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
  SynchronizeStreams();

  if (FLAGS_nccl_blocking_wait) {
    // NOTE(shenliang03): It will block host for sync
    const auto poll_interval = std::chrono::milliseconds(kWaitBlockTImeout);
    while (!IsCompleted()) {
      std::this_thread::sleep_for(poll_interval);
    }
  }

  if (barrierTensors_.empty()) {
    return true;
  }

  // If we use the work to do barrier, we should block cpu
  for (auto& place : places_) {
    platform::CUDADeviceGuard gpuGuard(place);
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
    PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
  }
  return true;
}
// Same as Wait: forwards to Wait(kWaitTimeout) and discards its result.
void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
// Constructs the group on top of ProcessGroup(rank, size, place, gid), keeps
// `store` in store_, and calls platform::SetDeviceId with the device index of
// place_.
ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                                   int rank,
                                   int size,
                                   const platform::Place& place,
                                   int gid)
    : ProcessGroup(rank, size, place, gid), store_(store) {
  platform::SetDeviceId(place_.device);
}
void
ProcessGroupNCCL
::
BroadcastUniqueNCCLID
(
std
::
vector
<
ncclUniqueId
>&
nccl_ids
)
{
// NOLINT
if
(
rank_
==
0
)
{
for
(
size_t
i
=
0
;
i
<
nccl_ids
.
size
();
i
++
)
{
auto
key
=
"ProcessGroupNCCL/nccl_ids/"
+
std
::
to_string
(
gid_
)
+
"/"
+
std
::
to_string
(
i
);
auto
nccl_id
=
std
::
vector
<
uint8_t
>
(
reinterpret_cast
<
uint8_t
*>
(
&
nccl_ids
[
i
]),
reinterpret_cast
<
uint8_t
*>
(
&
nccl_ids
[
i
])
+
NCCL_UNIQUE_ID_BYTES
);
store_
->
set
(
key
,
nccl_id
);
}
}
else
{
for
(
size_t
i
=
0
;
i
<
nccl_ids
.
size
();
i
++
)
{
auto
key
=
"ProcessGroupNCCL/nccl_ids/"
+
std
::
to_string
(
gid_
)
+
"/"
+
std
::
to_string
(
i
);
auto
ret
=
store_
->
get
(
key
);
std
::
memcpy
(
&
nccl_ids
[
i
],
ret
.
data
(),
ret
.
size
());
}
}
}
// create NCCLManager cache for places_key
void
ProcessGroupNCCL
::
CreateNCCLManagerCache
(
const
std
::
string
&
places_key
,
const
std
::
vector
<
Place
>&
places
)
{
PADDLE_ENFORCE_EQ
(
places_key
.
empty
(),
false
,
platform
::
errors
::
PreconditionNotMet
(
"Not able to create/get the NCCL Communicator since "
"the GPU place are not known"
));
std
::
vector
<
std
::
shared_ptr
<
NCCLCommManager
>>
nccl_comms
;
nccl_comms
.
resize
(
places
.
size
());
// using vector just for broadcast
std
::
vector
<
ncclUniqueId
>
nccl_ids
;
nccl_ids
.
resize
(
1
);
auto
&
nccl_id
=
nccl_ids
.
front
();
for
(
auto
&
place
:
places
)
{
used_place_ids_
.
insert
(
place
.
GetDeviceId
());
}
if
(
rank_
==
0
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclGetUniqueId
(
&
nccl_id
));
}
BroadcastUniqueNCCLID
(
nccl_ids
);
VLOG
(
3
)
<<
"init nccl rank: "
<<
rank_
<<
", nranks: "
<<
size_
<<
", place: "
<<
places_key
<<
", nccl uniqueid: "
<<
SerializeNCCLUniqueId
(
nccl_id
);
std
::
vector
<
std
::
unique_ptr
<
CUDADeviceContext
>>
dev_ctx
;
dev_ctx
.
resize
(
places
.
size
());
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclGroupStart
());
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
platform
::
CUDADeviceGuard
guard
(
places
[
i
]);
nccl_comms
[
i
]
=
NCCLCommManager
::
Create
(
GetSize
(),
GetRank
(),
nccl_id
);
dev_ctx
[
i
].
reset
(
new
CUDADeviceContext
(
places
[
i
]));
}
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclGroupEnd
());
std
::
vector
<
EventManager
>
events
;
events
.
resize
(
places
.
size
());
// These caches will be useful to process sync/wait/communicate
places_to_events_
.
emplace
(
places_key
,
std
::
move
(
events
));
places_to_ncclcomm_
.
emplace
(
places_key
,
std
::
move
(
nccl_comms
));
places_to_ctx_
.
emplace
(
places_key
,
std
::
move
(
dev_ctx
));
}
// Runs `fn` once per (inputs[i], outputs[i]) pair on the communicator and
// stream cached for the tensors' places, and returns a task whose control
// events are recorded on those streams after the calls were issued.
//
// The cache entry for the place list is created on first use under mutex_.
// `fn` is invoked as fn(input, output, ncclComm_t, stream) inside an
// NCCLGroupGuard scope; its return value is ignored here.
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType op_type) {
  const auto places = GetPlaceList(inputs);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.count(key) == 0) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];
  auto& comm_ctxs = places_to_ctx_[key];

  SyncDefaultStream(places, places_to_events_[key], comm_ctxs);

  auto task = CreateTask(places, rank_, op_type, inputs);
  task->SetOutputs(outputs);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      memory::RecordStream(inputs[i].Holder(), comm_ctxs[i]->stream());
    }
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < inputs.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      const auto& nccl_stream = comm_ctxs[i]->stream();
      fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
    }
  }

  // Record one control event per tensor on its communication context.
  for (size_t i = 0; i < inputs.size(); ++i) {
    cuda_guard.SetDevice(places[i]);
    task->control_events_[i].Record(*comm_ctxs[i]);
  }
  return task;
}
// Single-tensor overload: runs fn(in, out, ncclComm_t, stream) once on the
// communicator and stream cached for the place of `in`. No task is created and
// nothing is returned; the cache entry is created on first use under mutex_.
// `op_type` is not used by this overload.
template <typename Fn>
void ProcessGroupNCCL::Collective(const phi::DenseTensor* in,
                                  phi::DenseTensor* out,
                                  Fn fn,
                                  CommType op_type) {
  std::vector<Place> places;
  places.push_back(in->place());
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.count(key) == 0) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];
  auto& comm_ctxs = places_to_ctx_[key];

  SyncDefaultStream(places, places_to_events_[key], comm_ctxs);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    cuda_guard.SetDevice(places[0]);
    memory::RecordStream(in->Holder(), comm_ctxs[0]->stream());
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    cuda_guard.SetDevice(places[0]);
    const auto& nccl_stream = comm_ctxs[0]->stream();
    fn(in, out, nccl_comms[0]->GetNcclComm(), nccl_stream);
  }

  cuda_guard.SetDevice(places[0]);
}
// Runs the point-to-point callback `fn` once per tensor as
// fn(tensor, ncclComm_t, stream, dst_rank) on the communicator and stream
// cached for the tensors' places, and returns a task whose control events are
// recorded on those streams afterwards. The cache entry is created on first
// use under mutex_.
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
    std::vector<phi::DenseTensor>& tensors,
    Fn fn,
    int dst_rank,
    CommType op_type) {
  const auto places = GetPlaceList(tensors);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.count(key) == 0) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];
  auto& comm_ctxs = places_to_ctx_[key];

  SyncDefaultStream(places, places_to_events_[key], comm_ctxs);

  auto task = CreateTask(places, rank_, op_type, tensors);

  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      memory::RecordStream(tensors[i].Holder(), comm_ctxs[i]->stream());
    }
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      const auto& nccl_stream = comm_ctxs[i]->stream();
      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }

  // Record one control event per tensor on its communication context.
  for (size_t i = 0; i < tensors.size(); ++i) {
    cuda_guard.SetDevice(places[i]);
    task->control_events_[i].Record(*comm_ctxs[i]);
  }
  return task;
}
// All-reduces each in_tensors[i] into out_tensors[i] with ncclAllReduce, using
// the reduction selected by opts.reduce_op. Raises InvalidArgument if the
// inputs fail CheckTensorsInCudaPlace.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const AllreduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

  auto all_reduce = [&](const phi::DenseTensor& input,
                        phi::DenseTensor& output,
                        ncclComm_t comm,
                        const gpuStream_t& stream) {
    return platform::dynload::ncclAllReduce(
        input.data(),
        output.data(),
        input.numel(),
        platform::ToNCCLDataType(input.type()),
        ToNCCLRedType(opts.reduce_op),
        comm,
        stream);
  };
  return Collective(in_tensors, out_tensors, all_reduce, CommType::ALLREDUCE);
}
// Broadcasts with ncclBroadcast from the root computed as
// opts.source_rank * in_tensors.size() + opts.source_root. Raises
// InvalidArgument if the inputs fail CheckTensorsInCudaPlace.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const BroadcastOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

  auto broadcast = [&](phi::DenseTensor& input,
                       phi::DenseTensor& output,
                       ncclComm_t comm,
                       const gpuStream_t& stream) {
    const auto root =
        opts.source_rank * in_tensors.size() + opts.source_root;
    return platform::dynload::ncclBroadcast(
        input.data(),
        output.data(),
        input.numel(),
        platform::ToNCCLDataType(input.type()),
        root,
        comm,
        stream);
  };
  return Collective(in_tensors, out_tensors, broadcast, CommType::BROADCAST);
}
// Barrier implemented as an in-place all-reduce of a one-element FLOAT32
// tensor created on place_. The tensor list is then moved into the task's
// barrierTensors_, which is what makes NCCLTask::Wait perform the extra
// device-wide synchronization. `opts` is not used.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
    const BarrierOptions& opts) {
  // Only support single card single process
  std::vector<phi::GPUPlace> places = {place_};

  std::vector<phi::DenseTensor> barrierTensors;
  barrierTensors.reserve(places.size());

  platform::CUDADeviceGuard gpuGuard;
  for (auto& place : places) {
    gpuGuard.SetDeviceIndex(place.GetDeviceId());
    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
    auto dense = std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl());
    barrierTensors.push_back(*dense);
  }

  auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
  auto* nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
  nccl_task->barrierTensors_ = std::move(barrierTensors);
  return task;
}
// Validates a tensor list for a point-to-point call. Raises InvalidArgument
// when the list is empty, when it has more entries than `num_devices`, when a
// tensor is not on a GPU place, or when two tensors share the same place.
void CheckTensorsInDifferentDevices(
    const std::vector<phi::DenseTensor>& tensors, const size_t num_devices) {
  PADDLE_ENFORCE_EQ(
      tensors.size() == 0,
      false,
      platform::errors::InvalidArgument("Tensor list must be nonempty."));
  PADDLE_ENFORCE_LE(
      tensors.size(),
      num_devices,
      platform::errors::InvalidArgument(
          "Tensor list mustn't be larger than the number of available GPUs."));

  // Places seen so far; a failed insert means the place was already used.
  std::set<Place> used_devices;

  for (const auto& t : tensors) {
    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()),
                      true,
                      platform::errors::InvalidArgument(
                          "Tensors must be CUDA and dense tensor."));

    const bool inserted = used_devices.insert(t.place()).second;
    PADDLE_ENFORCE_EQ(inserted,
                      true,
                      platform::errors::InvalidArgument(
                          "Tensors must be on distinct GPU devices."));
  }
}
// Sends every tensor in `tensors` to rank `dst_rank` with ncclSend. The list
// is first validated by CheckTensorsInDifferentDevices against GetSize().
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
    std::vector<phi::DenseTensor>& tensors, int dst_rank) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto send_one = [&](phi::DenseTensor& input,
                      ncclComm_t comm,
                      const gpuStream_t& stream,
                      int peer) {
    return platform::dynload::ncclSend(
        input.data(),
        input.numel(),
        platform::ToNCCLDataType(input.dtype()),
        peer,
        comm,
        stream);
  };
  return PointToPoint(tensors, send_one, dst_rank, CommType::SEND);
}
// Receives into every tensor in `tensors` from rank `src_rank` with ncclRecv.
// The list is first validated by CheckTensorsInDifferentDevices against
// GetSize().
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
    std::vector<phi::DenseTensor>& tensors, int src_rank) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto recv_one = [&](phi::DenseTensor& output,
                      ncclComm_t comm,
                      const gpuStream_t& stream,
                      int peer) {
    return platform::dynload::ncclRecv(
        output.data(),
        output.numel(),
        platform::ToNCCLDataType(output.dtype()),
        peer,
        comm,
        stream);
  };
  return PointToPoint(tensors, recv_one, src_rank, CommType::RECV);
}
// Sends the elements [offset, offset + length) of `tensors`, viewed as a flat
// 1-D tensor that shares its data, to rank `dst_rank` with ncclSend.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
    phi::DenseTensor& tensors, int dst_rank, int offset, int length) {
  // CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  // Flatten without copying, then take the requested slice.
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors;
  shared_tensors.push_back(flatten_tensor.Slice(offset, offset + length));

  auto send_one = [&](phi::DenseTensor& input,
                      ncclComm_t comm,
                      const gpuStream_t& stream,
                      int peer) {
    return platform::dynload::ncclSend(
        input.data(),
        input.numel(),
        platform::ToNCCLDataType(input.dtype()),
        peer,
        comm,
        stream);
  };
  return PointToPoint(shared_tensors, send_one, dst_rank, CommType::SEND);
}
// Receives from rank `src_rank` with ncclRecv into the elements
// [offset, offset + length) of `tensors`, viewed as a flat 1-D tensor that
// shares its data.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
    phi::DenseTensor& tensors, int src_rank, int offset, int length) {
  // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);

  // Flatten without copying, then take the requested slice.
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors;
  shared_tensors.push_back(flatten_tensor.Slice(offset, offset + length));

  auto recv_one = [&](phi::DenseTensor& output,
                      ncclComm_t comm,
                      const gpuStream_t& stream,
                      int peer) {
    return platform::dynload::ncclRecv(
        output.data(),
        output.numel(),
        platform::ToNCCLDataType(output.dtype()),
        peer,
        comm,
        stream);
  };
  return PointToPoint(shared_tensors, recv_one, src_rank, CommType::RECV);
}
// All-gathers each in_tensors[i] into out_tensors[i] with ncclAllGather,
// sending input.numel() elements per rank. Raises InvalidArgument if the
// inputs or the outputs fail CheckTensorsInCudaPlace.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));

  auto all_gather = [&](const phi::DenseTensor& input,
                        phi::DenseTensor& output,
                        ncclComm_t comm,
                        const gpuStream_t& stream) {
    return platform::dynload::ncclAllGather(
        input.data(),
        output.data(),
        input.numel(),
        platform::ToNCCLDataType(input.dtype()),
        comm,
        stream);
  };
  return Collective(in_tensors, out_tensors, all_gather, CommType::ALLGATHER);
}
// Returns `raw_pointer` advanced by `offset` elements of the element type that
// corresponds to `type`. FLOAT32, FLOAT64, INT32, INT64 and FLOAT16 are
// handled (FLOAT16 is stepped as a 16-bit integer); any other type raises
// Unimplemented.
void* GetPointerByOffset(void* raw_pointer,
                         size_t offset,
                         experimental::DataType type) {
  switch (type) {
    case experimental::DataType::FLOAT32:
      return static_cast<void*>(static_cast<float*>(raw_pointer) + offset);
    case experimental::DataType::FLOAT64:
      return static_cast<void*>(static_cast<double*>(raw_pointer) + offset);
    case experimental::DataType::INT32:
      return static_cast<void*>(static_cast<int32_t*>(raw_pointer) + offset);
    case experimental::DataType::INT64:
      return static_cast<void*>(static_cast<int64_t*>(raw_pointer) + offset);
    case experimental::DataType::FLOAT16:
      return static_cast<void*>(static_cast<int16_t*>(raw_pointer) + offset);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "This datatype in nccl is not supported."));
  }
  return nullptr;
}
// All-to-all exchange: for every pair (in_tensors[k], out_tensors[k]) the
// input is cut into size_ consecutive chunks of input.numel() / size_
// elements; chunk i is sent to rank i and the chunk received from rank i is
// written at the same element offset of the output. All sends and receives are
// issued inside one ncclGroupStart/ncclGroupEnd pair.
//
// Raises InvalidArgument if the inputs or the outputs fail
// CheckTensorsInCudaPlace. The message of the output check now names the
// outputs (it previously repeated the input message), matching AllGather.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        // Element offset of the chunk exchanged with rank i.
        size_t offset = 0;
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()),
              i,
              comm,
              stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), offset, input.dtype()),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()),
              i,
              comm,
              stream));
          offset += input.numel() / size_;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL);
}
// All-to-all exchange with per-rank split sizes along dim 0.
//
// For each tensor pair, in_sizes[i] rows of the input are sent to rank i and
// out_sizes[i] rows are received from rank i into the output; a "row" is
// numel() / dims[0] elements. `in_sizes` and `out_sizes` are validated by
// CheckSplitSizes, which fills them with an even split when they are empty
// (so both vectors may be modified). All sends and receives are issued inside
// one ncclGroupStart/ncclGroupEnd pair.
//
// Raises InvalidArgument if the inputs or the outputs fail
// CheckTensorsInCudaPlace, or if input and output dtypes differ. The message
// of the output check now names the outputs (it previously repeated the input
// message), matching AllGather.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll_Single(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    std::vector<int64_t>& in_sizes,
    std::vector<int64_t>& out_sizes) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
                          true,
                          platform::errors::InvalidArgument(
                              "The dtypes of input and output must be equal."));

        std::vector<int64_t> in_dims = phi::vectorize(input.dims());
        std::vector<int64_t> out_dims = phi::vectorize(output.dims());
        CheckSplitSizes(in_sizes, in_dims);
        CheckSplitSizes(out_sizes, out_dims);

        // Offsets and lengths are counted in elements, not rows.
        size_t in_offset = 0, out_offset = 0;
        size_t in_length = 0, out_length = 0;
        size_t in_row_size = input.numel() / in_dims[0];
        size_t out_row_size = output.numel() / out_dims[0];

        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
        for (auto i = 0; i < size_; i++) {
          in_length = in_sizes[i] * in_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
              GetPointerByOffset(input.data(), in_offset, input.dtype()),
              in_length,
              platform::ToNCCLDataType(input.dtype()),
              i,
              comm,
              stream));
          in_offset += in_length;

          out_length = out_sizes[i] * out_row_size;
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              GetPointerByOffset(output.data(), out_offset, input.dtype()),
              out_length,
              platform::ToNCCLDataType(input.dtype()),
              i,
              comm,
              stream));
          out_offset += out_length;
        }
        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
      },
      CommType::ALLTOALL_SINGLE);
}
// Reduces each in_tensors[i] into out_tensors[i] on rank opts.root_rank with
// ncclReduce, using the reduction selected by opts.reduce_op. Raises
// InvalidArgument if the inputs fail CheckTensorsInCudaPlace.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ReduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));

  auto reduce_one = [&](const phi::DenseTensor& input,
                        phi::DenseTensor& output,
                        ncclComm_t comm,
                        const gpuStream_t& stream) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
        input.data(),
        output.data(),
        input.numel(),
        platform::ToNCCLDataType(input.dtype()),
        ToNCCLRedType(opts.reduce_op),
        opts.root_rank,
        comm,
        stream));
  };
  return Collective(in_tensors, out_tensors, reduce_one, CommType::REDUCE);
}
// Scatters the input of rank opts.root_rank across the group.
//
// On the root, the input is cut into size_ consecutive chunks of
// input.numel() / size_ elements; chunk i is sent to rank i and the root also
// receives its own chunk, all inside one ncclGroupStart/ncclGroupEnd pair. On
// every other rank a single ncclRecv from the root fills the output with
// input.numel() / size_ elements.
//
// Raises InvalidArgument if the inputs or the outputs fail
// CheckTensorsInCudaPlace. The message of the output check now names the
// outputs (it previously repeated the input message), matching AllGather.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(in_tensors),
      true,
      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCudaPlace(out_tensors),
      true,
      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream) {
        // Element offset of the chunk sent to rank i (root only).
        size_t offset = 0;
        if (rank_ == opts.root_rank) {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
          for (auto i = 0; i < size_; i++) {
            PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
                GetPointerByOffset(input.data(), offset, input.dtype()),
                input.numel() / size_,
                platform::ToNCCLDataType(input.dtype()),
                i,
                comm,
                stream));
            offset += input.numel() / size_;
          }
          // The root also receives the chunk it sent to itself.
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()),
              opts.root_rank,
              comm,
              stream));
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
        } else {
          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
              output.data(),
              input.numel() / size_,
              platform::ToNCCLDataType(input.dtype()),
              opts.root_rank,
              comm,
              stream));
        }
      },
      CommType::SCATTER);
}
// Reduce-scatters `in_tensor` into `out_tensor` with ncclReduceScatter, each
// rank receiving output.numel() elements reduced with opts.reduce_op.
//
// Raises InvalidArgument when the two dtypes differ or when
// in_tensor.numel() != out_tensor.numel() * size_. When
// FLAGS_use_stream_safe_cuda_allocator is set, the output's holder is also
// recorded on the communication stream before the call.
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::_ReduceScatterBase(
    phi::DenseTensor& out_tensor,
    phi::DenseTensor& in_tensor,
    const ReduceScatterOptions& opts) {
  // auto tensor = out_tensors.back();
  PADDLE_ENFORCE_EQ(
      out_tensor.dtype(),
      in_tensor.dtype(),
      platform::errors::InvalidArgument(
          "Input tensor and output tensor should be same dtype."));

  PADDLE_ENFORCE_EQ(
      out_tensor.numel() * size_,
      in_tensor.numel(),
      platform::errors::InvalidArgument("input tensor must be the same size as "
                                        "output tensor size times world_size"));

  // Collective works on tensor lists, so wrap the two tensors.
  auto inputs = std::vector<phi::DenseTensor>{in_tensor};
  auto outputs = std::vector<phi::DenseTensor>{out_tensor};

  auto reduce_scatter = [&](phi::DenseTensor& input,
                            phi::DenseTensor& output,
                            ncclComm_t comm,
                            const gpuStream_t& stream) {
    if (FLAGS_use_stream_safe_cuda_allocator) {
      platform::CUDADeviceGuard cuda_guard;
      cuda_guard.SetDevice(output.place());
      memory::RecordStream(output.Holder(), stream);
    }
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
        input.data(),
        output.data(),
        output.numel(),
        platform::ToNCCLDataType(input.dtype()),
        ToNCCLRedType(opts.reduce_op),
        comm,
        stream));
  };
  return Collective(inputs, outputs, reduce_scatter, CommType::REDUCE_SCATTER);
}
// Opens an NCCL group by calling ncclGroupStart(); a failing status is
// reported through PADDLE_ENFORCE_GPU_SUCCESS.
void ProcessGroupNCCL::GroupStart() {
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
}
// Closes an NCCL group by calling ncclGroupEnd(); a failing status is
// reported through PADDLE_ENFORCE_GPU_SUCCESS.
void ProcessGroupNCCL::GroupEnd() {
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
// Name of this backend; ProcessGroupNCCL::GetBackendName() returns it.
constexpr char const* NCCL_BACKEND_NAME = "NCCL";
namespace
paddle
{
namespace
distributed
{
// Short names for platform types used throughout this header.
using Place = paddle::platform::Place;
using CUDAStream = platform::stream::CUDAStream;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
// ProcessGroup implementation whose backend name is NCCL_BACKEND_NAME.
//
// Only declarations live here; the member functions are defined in the
// matching .cc file.
class ProcessGroupNCCL : public ProcessGroup {
 public:
  // Task type handed back by the operations of this process group.
  class NCCLTask : public ProcessGroup::Task,
                   public std::enable_shared_from_this<NCCLTask> {
   public:
    // The third parameter used to be spelled `CommType CommType`, which hid
    // the type name for the rest of the declaration; only the parameter name
    // changed.
    NCCLTask(const std::vector<Place>& places,
             int rank,
             CommType comm_type,
             const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();

    void SynchronizeStreams();

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize();

    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT

    virtual ~NCCLTask();

    std::vector<EventManager> control_events_;
    std::vector<phi::DenseTensor> barrierTensors_;

   protected:
    std::vector<Place> places_;
    std::vector<std::shared_ptr<NCCLCommManager>> ncclComms_;
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;
  };

  ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                   int rank,
                   int size,
                   const platform::Place& place,
                   int gid);

  // Returns NCCL_BACKEND_NAME as a std::string.
  const std::string GetBackendName() const override {
    return std::string(NCCL_BACKEND_NAME);
  }

  // Communication operations overriding the ProcessGroup interface. Each one
  // returns a ProcessGroup::Task handle.
  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors, int dst_rank) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors, int src_rank) override;

  std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
                                                   int dst_rank,
                                                   int offset,
                                                   int length) override;

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
                                                   int src_rank,
                                                   int offset,
                                                   int length) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>& in,
      std::vector<phi::DenseTensor>& out) override;

  std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
      std::vector<phi::DenseTensor>& in,
      std::vector<phi::DenseTensor>& out,
      std::vector<int64_t>& in_sizes,
      std::vector<int64_t>& out_sizes) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions&) override;

  std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
      phi::DenseTensor&,  // NOLINT
      phi::DenseTensor&,  // NOLINT
      const ReduceScatterOptions&) override;

  // Static wrappers around ncclGroupStart() / ncclGroupEnd().
  static void GroupStart();

  static void GroupEnd();

 protected:
  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType opType,
      const std::vector<phi::DenseTensor>& inputs);

  std::shared_ptr<Store> store_;
  std::shared_ptr<NCCLCommManager> nccl_comm_;
  std::mutex mutex_;

  // The three maps below are keyed by a std::string.
  // NOTE(review): presumably the same places key that is passed to
  // CreateNCCLManagerCache() -- confirm against the .cc file.
  std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLCommManager>>>
      places_to_ncclcomm_;

  std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;

  std::unordered_map<std::string,
                     std::vector<std::unique_ptr<CUDADeviceContext>>>
      places_to_ctx_;

  std::set<int> used_place_ids_;

 private:
  void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids,  // NOLINT
                   int root,                             // NOLINT
                   int server_fd);

  void BroadcastUniqueNCCLID(std::vector<ncclUniqueId>& nccl_ids);  // NOLINT

  // Runs `fn` for a collective over tensor lists and returns its task.
  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  // Pointer-based overload that does not return a task.
  template <typename Fn>
  void Collective(const phi::DenseTensor*,
                  phi::DenseTensor*,
                  Fn fn,
                  CommType op_type);

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> PointToPoint(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      Fn fn,
      int dst_rank,
      CommType op_type);

  void CreateNCCLManagerCache(const std::string& places_key,
                              const std::vector<Place>& places);

  void CheckSplitSizes(std::vector<int64_t>& split_sizes,
                       std::vector<int64_t> tensor_shape);
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/Types.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
namespace
paddle
{
namespace
distributed
{
// TODO(shenliang03): To support AVG for reduce
// Reduction operators selectable through the *Options structs below.
// Stored as std::uint8_t; the enumerators are numbered consecutively from 0.
enum class ReduceOp : std::uint8_t {
  SUM = 0,
  AVG = 1,
  MAX = 2,
  MIN = 3,
  PRODUCT = 4,
};
// Options accepted by AllReduce.
struct AllreduceOptions {
  // Reduction operator; defaults to SUM.
  ReduceOp reduce_op{ReduceOp::SUM};
};
// Options accepted by Broadcast. Both fields default to 0.
// NOTE(review): presumably source_rank is the broadcasting rank and
// source_root an index within that rank -- confirm against the backends.
struct BroadcastOptions {
  int source_rank{0};
  int source_root{0};
};
// Options accepted by Barrier.
struct BarrierOptions {
  // Empty by default.
  // NOTE(review): presumably the device ids taking part in the barrier --
  // confirm against the backends.
  std::vector<int> place_ids{};
};
// Options accepted by Reduce.
struct ReduceOptions {
  // Reduction operator; defaults to SUM.
  ReduceOp reduce_op{ReduceOp::SUM};
  // Root rank of the reduction; defaults to 0.
  int root_rank{0};
};
// Options accepted by Scatter.
struct ScatterOptions {
  // Rank whose input is scattered; defaults to 0.
  int root_rank{0};
};
// Options accepted by _ReduceScatterBase.
struct ReduceScatterOptions {
  // Reduction operator; defaults to SUM.
  ReduceOp reduce_op{ReduceOp::SUM};
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/reducer.cc
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
namespace
paddle
{
namespace
distributed
{
// Maps the allocation type of `place` to the matching Backend value.
// Only GPU and CPU are known; any other allocation type raises
// InvalidArgument.
static Backend TransToBackend(platform::Place place) {
  static const std::map<phi::AllocationType, Backend> type_backend = {
      {phi::AllocationType::CPU, Backend::CPU},
      {phi::AllocationType::GPU, Backend::GPU},
  };

  const auto it = type_backend.find(place.GetType());
  PADDLE_ENFORCE_EQ(it != type_backend.end(),
                    true,
                    platform::errors::InvalidArgument(
                        "Place type (%s) is not supported. ", place));
  return it->second;
}
// Partitions tensor indices into communication groups.
//
// tensors:            tensors to be grouped.
// is_sparse_gradient: one flag per tensor; must have tensors.size() entries.
// group_size_limits:  byte limits; a dtype starts with limit 0 and moves to
//                     the next limit (clamped to the last one) every time one
//                     of its groups is closed.
// tensor_indices:     either empty, or a permutation of
//                     [0, tensors.size()) giving the index recorded for the
//                     i-th tensor.
//
// Behaviour:
//   - an index flagged in is_sparse_gradient becomes a group of its own;
//   - tensors that are not dense tensors are skipped;
//   - the remaining tensors are accumulated per dtype; a group is closed as
//     soon as its accumulated byte size reaches the current limit;
//   - groups still open at the end are appended;
//   - when tensor_indices is empty the groups are sorted by their first
//     index.
//
// Returns the list of groups, each a list of tensor indices.
// Raises PreconditionNotMet when an argument violates the rules above or
// when an empty group was produced.
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
    const std::vector<Tensor> tensors,
    const std::vector<bool> &is_sparse_gradient,
    const std::vector<size_t> &group_size_limits,
    const std::vector<int64_t> &tensor_indices) {
  PADDLE_ENFORCE_EQ(
      tensors.size(),
      is_sparse_gradient.size(),
      platform::errors::PreconditionNotMet(
          "tensors len must be equal to is_sparse_gradient len, but "
          "[%lu] != [%lu]",
          tensors.size(),
          is_sparse_gradient.size()));

  // True when x holds every value of [0, x.size()) exactly once.
  auto check_perm = [](const std::vector<int64_t> &x) -> bool {
    size_t len = x.size();
    std::vector<size_t> cnt(len, 0);
    for (size_t i = 0; i < len; ++i) {
      if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
        return false;
      }
      cnt[x[i]]++;
    }
    return true;
  };

  PADDLE_ENFORCE_EQ(true,
                    check_perm(tensor_indices),
                    platform::errors::PreconditionNotMet(
                        "tensor_indices must be a permutation from 0 to %lu",
                        tensor_indices.size()));

  // tensor_indices[i] is read for every i < tensors.size() below, and its
  // values index is_sparse_gradient, so a non-empty list must match the
  // number of tensors; otherwise the loop reads out of bounds.
  PADDLE_ENFORCE_EQ(
      tensor_indices.empty() || tensor_indices.size() == tensors.size(),
      true,
      platform::errors::PreconditionNotMet(
          "tensor_indices len must be 0 or equal to tensors len, but "
          "[%lu] != [%lu]",
          tensor_indices.size(),
          tensors.size()));

  // the return vector
  std::vector<std::vector<size_t>> res;

  // Key: the var type
  // Value: should use which index in group_size_limits for group size limit
  std::map<experimental::DataType, size_t> group_limit_index;

  // Key: the var type
  // Value: <the var index in input tensors, total numel in this group>
  std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
      next_group;

  for (size_t i = 0; i < tensors.size(); ++i) {
    const auto &var = tensors[i];

    const size_t tensor_real_index =
        tensor_indices.empty() ? i : static_cast<size_t>(tensor_indices[i]);

    if (is_sparse_gradient[tensor_real_index]) {
      // we keep sparse var a single group
      res.push_back({tensor_real_index});
      continue;
    }

    const auto &var_dtype = var.dtype();
    VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;

    if (!var.is_dense_tensor()) {
      VLOG(3) << "var " << var.name()
              << " is not tensor or selected_rows, so skip it";
      continue;
    }
    const int64_t var_size =
        std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();

    // A dense tensor needs a size limit to compare against. Without this
    // check an empty list leads to an out-of-bounds read and to an unsigned
    // underflow in `group_size_limits.size() - 1`.
    PADDLE_ENFORCE_EQ(
        group_size_limits.empty(),
        false,
        platform::errors::PreconditionNotMet(
            "group_size_limits must not be empty when dense tensors "
            "are grouped."));

    auto &group_info = next_group[var_dtype];
    group_info.first.push_back(tensor_real_index);
    group_info.second += experimental::SizeOf(var_dtype) * var_size;

    // operator[] value-initializes the entry to 0 for the first var of a
    // dtype, which is the limit index every dtype starts with.
    auto &cur_limit_index = group_limit_index[var_dtype];
    if (group_info.second >= group_size_limits[cur_limit_index]) {
      // exceed group capacity and create a new group
      res.emplace_back(std::move(group_info.first));
      group_info = std::pair<std::vector<size_t>, size_t>();
      cur_limit_index =
          (std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
    }
  }

  // add the final groups
  for (auto &e : next_group) {
    auto &group_info = e.second;
    if (!group_info.first.empty()) {
      res.emplace_back(std::move(group_info.first));
    }
  }

  for (const auto &group_index : res) {
    PADDLE_ENFORCE_NE(
        group_index.empty(),
        true,
        platform::errors::PreconditionNotMet(
            "AssignGroupBySize construct empty group, please check."));
  }

  if (tensor_indices.empty()) {
    // Order the groups by the first tensor index they contain.
    std::sort(res.begin(),
              res.end(),
              [](const std::vector<size_t> &x, const std::vector<size_t> &y) {
                return x.front() < y.front();
              });
  }
  return res;
}
// Concatenates dense_tensors_ along axis 0 into the DenseTensor held by
// p_dense_contents, using ConcatFunctor<DeviceContext, T>.
template <typename DeviceContext, typename T>
static void ConcatTensorsForAllReduce(
    const DeviceContext &context,
    const std::vector<phi::DenseTensor> &dense_tensors_,
    Tensor *p_dense_contents) {
  operators::math::ConcatFunctor<DeviceContext, T> concat;
  // Destination of the concat: the dense implementation behind the Tensor.
  const auto dst =
      std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl());
  concat(context, dense_tensors_, 0, dst.get());
}
template
<
typename
DeviceContext
,
typename
T
>
static
void
SplitTensorsForAllReduce
(
const
DeviceContext
&
context
,
Tensor
*
p_dense_contents
,
std
::
vector
<
phi
::
DenseTensor
>
*
p_dense_tensors
)
{
auto
*
in
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
p_dense_contents
->
impl
())
.
get
();
std
::
vector
<
phi
::
DenseTensor
*>
outs
;
std
::
vector
<
const
phi
::
DenseTensor
*>
shape_refer
;
outs
.
reserve
(
p_dense_tensors
->
size
());
shape_refer
.
reserve
(
p_dense_tensors
->
size
());
for
(
auto
&
tensor
:
*
p_dense_tensors
)
{
outs
.
emplace_back
(
&
tensor
);
shape_refer
.
emplace_back
(
&
tensor
);
}
operators
::
math
::
SplitFunctor
<
DeviceContext
,
T
>
split_functor_
;
split_functor_
(
context
,
*
in
,
shape_refer
,
0
,
&
outs
);
}
// Dispatches ConcatTensorsForAllReduce on the runtime data type.
// FLOAT16, FLOAT32 and FLOAT64 are handled; any other type raises
// Unimplemented.
// context is used to select the stream for concat
template <typename DeviceContext>
static void ConcatTensorsWithType(
    const DeviceContext &context,
    const std::vector<phi::DenseTensor> &dense_tensors_,
    Tensor *p_dense_contents,
    phi::DataType type) {
  if (type == phi::DataType::FLOAT16) {
    ConcatTensorsForAllReduce<DeviceContext, platform::float16>(
        context, dense_tensors_, p_dense_contents);
    return;
  }
  if (type == phi::DataType::FLOAT32) {
    ConcatTensorsForAllReduce<DeviceContext, float>(
        context, dense_tensors_, p_dense_contents);
    return;
  }
  if (type == phi::DataType::FLOAT64) {
    ConcatTensorsForAllReduce<DeviceContext, double>(
        context, dense_tensors_, p_dense_contents);
    return;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it concats tensors for "
      "allreduce.",
      type));
}
// Dispatches SplitTensorsForAllReduce on the runtime data type.
// FLOAT16, FLOAT32 and FLOAT64 are handled; any other type raises
// Unimplemented.
// context is used to select the stream for split
template <typename DeviceContext>
static void SplitTensorsWithType(const DeviceContext &context,
                                 Tensor *p_dense_contents,
                                 std::vector<phi::DenseTensor> *p_dense_tensors,
                                 phi::DataType type) {
  if (type == phi::DataType::FLOAT16) {
    SplitTensorsForAllReduce<DeviceContext, platform::float16>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }
  if (type == phi::DataType::FLOAT32) {
    SplitTensorsForAllReduce<DeviceContext, float>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }
  if (type == phi::DataType::FLOAT64) {
    SplitTensorsForAllReduce<DeviceContext, double>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it splits tensors for "
      "allreduce.",
      type));
}
// Concatenates dense_tensors_ into dense_contents_ with the device context
// that DeviceContextPool returns for `place`.
//   - GPU place: needs a build with NCCL or RCCL, otherwise PermissionDenied
//     is raised;
//   - CPU place: uses phi::CPUContext;
//   - any other place: Unimplemented is raised.
void EagerGroup::ConcatTensors(const platform::Place &place) {
  if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    auto *gpu_ctx = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    ConcatTensorsWithType(*gpu_ctx, dense_tensors_, &dense_contents_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with NCCL,"
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
    return;
  }

  if (platform::is_cpu_place(place)) {
    auto *cpu_ctx = static_cast<phi::CPUContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    ConcatTensorsWithType(*cpu_ctx, dense_tensors_, &dense_contents_, dtype_);
    return;
  }

  PADDLE_THROW(platform::errors::Unimplemented(
      "Concat grad tensor not supported on place (%s)", place));
}
// Splits dense_contents_ back into dense_tensors_ with the device context
// that DeviceContextPool returns for `place`.
//   - GPU place: needs a build with NCCL or RCCL, otherwise PermissionDenied
//     is raised;
//   - CPU place: uses phi::CPUContext;
//   - any other place: Unimplemented is raised.
void EagerGroup::SplitTensors(const platform::Place &place) {
  if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    auto *gpu_ctx = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    SplitTensorsWithType(*gpu_ctx, &dense_contents_, &dense_tensors_, dtype_);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't split grad tensor since it's not compiled with NCCL,"
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
    return;
  }

  if (platform::is_cpu_place(place)) {
    auto *cpu_ctx = static_cast<phi::CPUContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    SplitTensorsWithType(*cpu_ctx, &dense_contents_, &dense_tensors_, dtype_);
    return;
  }

  PADDLE_THROW(platform::errors::Unimplemented(
      "Split grad tensor not supported on place (%s)", place));
}
// Builds the reducer for `tensors`.
//
// Steps performed:
//   1. store the arguments and read the world size from the process group;
//   2. build the groups from group_indices (InitializeGroups);
//   3. for every tensor, register a reduce hook on its GradNodeAccumulation
//      node that calls AddDistHook(index), and remember the node -> index
//      mapping in gradnode_index_map_;
//   4. size the per-tensor bookkeeping vectors and, when
//      find_unused_parameters is set, allocate global_used_vars_ as an INT32
//      tensor with one entry per tensor.
//
// Raises Fatal when a tensor has no grad node or when that node is not a
// GradNodeAccumulation.
EagerReducer::EagerReducer(
    const std::vector<Tensor> tensors,
    const std::vector<std::vector<size_t>> &group_indices,
    const std::vector<bool> &is_sparse_gradient,
    std::shared_ptr<distributed::ProcessGroup> process_group,
    const std::vector<size_t> &group_size_limits,
    bool find_unused_parameters)
    : tensors_(tensors),
      group_indices_(group_indices),
      is_sparse_gradient_(is_sparse_gradient),
      process_group_(process_group),
      group_size_limits_(group_size_limits),
      find_unused_vars_each_step_(find_unused_parameters) {
  VLOG(3) << "Start construct the Reducer ...";

  nranks_ = process_group_->GetSize();

  // initialize groups
  InitializeGroups(group_indices);

  for (size_t global_var_index = 0; global_var_index < tensors_.size();
       ++global_var_index) {
    auto tensor = tensors_[global_var_index];
    // Capture exactly what the hook needs; `[=]` captured `this` implicitly.
    auto reduce_hook = [this, global_var_index](void) -> void {
      this->AddDistHook(global_var_index);
    };

    const auto &grad_node = GetGradNodeFromTensor(&tensor);

    PADDLE_ENFORCE(
        grad_node.get() != nullptr,
        paddle::platform::errors::Fatal("Detected NULL grad_node,"
                                        "Leaf tensor should have had grad_node "
                                        "with type: GradNodeAccumulation"));
    const auto &accumulation_grad_node =
        std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
    // dynamic_pointer_cast yields an empty pointer when the node has another
    // type; fail with a message instead of dereferencing it.
    PADDLE_ENFORCE(
        accumulation_grad_node != nullptr,
        paddle::platform::errors::Fatal(
            "The grad_node of tensor %s is not a GradNodeAccumulation, so "
            "the reduce hook can not be registered.",
            tensor.name()));
    accumulation_grad_node->RegisterReduceHook(
        std::make_shared<egr::CppTensorVoidHook>(reduce_hook));

    gradnode_index_map_[grad_node.get()] = global_var_index;
  }

  vars_marked_ready_.resize(tensors_.size(), false);
  local_used_vars_.resize(tensors_.size(), 0);

  if (find_unused_vars_each_step_) {
    global_used_vars_ = paddle::experimental::empty(
        IntArray({static_cast<int32_t>(tensors_.size())}),
        DataType::INT32,
        inner_place_);
  }
}
// Returns the grad node stored in the autograd meta of `tensor`.
//
// Returns an empty pointer when the tensor carries no autograd meta, so that
// the caller's null check can report the problem. TraverseBackwardGraph
// treats get_autograd_meta() as nullable in the same way.
std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
    Tensor *tensor) {
  auto *autograd_meta =
      static_cast<egr::AutogradMeta *>(tensor->get_autograd_meta());
  if (autograd_meta == nullptr) {
    return nullptr;
  }
  return autograd_meta->GetMutableGradNode();
}
// Rebuilds groups_ and variable_locators_ from group_indices.
//
// For every entry of group_indices:
//   - a single index flagged in is_sparse_gradient_ becomes a sparse group
//     that only records the dtype;
//   - anything else becomes a dense group: InitializeDenseGroups fills the
//     per-tensor data and dense_contents_ is allocated with all_length_
//     elements;
//   - every tensor index of the group gets a TensorLocator recording the
//     group index and its position inside the group.
//
// Raises PreconditionNotMet when a group is empty.
void EagerReducer::InitializeGroups(
    const std::vector<std::vector<size_t>> &group_indices) {
  VLOG(3) << "Start initialize groups ..";

  // clear the group
  groups_.clear();
  groups_.reserve(group_indices.size());

  variable_locators_.clear();
  variable_locators_.resize(tensors_.size());

  auto group_nums = group_indices.size();
  for (size_t group_index = 0; group_index < group_nums; ++group_index) {
    const auto &tensor_indices_ = group_indices[group_index];
    PADDLE_ENFORCE_GT(
        tensor_indices_.size(),
        0,
        platform::errors::PreconditionNotMet(
            "The number of group[%d]'s elements is 0.", group_index));

    EagerGroup group;

    if (tensor_indices_.size() == 1 &&
        is_sparse_gradient_[tensor_indices_.front()]) {
      // process the sparse gradient. one sparse, one group
      // Only the dtype is read, so a reference avoids copying the Tensor.
      const auto &first_var = tensors_[tensor_indices_.front()];
      group.dtype_ = first_var.dtype();
      group.is_sparse_ = true;
    } else {
      // process the dense gradient.
      InitializeDenseGroups(tensor_indices_, &group);
      group.dense_contents_ = paddle::experimental::empty(
          IntArray({group.all_length_}), group.dtype_, inner_place_);
    }

    // map tensors to this group by VariableLocator
    size_t inside_group_index = 0;
    for (const auto var_index : tensor_indices_) {
      TensorLocator tensor_locator;
      tensor_locator.group_index = group_index;
      tensor_locator.inside_group_index = inside_group_index++;
      variable_locators_[var_index] = tensor_locator;
    }

    // tensor_indices_ refers to a const element, so this is a copy; the
    // former std::move() on it could never move anything.
    group.tensor_indices_ = tensor_indices_;
    groups_.emplace_back(std::move(group));

    VLOG(3) << "The Group[" << group_index << "]:" << groups_.back();
  }
}
// Fills *p_group for a dense group made of the tensors in tensor_indices_.
//
// For every tensor the element count is appended to length_, its shape to
// origin_shapes_ and an empty DenseTensor to dense_tensors_. The first tensor
// defines the group dtype and updates inner_place_; every later tensor must
// have that same dtype. all_length_ receives the sum of the element counts.
//
// Raises PreconditionNotMet when a tensor is flagged sparse, is not
// initialized, has no elements or has a different dtype.
void EagerReducer::InitializeDenseGroups(
    const std::vector<size_t> &tensor_indices_, EagerGroup *p_group) {
  VLOG(3) << "InitializeDenseGroups.";

  int64_t total_numel = 0;
  const size_t tensor_count = tensor_indices_.size();
  for (size_t pos = 0; pos < tensor_count; ++pos) {
    auto tensor_index = tensor_indices_[pos];
    auto &tensor = tensors_[tensor_index];
    const auto &tensor_name = tensor.name();

    PADDLE_ENFORCE_EQ(is_sparse_gradient_[tensor_index],
                      false,
                      platform::errors::PreconditionNotMet(
                          "Tensor %s's GRAD must be Tensor, but received "
                          "GRAD is SelectedRows",
                          tensor_name));

    PADDLE_ENFORCE_EQ(tensor.initialized(),
                      true,
                      platform::errors::PreconditionNotMet(
                          "Tensor %s is not initialized.", tensor_name));

    const auto size = tensor.numel();
    PADDLE_ENFORCE_GT(
        size,
        0,
        platform::errors::PreconditionNotMet(
            "The number of tensor %s's elements is 0.", tensor_name));

    total_numel += size;
    p_group->length_.push_back(size);

    // for concat operator
    p_group->origin_shapes_.push_back(IntArray(tensor.shape()));
    p_group->dense_tensors_.push_back(phi::DenseTensor());

    const auto &dtype = tensor.dtype();
    const auto &tensor_place = tensor.impl()->place();
    if (pos == 0) {
      // The first tensor fixes the dtype and the place used by the group.
      p_group->dtype_ = dtype;
      inner_place_ = tensor_place;
    } else {
      PADDLE_ENFORCE_EQ(dtype,
                        p_group->dtype_,
                        platform::errors::PreconditionNotMet(
                            "Tensor %s has unexpected dtype.", tensor_name));
    }
  }

  p_group->all_length_ = total_numel;
}
void
EagerReducer
::
TraverseBackwardGraph
(
const
std
::
vector
<
Tensor
>
&
outputs
)
{
std
::
queue
<
egr
::
GradNodeBase
*>
queue
;
std
::
set
<
egr
::
GradNodeBase
*>
visited
;
for
(
const
auto
&
output
:
outputs
)
{
auto
*
auto_grad_meta
=
static_cast
<
egr
::
AutogradMeta
*>
(
output
.
get_autograd_meta
());
if
(
!
auto_grad_meta
)
continue
;
auto
shared_grad_node
=
auto_grad_meta
->
GetMutableGradNode
();
if
(
shared_grad_node
==
nullptr
||
shared_grad_node
.
get
()
==
nullptr
||
auto_grad_meta
->
StopGradient
())
{
continue
;
}
egr
::
GradNodeBase
*
grad_node
=
shared_grad_node
.
get
();
queue
.
emplace
(
grad_node
);
}
while
(
!
queue
.
empty
())
{
egr
::
GradNodeBase
*
node
=
queue
.
front
();
queue
.
pop
();
const
paddle
::
small_vector
<
std
::
vector
<
egr
::
GradSlotMeta
>
,
egr
::
kSlotSmallVectorSize
>
&
metas
=
node
->
OutputMeta
();
for
(
size_t
i
=
0
;
i
<
metas
.
size
();
i
++
)
{
for
(
size_t
j
=
0
;
j
<
metas
[
i
].
size
();
j
++
)
{
const
egr
::
Edge
&
edge
=
metas
[
i
][
j
].
GetEdge
();
auto
next_node_shared
=
edge
.
GetMutableGradNode
();
if
(
!
next_node_shared
||
!
next_node_shared
.
get
())
{
continue
;
}
auto
*
next_node
=
next_node_shared
.
get
();
const
bool
was_inserted
=
visited
.
insert
(
next_node
).
second
;
if
(
was_inserted
)
{
queue
.
emplace
(
next_node
);
}
}
}
}
for
(
const
auto
&
it
:
gradnode_index_map_
)
{
if
(
visited
.
count
(
it
.
first
)
==
0
)
{
unused_vars_
.
push_back
(
it
.
second
);
VLOG
(
3
)
<<
"[Rank "
<<
process_group_
->
GetRank
()
<<
"]: "
<<
"Tensor "
<<
tensors_
[
it
.
second
].
name
()
<<
" at index "
<<
it
.
second
<<
" is marked as unused."
;
}
}
}
void
EagerReducer
::
PrepareForBackward
(
const
std
::
vector
<
Tensor
>
&
outputs
)
{
VLOG
(
3
)
<<
"after forward, then reset count for backward."
;
grad_need_hooks_
=
true
;
next_group_
=
0
;
std
::
for_each
(
groups_
.
begin
(),
groups_
.
end
(),
[](
EagerGroup
&
group
)
{
group
.
pending_
=
group
.
tensor_indices_
.
size
();
group
.
sparse_contents_
=
Tensor
();
});
// reinitialize vars_marked_ready_ for next iteration
vars_marked_ready_
.
clear
();
vars_marked_ready_
.
resize
(
tensors_
.
size
(),
false
);
PADDLE_ENFORCE_EQ
(
groups_need_finalize_
,
false
,
platform
::
errors
::
PreconditionNotMet
(
"A serious error has occurred here. Please "
"set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have "
"set, There may be several reasons for this error: "
"1) Please note that all forward outputs derived from the module "
"parameters must participate in the calculation of losses and "
"subsequent gradient calculations. If not, the wrapper will hang, "
"waiting for autograd to generate gradients for these parameters. "
"you can use detach or stop_gradient to make the unused parameters "
"detached from the autograd graph. "
"2) Used multiple forwards and one backward. You may be able to wrap "
"multiple forwards in a model."
));
// The first var to trigger the unused parameter
has_marked_unused_vars_
=
false
;
if
(
find_unused_vars_once_
||
find_unused_vars_each_step_
)
{
unused_vars_
.
clear
();
TraverseBackwardGraph
(
outputs
);
// only check once in first step
find_unused_vars_once_
=
false
;
}
if
(
find_unused_vars_each_step_
&&
unused_vars_
.
empty
())
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"All parameters are involved in the backward pass. "
"It is recommended to set find_unused_parameters to False "
"to improve performance. However, if unused parameters "
"appear in subsequent iterative training, then an error "
"will occur. Please make it clear that in the subsequent "
"training, there will be no parameters that are not used "
"in the backward pass, and then set find_unused_parameters"
;
}
if
(
unused_vars_
.
size
()
==
tensors_
.
size
())
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"There is no parameter in the device involved "
"in the backward calculation. If there are "
"parameters on other devices involved in the "
"backward, then a serious error will occur here."
;
}
}
// Hook body run when the gradient of tensor `var_index` arrives.
//
// Raises OutOfRange when var_index is not below variable_locators_.size().
// Does nothing further while grad_need_hooks_ is false. Otherwise the tensor
// is flagged in local_used_vars_; on the first call since
// has_marked_unused_vars_ was cleared, every index in unused_vars_ is marked
// ready as "not used"; finally var_index itself is marked ready as "used".
void EagerReducer::AddDistHook(size_t var_index) {
  PADDLE_ENFORCE_LT(var_index,
                    variable_locators_.size(),
                    platform::errors::OutOfRange(
                        "Out of bounds variable index. it must be less"
                        "than %d, but it is %d",
                        variable_locators_.size(),
                        var_index));

  // gradient synchronization is not required when grad_need_hooks_ is false.
  if (grad_need_hooks_) {
    VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
            << "@Grad] arrived and triggered disthook";

    local_used_vars_[var_index] = 1;

    if (!has_marked_unused_vars_) {
      has_marked_unused_vars_ = true;
      // Unused variables never trigger a hook, so mark them ready here.
      for (const auto skipped_index : unused_vars_) {
        MarkVarReady(skipped_index, false);
      }
    }

    MarkVarReady(var_index, true);
  }
}
void
EagerReducer
::
MarkVarReady
(
const
size_t
var_index
,
const
bool
is_used_var
)
{
VLOG
(
3
)
<<
"Tensor["
<<
var_index
<<
"]["
<<
tensors_
[
var_index
].
name
()
<<
"] is marked ready."
;
// error happened, if the var is ready before.
if
(
vars_marked_ready_
[
var_index
])
{
auto
error_info
=
string
::
Sprintf
(
"Error happened, when parameter[%d][%s] has been ready before. "
"Please set find_unused_parameters=True to traverse backward graph "
"in each step to prepare reduce in advance. If you have set, "
"there may be several reasons for this error: "
"1) In multiple reentrant backward phase, some parameters are reused."
"2) Using model parameters outside of forward function. Please "
"make sure that model parameters are not shared in concurrent "
"forward-backward passes."
,
var_index
,
tensors_
[
var_index
].
name
());
PADDLE_ENFORCE_EQ
(
has_marked_unused_vars_
,
false
,
platform
::
errors
::
PreconditionNotMet
(
error_info
));
error_info
+=
"3) Unused parameters retrieval is incorrect. "
"The return value of forward will be used to retrieve"
" the unused parameters of the entire model. These "
"gradients of unused parameters will not be synchronized "
"between multiple cards. However, if the unused "
"parameters participate in the backward calculation "
"again at a later time (e.g. after the forward function, "
"the loss calculation uses the unused "
"paramters of the forward and trigger backward), "
"its gradient will be wrong."
;
PADDLE_ENFORCE_EQ
(
has_marked_unused_vars_
,
true
,
platform
::
errors
::
PreconditionNotMet
(
error_info
));
}
else
{
vars_marked_ready_
[
var_index
]
=
true
;
}
groups_need_finalize_
=
true
;
const
auto
&
var_locator
=
variable_locators_
[
var_index
];
const
auto
group_index
=
var_locator
.
group_index
;
const
auto
inside_group_index
=
var_locator
.
inside_group_index
;
auto
&
group
=
groups_
[
group_index
];
auto
&
group_tensor
=
group
.
dense_tensors_
[
inside_group_index
];
const
auto
length
=
group
.
length_
[
inside_group_index
];
if
(
!
group
.
is_sparse_
)
{
if
(
is_used_var
)
{
auto
*
autograd_meta
=
tensors_
[
var_index
].
get_autograd_meta
();
auto
&
grad_tensor
=
static_cast
<
egr
::
AutogradMeta
*>
(
autograd_meta
)
->
Grad
();
group_tensor
.
ShareDataWith
(
*
(
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
grad_tensor
.
impl
())))
.
Resize
({
grad_tensor
.
numel
()});
}
else
{
// TODO(shenliang03): maybe save the memory by avoiding tensor
// construction
if
(
!
group_tensor
.
initialized
())
{
group_tensor
.
Resize
({
static_cast
<
int64_t
>
(
length
)});
group_tensor
.
mutable_data
(
inner_place_
,
group
.
dtype_
);
}
if
(
HasGrad
(
var_index
))
{
VLOG
(
3
)
<<
"Tensor["
<<
tensors_
[
var_index
].
name
()
<<
"] has grad"
;
auto
grad_tensor
=
egr
::
EagerUtils
::
mutable_grad
(
tensors_
[
var_index
]);
group_tensor
.
ShareDataWith
(
*
(
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
grad_tensor
->
impl
())))
.
Resize
({
length
});
}
else
{
VLOG
(
3
)
<<
"Tensor["
<<
tensors_
[
var_index
].
name
()
<<
"] doesn't have grad"
;
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
inner_place_
);
group_tensor
.
Resize
({
static_cast
<
int64_t
>
(
length
)});
phi
::
funcs
::
set_constant
(
*
dev_ctx
,
&
group_tensor
,
0.0
);
}
}
}
else
{
auto
*
autograd_meta
=
tensors_
[
var_index
].
get_autograd_meta
();
auto
&
grad_tensor
=
static_cast
<
egr
::
AutogradMeta
*>
(
autograd_meta
)
->
Grad
();
// process sparse group
PADDLE_ENFORCE_EQ
(
HasGrad
(
var_index
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"The sparse parameter[%d][%s] should have gradient. "
"Currently, DataParallel does not support sparse "
"parameters without generating gradients during training. "
"For example, if is_sparese=True is used in Embedding, "
"the current step of this parameter cannot generate gradient "
"because of stop_gradient/detatch, where error will occur."
,
var_index
,
tensors_
[
var_index
].
name
()));
// need to check tensor type
PADDLE_ENFORCE_EQ
(
grad_tensor
.
is_selected_rows
(),
true
,
platform
::
errors
::
PreconditionNotMet
(
"The sparse parameter[%d][%s] must have a selectedrows gradient. "
"Before forward pass, the parameter type is inferred to be "
"SelectedRows, but after backward pass, its actual type becomes "
"LodTensor. It is currently not supported by DataParallel. "
"For example, if sparse embedding is used, and the weight of "
"embedding is shared with subsequent dense parameters, then "
"the parameter gradient of the embedding will be converted "
"to dense parameters."
,
var_index
,
tensors_
[
var_index
].
name
()));
group
.
sparse_contents_
.
set_impl
(
grad_tensor
.
impl
());
}
if
(
--
group
.
pending_
==
0
)
{
// can start allreduce
MarkGroupReady
(
group_index
);
}
if
(
next_group_
==
groups_
.
size
())
{
FinalizeBackward
();
}
}
// Called when every variable of group `group_index` has reported ready.
// Communication is launched strictly in group order: starting at
// `next_group_`, every consecutive group whose pending counter is zero is
// scheduled (sparse or fused dense all-reduce) and `next_group_` advances.
// A group that becomes ready ahead of its turn is only logged; it is picked
// up later once the groups in front of it are ready.
void EagerReducer::MarkGroupReady(size_t group_index) {
  VLOG(3) << "Group[" << group_index << "] is ready";

  PADDLE_ENFORCE_GE(
      group_index, next_group_,
      platform::errors::PreconditionNotMet(
          "The index of the incoming group must be greater "
          "than or equal to the previously synchronized group index, "
          "expect it to greater than or equal to %d, but got %d.",
          next_group_, group_index));

  const bool ready_out_of_order = group_index > next_group_;
  if (ready_out_of_order) {
    VLOG(3) << "It will adjust the order of group in next batch automatically";
    return;
  }

  while (next_group_ < groups_.size()) {
    auto &ready_group = groups_[next_group_];
    if (ready_group.pending_ != 0) {
      // First group that still waits for gradients: stop scheduling here.
      break;
    }

    if (ready_group.is_sparse_) {
      AllReduceSparse(&ready_group, next_group_);
    } else {
      FusedAllReduceSchedule(&ready_group, next_group_);
    }
    ++next_group_;
  }
}
// Returns true when the gradient object of tensors_[var_index] exists and
// reports itself as initialized; false otherwise.
bool EagerReducer::HasGrad(size_t var_index) {
  auto grad_handle = egr::EagerUtils::mutable_grad(tensors_[var_index]);
  return grad_handle && grad_handle->initialized();
}
// Exchanges the per-variable "used" flags between all ranks and, for every
// variable that is unused locally but used on at least one rank, copies the
// (already reduced) slice of the group buffer into the variable's gradient.
//
// Steps:
//   1. Upload local_used_vars_ into global_used_vars_ and all-reduce it with
//      SUM, then download the result back into local_used_vars_.
//   2. For each index in unused_vars_: if the summed flag is non-zero, write
//      the matching entry of the group's dense_tensors_ into the gradient of
//      the variable and restore the variable's original shape.
void EagerReducer::ProcessUnusedDenseVars() {
  // The calculation stream must be used here to
  // avoid conflicts with communication.
  VLOG(3) << "Local used vars : "
          << string::join_strings(local_used_vars_, ',');

  const auto *dev_ctx =
      platform::DeviceContextPool::Instance().Get(inner_place_);
  auto *global_used_tensor =
      std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
          .get();
  framework::TensorFromVector<int32_t>(local_used_vars_, *dev_ctx,
                                       global_used_tensor);

  distributed::AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  std::vector<Tensor> reduce_tensors = {global_used_vars_};
  std::vector<phi::DenseTensor> in_out;
  for (auto &t : reduce_tensors) {
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  process_group_->AllReduce(in_out, in_out, opts)->Synchronize();

  // local_used_vars_ is std::vector<int32_t>; use the same element type for
  // the download as for the upload above.
  framework::TensorToVector<int32_t>(*global_used_tensor, *dev_ctx,
                                     &local_used_vars_);
  // Sync the compute stream so the host copy of the global flags is
  // complete before it is read below (this may cost some performance).
  dev_ctx->Wait();

  VLOG(3) << "Global used vars : "
          << string::join_strings(local_used_vars_, ',');

  for (const auto var_index : unused_vars_) {
    const bool global_unused = (local_used_vars_[var_index] == 0);

    // global used but local unused, set grad
    VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
            << "Var [" << var_index << "] [" << tensors_[var_index].name()
            << "] global_unused: " << global_unused
            << " has grad: " << HasGrad(var_index);

    if (!global_unused) {
      VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
              << process_group_->GetRank() << "]";
      const auto &var_locator = variable_locators_[var_index];
      const auto &group = groups_[var_locator.group_index];

      // Sparse groups need no check here and do not support
      // find_unused_parameters. Test this BEFORE touching dense_tensors_,
      // so that the vector is never indexed for a sparse group.
      if (group.is_sparse_) {
        continue;
      }
      auto &src_tensor = group.dense_tensors_[var_locator.inside_group_index];

      // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
      // gradient accumulation can continue normally after clear_gradients(),
      // especially in cases including complex control flow.
      std::static_pointer_cast<egr::GradNodeAccumulation>(
          GetGradNodeFromTensor(&tensors_[var_index]))
          ->SetFakeEmpty(false);

      Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));

      auto dest_var_base = tensors_[var_index];
      auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
      grad_tensor->copy_(grad_value, inner_place_, true);
      grad_tensor->reshape(dest_var_base.shape());
    }
  }
}
void
EagerReducer
::
FinalizeBackward
()
{
groups_need_finalize_
=
false
;
grad_need_hooks_
=
false
;
for
(
auto
&
group
:
groups_
)
{
if
(
!
group
.
is_sparse_
)
{
group
.
task
->
Synchronize
();
}
}
for
(
auto
&
group
:
groups_
)
{
if
(
!
group
.
is_sparse_
)
{
group
.
SplitTensors
(
inner_place_
);
}
}
if
(
find_unused_vars_each_step_
)
{
ProcessUnusedDenseVars
();
local_used_vars_
.
clear
();
local_used_vars_
.
resize
(
tensors_
.
size
(),
0
);
VLOG
(
3
)
<<
"ProcessUnusedDenseVars is finished."
;
}
VLOG
(
3
)
<<
"In the batch, Reducer is finished."
;
}
// Launches the fused all-reduce of one dense group.
// Timeline: concat -> divide by nranks -> all-reduce (SUM) -> split, where
// the final split happens later in FinalizeBackward(). The returned task is
// stored in group->task so it can be synchronized there.
void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
                                          const int curr_group_index) {
  VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";

  // Gather the per-variable tensors into the fused buffer.
  group->ConcatTensors(inner_place_);

  // Pre-scale by 1/nranks so that the SUM reduction yields the mean.
  paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0,
                               false);

  // The process group works on DenseTensor handles of the fused buffer.
  std::vector<phi::DenseTensor> fused_buffers;
  fused_buffers.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(
      group->dense_contents_.impl()));

  distributed::AllreduceOptions sum_opts;
  sum_opts.reduce_op = ReduceOp::SUM;
  group->task =
      process_group_->AllReduce(fused_buffers, fused_buffers, sum_opts);
}
// All-reduces the SelectedRows gradient held in group->sparse_contents_.
//
// Outline (as implemented below):
//   1. Scale the sparse gradient by 1/nranks_.
//   2. Exchange the number of rows owned by every rank: each rank writes its
//      own count into slot `rank_` of a vector of length `size_` and the
//      vector is all-reduced with SUM.
//   3. If all ranks hold the same number of rows, gather rows and values
//      with two AllGather calls; otherwise broadcast rows/values rank by
//      rank and concatenate the pieces along axis 0.
//   4. Replace the rows and the value tensor of the source SelectedRows with
//      the gathered result.
//
// `curr_group_index` is only used for logging.
void EagerReducer::AllReduceSparse(EagerGroup *group,
                                   const int curr_group_index) {
  // div nranks
  // NOTE(review): scale_ is applied to a Tensor copy-constructed from
  // sparse_contents_; presumably the copy shares the underlying storage so
  // the scaling is visible through `src` below - confirm.
  Tensor sparse_tensor(group->sparse_contents_);
  paddle::experimental::scale_(sparse_tensor, 1.0 / nranks_, 0.0, false);

  VLOG(3) << "sparse_group [" << curr_group_index << "] start allreduce.";

  // Pick the device context of inner_place_. Only GPU (when built with
  // NCCL/RCCL) and CPU places are accepted.
  // NOTE(review): the two error messages below talk about "concat" and
  // "Split"; they look copied from other helpers - confirm the wording.
  auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
  if (platform::is_gpu_place(inner_place_)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    dev_ctx = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(inner_place_));
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't concat grad tensors since it's not compiled with NCCL,"
        "Please recompile or reinstall Paddle with NCCL support."));
#endif
  } else if (platform::is_cpu_place(inner_place_)) {
    dev_ctx = static_cast<phi::CPUContext *>(
        platform::DeviceContextPool::Instance().Get(inner_place_));
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Split grad tensor not supported on place (%s)", inner_place_));
  }

  auto src = std::dynamic_pointer_cast<phi::SelectedRows>(
      group->sparse_contents_.impl());
  const auto &src_rows = src->rows();

  const auto &rank_ = process_group_->GetRank();
  const auto &size_ = process_group_->GetSize();

  // Per-rank row counts; only this rank's slot is filled before the SUM
  // all-reduce. (Assumes the remaining slots start at zero - TODO confirm
  // for framework::Vector.)
  framework::Vector<int64_t> rows_num_vector(size_);
  rows_num_vector[rank_] = static_cast<int64_t>(src_rows.size());

  Tensor rows_num_tensor = paddle::experimental::empty(
      IntArray({static_cast<int64_t>(size_)}), DataType::INT64, inner_place_);
  auto *rows_num_dense_tensor =
      std::dynamic_pointer_cast<phi::DenseTensor>(rows_num_tensor.impl()).get();
  framework::TensorFromVector<int64_t>(rows_num_vector, *dev_ctx,
                                       rows_num_dense_tensor);

  distributed::AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  std::vector<Tensor> reduce_tensors = {rows_num_tensor};
  std::vector<phi::DenseTensor> in_out;
  for (auto &t : reduce_tensors) {
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  process_group_->AllReduce(in_out, in_out, opts)->Synchronize();

  // Download the reduced counts and wait until the copy has finished.
  framework::TensorToVector<int64_t>(*rows_num_dense_tensor, *dev_ctx,
                                     &rows_num_vector);
  dev_ctx->Wait();

  // Total number of rows over all ranks.
  const auto *cpu_rows_num_ptr = rows_num_vector.data();
  auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + size_,
                                  static_cast<int64_t>(0));

  VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',')
          << ", total rows number: " << rows_num
          << ", height: " << src->height();

  dev_ctx->Wait();

  Tensor src_value_tensor(std::make_shared<phi::DenseTensor>(src->value()));
  std::vector<int64_t> dst_shape = src_value_tensor.shape();

  if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_,
                  [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) {
    // Every rank contributes the same number of rows, so a single allgather
    // can replace the per-rank broadcasts of the other branch.

    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";

    // --- gather the row indices ---
    Tensor dst_rows_tensor =
        paddle::experimental::empty(IntArray({static_cast<int64_t>(rows_num)}),
                                    DataType::INT64, inner_place_);
    Tensor src_rows_tensor = paddle::experimental::empty(
        IntArray({static_cast<int64_t>((*src).rows().size())}), DataType::INT64,
        inner_place_);
    auto *src_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(src_rows_tensor.impl())
            .get();
    framework::TensorFromVector<int64_t>((*src).rows(), *dev_ctx,
                                         src_rows_dense_tensor);

    std::vector<Tensor> src_rows_tensors = {src_rows_tensor};
    std::vector<Tensor> dst_rows_tensors = {dst_rows_tensor};
    std::vector<phi::DenseTensor> in;
    std::vector<phi::DenseTensor> out;
    for (auto &t : src_rows_tensors) {
      in.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    for (auto &t : dst_rows_tensors) {
      out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    process_group_->AllGather(in, out)->Synchronize();

    framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
    auto *dst_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
            .get();
    framework::TensorToVector<int64_t>(*dst_rows_dense_tensor, *dev_ctx,
                                       &dst_rows_vector);
    dev_ctx->Wait();

    // --- gather the values ---
    // NOTE(review): indexing with size() - 2 assumes the value tensor has at
    // least two dimensions (rows x width) - confirm.
    dst_shape[dst_shape.size() - 2] = rows_num;
    auto dst_dense_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
        paddle::experimental::full(IntArray(dst_shape), 0,
                                   src_value_tensor.dtype(), inner_place_)
            .impl());

    auto dst =
        std::make_shared<phi::SelectedRows>(dst_rows_vector, (*src).height());
    *(dst->mutable_value()) = *dst_dense_tensor;
    Tensor dst_value_tensor(std::make_shared<phi::DenseTensor>(dst->value()));

    std::vector<Tensor> src_value_tensors = {src_value_tensor};
    std::vector<Tensor> dst_value_tensors = {dst_value_tensor};
    std::vector<phi::DenseTensor> src_dense;
    std::vector<phi::DenseTensor> dst_dense;
    for (auto &t : src_value_tensors) {
      src_dense.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    for (auto &t : dst_value_tensors) {
      dst_dense.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
    }
    process_group_->AllGather(src_dense, dst_dense)->Synchronize();

    // Publish the gathered rows/values through the source SelectedRows.
    src->set_rows(dst_rows_vector);
    *(src->mutable_value()) =
        *(std::dynamic_pointer_cast<phi::DenseTensor>(dst_value_tensor.impl()));
  } else {
    // Row counts differ between ranks: every rank broadcasts its own rows
    // and values in turn, and the pieces are concatenated afterwards.
    std::vector<Tensor> rows_tensors;
    std::vector<Tensor> values_tensors;

    for (int i = 0; i < size_; ++i) {
      // Receive buffers sized for rank i's contribution.
      std::vector<int64_t> value_tensor_shape = {
          cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]};
      Tensor rows_tensor = paddle::experimental::full(
          IntArray({static_cast<int64_t>(cpu_rows_num_ptr[i])}), 0,
          DataType::INT64, inner_place_);
      Tensor values_tensor = paddle::experimental::full(
          IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_);
      std::vector<phi::DenseTensor> rows_dense_vector;
      std::vector<phi::DenseTensor> values_dense_vector;

      if (i == rank_) {
        // This rank is the broadcast source: fill the buffers with its data.
        auto *rows_dense_tensor =
            std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl())
                .get();
        framework::TensorFromVector<int64_t>(src_rows, *dev_ctx,
                                             rows_dense_tensor);
        values_tensor.set_impl(
            std::make_shared<phi::DenseTensor>(src->value()));
      }
      rows_dense_vector.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(rows_tensor.impl()));
      values_dense_vector.push_back(
          *std::dynamic_pointer_cast<phi::DenseTensor>(values_tensor.impl()));

      auto b_opts = BroadcastOptions();
      b_opts.source_rank = i;
      // NOTE(review): only the second broadcast is waited on explicitly;
      // presumably both are ordered on the same communication stream -
      // confirm.
      process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts);
      process_group_
          ->Broadcast(values_dense_vector, values_dense_vector, b_opts)
          ->Wait();
      rows_tensors.push_back(rows_tensor);
      values_tensors.push_back(values_tensor);
    }

    Tensor dst_rows_tensor =
        paddle::experimental::concat(rows_tensors, phi::Scalar(0));
    framework::Vector<int64_t> dst_rows_vector(rows_num, 0);
    auto *dst_rows_dense_tensor =
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_rows_tensor.impl())
            .get();
    // NOTE(review): unlike the allgather branch there is no dev_ctx->Wait()
    // after this device-to-host copy before dst_rows_vector is consumed -
    // confirm that TensorToVector is synchronous here.
    framework::TensorToVector<int64_t>(*dst_rows_dense_tensor, *dev_ctx,
                                       &dst_rows_vector);
    src->set_rows(dst_rows_vector);

    Tensor dst_values_tensor =
        paddle::experimental::concat(values_tensors, phi::Scalar(0));
    *(src->mutable_value()) = *(
        std::dynamic_pointer_cast<phi::DenseTensor>(dst_values_tensor.impl()));
  }
}
// Writes a short, human-readable summary of a group: the total element
// count, the number of variables, and at most the first 100 variable
// indices (followed by " ..." when the list was truncated).
std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
  const auto &indices = group.tensor_indices_;
  const size_t kMaxPrinted = 100;
  const size_t shown =
      indices.size() < kMaxPrinted ? indices.size() : kMaxPrinted;

  out << "numel: " << group.all_length_ << " ;var number: " << indices.size()
      << "\n";

  out << "[";
  for (size_t pos = 0; pos < shown; ++pos) {
    if (pos != 0) {
      out << ' ';
    }
    out << indices[pos];
  }
  if (shown < indices.size()) {
    out << " ...";
  }
  out << "]\n";
  return out;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/reducer.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/utils/string/string_helper.h"
namespace
paddle
{
namespace
distributed
{
// Short names for the paddle::experimental types used by the eager reducer.
using Tensor = paddle::experimental::Tensor;
// Scalar / IntArray are the Tensor-based instantiations of the generic
// ScalarBase / IntArrayBase templates.
using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
using IntArray =
    paddle::experimental::IntArrayBase<paddle::experimental::Tensor>;
using Backend = paddle::experimental::Backend;
// Declaration only; the definition is not part of this header.
// Returns a list of groups, each group being a list of size_t indices.
// NOTE(review): the first parameter is unnamed and taken by value (a copy of
// the vector); presumably it is the list of tensors to be grouped, with
// is_sparse_gradient / group_size_limits / tensor_indices steering the
// partition - confirm against the definition.
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
    const std::vector<Tensor>, const std::vector<bool> &is_sparse_gradient,
    const std::vector<size_t> &group_size_limits,
    const std::vector<int64_t> &tensor_indices = {});
// One communication bucket of the eager reducer: a set of variables whose
// gradients are reduced together.
class EagerGroup {
 public:
  // Fused buffer of a dense group; this is the tensor that is scaled and
  // all-reduced by EagerReducer::FusedAllReduceSchedule.
  Tensor dense_contents_;
  // Gradient of a sparse group (its impl is taken from the variable's
  // gradient and all-reduced by EagerReducer::AllReduceSparse).
  Tensor sparse_contents_;
  // True when the group is handled by the sparse all-reduce path.
  bool is_sparse_ = false;

  // for concat kernel
  // One entry per variable of the group.
  std::vector<phi::DenseTensor> dense_tensors_;
  // Per-variable length, indexed like dense_tensors_.
  std::vector<int64_t> length_;
  // Printed as "numel" by operator<<; presumably the sum of length_.
  int64_t all_length_{0};
  std::vector<IntArray> origin_shapes_;

  // Global indices of participating tensors in the group
  std::vector<size_t> tensor_indices_;

  // Number of params that haven't been ready. When it is 0, it means
  // the group is ready.
  // Note: size_t is unsigned, so the -1 initializer wraps to the largest
  // size_t value (i.e. "not ready").
  size_t pending_ = -1;

  // external message of group
  phi::DataType dtype_;

  // help to sync
  // Handle of the in-flight all-reduce; set when the dense all-reduce is
  // scheduled and synchronized in EagerReducer::FinalizeBackward.
  std::shared_ptr<ProcessGroup::Task> task;

  // context is used to select the stream for concat
  void ConcatTensors(const platform::Place &);

  // context is used to select the stream for split
  void SplitTensors(const platform::Place &);

  // Prints element count, variable count and the leading tensor indices.
  friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
// Position of one variable inside the reducer's group layout.
struct TensorLocator {
  // record the index in groups_
  size_t group_index;
  // Index of the variable inside that group (used to index the group's
  // dense_tensors_ and length_).
  size_t inside_group_index;
};
// Gradient reducer for eager-mode data parallel training. Variables are
// partitioned into groups; when all variables of a group are ready, the
// group's gradients are all-reduced through `process_group_`.
class EagerReducer {
 public:
  // NOTE(review): `tensors` is taken by const value (copied on every call);
  // the remaining containers are taken by const reference.
  explicit EagerReducer(
      const std::vector<Tensor> tensors,
      const std::vector<std::vector<size_t>> &group_indices,
      const std::vector<bool> &is_sparse_gradient,
      std::shared_ptr<distributed::ProcessGroup> process_group,
      const std::vector<size_t> &group_size_limits,
      bool find_unused_parameters);

  virtual ~EagerReducer() {}

  std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);

  void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
  void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
                             EagerGroup *p_group);
  void PrepareForBackward(const std::vector<Tensor> &outputs);
  void AddDistHook(size_t var_index);
  // Records that the gradient of `var_index` is ready and, once its group
  // has no pending variables left, triggers MarkGroupReady.
  void MarkVarReady(const size_t var_index, const bool is_used_var);
  // Schedules communication for all consecutive ready groups, starting at
  // next_group_.
  void MarkGroupReady(const size_t group_index);
  // Dense path: concat -> scale by 1/nranks -> all-reduce (split happens in
  // FinalizeBackward).
  void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
  // Sparse path: gathers SelectedRows rows/values from all ranks.
  void AllReduceSparse(EagerGroup *group, const int curr_group_index);
  // Waits for the dense all-reduce tasks, splits the fused buffers and
  // optionally processes unused variables.
  void FinalizeBackward();
  void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
  // All-reduces the usage flags and fills gradients of variables that are
  // unused locally but used on another rank.
  void ProcessUnusedDenseVars();
  // True when the gradient of tensors_[var_index] exists and is initialized.
  bool HasGrad(size_t var_index);

 private:
  std::vector<Tensor> tensors_;
  std::vector<std::vector<size_t>> group_indices_;
  std::vector<bool> is_sparse_gradient_;
  std::shared_ptr<distributed::ProcessGroup> process_group_;
  std::vector<size_t> group_size_limits_;

  std::vector<EagerGroup> groups_;
  // Indexed by variable index; gives the group and the slot inside it.
  std::vector<TensorLocator> variable_locators_;
  // Place used for device contexts and tensor allocation.
  platform::Place inner_place_;
  // Index of the next group whose communication has not been launched yet.
  size_t next_group_ = 0;
  // Divisor applied to gradients before the SUM all-reduce.
  int64_t nranks_ = -1;

  // Cleared by FinalizeBackward.
  bool grad_need_hooks_{false};

  // Per-variable flag, set by MarkVarReady.
  std::vector<bool> vars_marked_ready_;
  // Per-variable usage flags; summed over ranks in ProcessUnusedDenseVars
  // and reset to zero afterwards.
  std::vector<int32_t> local_used_vars_;

  // Following variables are to help unused vars
  std::vector<size_t> unused_vars_;
  std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
  bool has_marked_unused_vars_{false};
  // When true, FinalizeBackward runs ProcessUnusedDenseVars on every step.
  bool find_unused_vars_each_step_{false};
  bool find_unused_vars_once_{true};
  // Set by MarkVarReady, cleared by FinalizeBackward.
  bool groups_need_finalize_{false};
  // Device buffer used to all-reduce local_used_vars_.
  Tensor global_used_vars_;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/CMakeLists.txt
0 → 100644
View file @
d2d32668
# Builds the afs_wrapper library from afs_warpper.cc.
# NOTE(review): the target is spelled "afs_wrapper" while the source file
# (and the commented-out property below) are spelled "afs_warpper"; keep the
# spellings in sync with the files on disk when touching this.
cc_library(
  afs_wrapper
  SRCS afs_warpper.cc
  DEPS fs ps_framework_proto)
#set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
paddle/fluid/distributed/common/afs_warpper.cc
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/common/afs_warpper.h"
#include "paddle/fluid/framework/io/fs.h"
namespace
paddle
{
namespace
distributed
{
// AfsClient impl

// Initializes the client from a FsClientParameter message by forwarding its
// fields to the (hadoop_bin, uri, user, passwd, buffer_size) overload.
// Currently backed by the hdfs command-line client.
int AfsClient::initialize(const FsClientParameter& fs_client_param) {
  const auto& hadoop_bin = fs_client_param.hadoop_bin();
  const auto& uri = fs_client_param.uri();
  const auto& user = fs_client_param.user();
  const auto& passwd = fs_client_param.passwd();
  return initialize(hadoop_bin, uri, user, passwd,
                    fs_client_param.buffer_size());
}
int
AfsClient
::
initialize
(
const
std
::
string
&
hadoop_bin
,
const
std
::
string
&
uri
,
const
std
::
string
&
user
,
const
std
::
string
&
passwd
,
int
buffer_size_param
)
{
return
initialize
(
hadoop_bin
,
uri
,
paddle
::
string
::
format_string
(
"%s,%s"
,
user
.
c_str
(),
passwd
.
c_str
()),
buffer_size_param
);
}
// Configures the process-wide hdfs helper: sets its buffer size (at least
// 32MB, or `buffer_size_param` when that is larger) and the hadoop command
// line built from `hadoop_bin`, `uri` and `ugi`. Always returns 0.
// Currently backed by the hdfs command-line client.
int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri,
                          const std::string& ugi, int buffer_size_param) {
  const int kMinBufferSize = 1 << 25;  // 32MB
  const size_t buffer_size =
      buffer_size_param > kMinBufferSize
          ? static_cast<size_t>(buffer_size_param)
          : static_cast<size_t>(kMinBufferSize);
  paddle::framework::hdfs_set_buffer_size(buffer_size);

  // stderr of the hadoop command is appended to ./hdfs_err.log.
  const std::string command = paddle::string::format_string(
      "2>>./hdfs_err.log %s fs -Dfs.default.name=%s -Dhadoop.job.ugi=%s "
      "-Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=300000",
      hadoop_bin.c_str(), uri.c_str(), ugi.c_str());
  paddle::framework::hdfs_set_command(command);
  return 0;
}
// open file in 'w' or 'r'
std
::
shared_ptr
<
FsReadChannel
>
AfsClient
::
open_r
(
const
FsChannelConfig
&
config
,
uint32_t
buffer_size
,
int
*
err_no
)
{
std
::
shared_ptr
<
FsReadChannel
>
channel
=
std
::
make_shared
<
FsReadChannel
>
(
buffer_size
);
std
::
shared_ptr
<
FILE
>
fp
=
paddle
::
framework
::
fs_open_read
(
config
.
path
,
err_no
,
config
.
deconverter
);
channel
->
open
(
fp
,
config
);
return
channel
;
}
std
::
shared_ptr
<
FsWriteChannel
>
AfsClient
::
open_w
(
const
FsChannelConfig
&
config
,
uint32_t
buffer_size
,
int
*
err_no
)
{
std
::
shared_ptr
<
FsWriteChannel
>
channel
=
std
::
make_shared
<
FsWriteChannel
>
(
buffer_size
);
std
::
shared_ptr
<
FILE
>
fp
=
paddle
::
framework
::
fs_open_write
(
config
.
path
,
err_no
,
config
.
converter
);
channel
->
open
(
fp
,
config
);
return
channel
;
}
// Removes the file(s) at `path`; the path may be a pattern such as
// 'part-000-*'. Delegates to paddle::framework::fs_remove.
void AfsClient::remove(const std::string& path) {
  paddle::framework::fs_remove(path);
}
// Removes the directory `dir`. Uses the same fs_remove helper as remove().
void AfsClient::remove_dir(const std::string& dir) {
  paddle::framework::fs_remove(dir);
}
// Lists the files under `path` (which may be a directory with a pattern).
// Delegates to paddle::framework::fs_list.
std::vector<std::string> AfsClient::list(const std::string& path) {
  std::vector<std::string> entries = paddle::framework::fs_list(path);
  return entries;
}
// Reports whether `dir` exists, as answered by paddle::framework::fs_exists.
bool AfsClient::exist(const std::string& dir) {
  const bool found = paddle::framework::fs_exists(dir);
  return found;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/afs_warpper.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/string/string_helper.h"
namespace
paddle
{
namespace
distributed
{
// Pair of converter / deconverter strings.
// NOTE(review): presumably these are the commands applied to data when
// writing / reading, mirroring FsChannelConfig - confirm with the users of
// this struct.
struct FsDataConverter {
  std::string converter;
  std::string deconverter;
};
// Describes one file to open through AfsClient.
struct FsChannelConfig {
  std::string path;         // path of file
  std::string converter;    // data converter (used when opening for write)
  std::string deconverter;  // data deconverter (used when opening for read)
};

// Line-oriented reader on top of a shared FILE handle.
class FsReadChannel {
 public:
  FsReadChannel() : _buffer_size(0) {}
  explicit FsReadChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
  virtual ~FsReadChannel() {}
  FsReadChannel(FsReadChannel&&) = delete;
  FsReadChannel(const FsReadChannel&) = delete;

  // Attaches the channel to `fp`. `config` is accepted for symmetry with the
  // caller but is not used by the channel. Always returns 0.
  int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
    _file = fp;
    return 0;
  }

  // Drops the channel's reference to the file. Always returns 0.
  inline int close() {
    _file.reset();
    return 0;
  }

  // Reads one line (without the trailing '\n') into `line_data`.
  // Returns 0 on success - including an empty line and a last line that is
  // not newline-terminated - and a non-zero value (uint32_t(-1)) when
  // nothing could be read: end of file, read error, or no open file.
  inline uint32_t read_line(std::string& line_data) {  // NOLINT
    line_data.clear();

    FILE* stream = _file.get();
    if (stream == nullptr) {
      // Not opened, already closed, or opened with a null handle: report
      // "no data" instead of handing a null FILE* to fread.
      return static_cast<uint32_t>(-1);
    }

    char buffer = '\0';
    size_t read_count = 0;
    while (1 == fread(&buffer, 1, 1, stream) && buffer != '\n') {
      ++read_count;
      line_data.push_back(buffer);
    }
    // Nothing read and no newline seen: end of input (or error).
    if (read_count == 0 && buffer != '\n') {
      return static_cast<uint32_t>(-1);
    }
    return 0;
  }

 private:
  uint32_t _buffer_size;
  FsChannelConfig _config;
  std::shared_ptr<FILE> _file;
};
// Line-oriented writer on top of a shared FILE handle.
class FsWriteChannel {
 public:
  FsWriteChannel() : _buffer_size(0) {}
  explicit FsWriteChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
  virtual ~FsWriteChannel() {}
  FsWriteChannel(FsWriteChannel&&) = delete;
  FsWriteChannel(const FsWriteChannel&) = delete;

  // Attaches the channel to `fp`. `config` is accepted for symmetry with the
  // caller but is not used by the channel. Always returns 0.
  int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
    _file = fp;

    // the buffer has set in fs.cc
    // if (_buffer_size != 0) {
    //    _buffer = std::shared_ptr<char>(new char[_buffer_size]);
    //    CHECK(0 == setvbuf(&*_file, _buffer.get(), _IOFBF, _buffer_size));
    //}
    return 0;
  }

  // Intentionally a no-op in this implementation.
  inline void flush() { return; }

  // Drops the channel's reference to the file. Always returns 0.
  inline int close() {
    flush();
    _file.reset();
    return 0;
  }

  // Writes `size` bytes from `data` followed by '\n'.
  // Returns 0 on success and a non-zero value (uint32_t(-1)) when either
  // write is short or when no file is open.
  inline uint32_t write_line(const char* data, uint32_t size) {
    FILE* stream = _file.get();
    if (stream == nullptr) {
      // Not opened, already closed, or opened with a null handle: fail
      // instead of handing a null FILE* to fwrite_unlocked.
      return static_cast<uint32_t>(-1);
    }

    size_t write_count = fwrite_unlocked(data, 1, size, stream);
    if (write_count != size) {
      return static_cast<uint32_t>(-1);
    }
    write_count = fwrite_unlocked("\n", 1, 1, stream);
    if (write_count != 1) {
      return static_cast<uint32_t>(-1);
    }
    return 0;
  }

  // Convenience overload for std::string payloads.
  inline uint32_t write_line(const std::string& data) {
    return write_line(data.c_str(), data.size());
  }

 private:
  uint32_t _buffer_size;
  FsChannelConfig _config;
  std::shared_ptr<FILE> _file;
  std::shared_ptr<char> _buffer;
};
// Thin file-system client. The implementation is currently backed by the
// hdfs command-line helpers in paddle::framework.
class AfsClient {
 public:
  AfsClient() {}
  virtual ~AfsClient() {}
  AfsClient(AfsClient&&) = delete;
  AfsClient(const AfsClient&) = delete;

  // Initializes from a FsClientParameter message (forwards its fields to the
  // overload taking user and passwd).
  int initialize(const FsClientParameter& fs_client_param);
  // Joins user and passwd as "user,passwd" and forwards to the ugi overload.
  int initialize(const std::string& hadoop_bin, const std::string& uri,
                 const std::string& user, const std::string& passwd,
                 int buffer_size_param = (1L << 25));
  // Sets the hdfs buffer size (at least 1 << 25 bytes) and the hadoop
  // command line. Returns 0.
  int initialize(const std::string& hadoop_bin, const std::string& uri,
                 const std::string& ugi, int buffer_size_param = (1L << 25));

  // open file in 'w' or 'r'
  // `err_no` is passed through to the underlying fs_open_read /
  // fs_open_write call.
  std::shared_ptr<FsReadChannel> open_r(const FsChannelConfig& config,
                                        uint32_t buffer_size = 0,
                                        int* err_no = nullptr);
  std::shared_ptr<FsWriteChannel> open_w(const FsChannelConfig& config,
                                         uint32_t buffer_size = 0,
                                         int* err_no = nullptr);

  // remove file in path, path maybe a reg, such as 'part-000-*'
  void remove(const std::string& path);
  // Removes a directory (implemented with the same helper as remove()).
  void remove_dir(const std::string& dir);

  // list files in path, path maybe a dir with reg
  std::vector<std::string> list(const std::string& path);

  // exist or not
  bool exist(const std::string& dir);
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/chunk_allocator.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
namespace
paddle
{
namespace
distributed
{
// Fast allocation and deallocation of objects by allocating them in chunks.
//
// Memory is requested from the system one chunk (`chunk_size` nodes) at a
// time. Free nodes are kept in an intrusive singly-linked list, so acquire()
// and release() are O(1). Chunks are only returned to the system when the
// allocator is destroyed; objects still alive at that point are NOT
// destructed. The class performs no locking.
template <class T>
class ChunkAllocator {
 public:
  // `chunk_size`: number of objects per chunk.
  explicit ChunkAllocator(size_t chunk_size = 64) {
    // A node must be exactly as large as the bigger of a pointer and a T,
    // otherwise the pointer arithmetic over Chunk::nodes would be wrong.
    CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T)));
    _chunk_size = chunk_size;
    _chunks = nullptr;
    _free_nodes = nullptr;
    _counter = 0;
  }
  // The allocator owns raw chunks; copying it would free them twice.
  ChunkAllocator(const ChunkAllocator&) = delete;
  ChunkAllocator& operator=(const ChunkAllocator&) = delete;
  ~ChunkAllocator() {
    while (_chunks != nullptr) {
      Chunk* x = _chunks;
      _chunks = _chunks->next;
      free(x);
    }
  }

  // Constructs a T from `args` in a free node and returns it. Allocates a
  // new chunk when the free list is empty.
  template <class... ARGS>
  T* acquire(ARGS&&... args) {
    if (_free_nodes == nullptr) {
      create_new_chunk();
    }
    // Still empty means the allocator was built with chunk_size == 0.
    CHECK(_free_nodes != nullptr)
        << "ChunkAllocator has no free node (chunk_size=" << _chunk_size
        << ")";

    // Pop the node BEFORE constructing: placement-new overwrites `next`.
    T* x = reinterpret_cast<T*>(_free_nodes);
    _free_nodes = _free_nodes->next;
    new (x) T(std::forward<ARGS>(args)...);
    _counter++;
    return x;
  }

  // Destroys `x` (which must come from acquire() of this allocator) and
  // puts its node back on the free list.
  void release(T* x) {
    x->~T();
    Node* node = reinterpret_cast<Node*>(x);
    node->next = _free_nodes;
    _free_nodes = node;
    _counter--;
  }

  // Number of objects currently acquired and not yet released.
  size_t size() const { return _counter; }

 private:
  // Storage for one T; while the node is free the same bytes hold the link
  // to the next free node.
  struct alignas(T) Node {
    union {
      Node* next;
      char data[sizeof(T)];
    };
  };
  // Header of one chunk followed by its nodes (flexible array member).
  struct Chunk {
    Chunk* next;
    Node nodes[];
  };

  size_t _chunk_size;  // how many elements in one chunk
  Chunk* _chunks;      // a list
  Node* _free_nodes;   // a list
  size_t _counter;     // how many elements are acquired

  // Allocates one more chunk and pushes all of its nodes on the free list.
  void create_new_chunk() {
    void* raw = nullptr;
    const int rc =
        posix_memalign(&raw, std::max<size_t>(sizeof(void*), alignof(Chunk)),
                       sizeof(Chunk) + sizeof(Node) * _chunk_size);
    // posix_memalign reports failure through its return value and leaves
    // the output pointer unspecified; never touch it in that case.
    CHECK(rc == 0 && raw != nullptr)
        << "posix_memalign failed with error code " << rc;

    Chunk* chunk = static_cast<Chunk*>(raw);
    chunk->next = _chunks;
    _chunks = chunk;

    for (size_t i = 0; i < _chunk_size; i++) {
      Node* node = &chunk->nodes[i];
      node->next = _free_nodes;
      _free_nodes = node;
    }
  }
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/cost_timer.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_map>
#include "butil/time.h"
#include "bvar/latency_recorder.h"
#include "glog/logging.h"
namespace
paddle
{
namespace
distributed
{
// One entry of the cost profiler: holds the latency recorder that timings for
// a label are written to.
struct CostProfilerNode {
  std::shared_ptr<bvar::LatencyRecorder> recorder;
};
class
CostProfiler
{
public:
~
CostProfiler
()
{}
static
CostProfiler
&
instance
()
{
static
CostProfiler
profiler
;
return
profiler
;
}
void
register_profiler
(
const
std
::
string
&
label
)
{
if
(
_cost_profiler_map
.
find
(
label
)
!=
_cost_profiler_map
.
end
())
{
return
;
}
auto
profiler_node
=
std
::
make_shared
<
CostProfilerNode
>
();
profiler_node
->
recorder
.
reset
(
new
bvar
::
LatencyRecorder
(
"cost_profiler"
,
label
));
_cost_profiler_map
[
label
]
=
profiler_node
;
}
CostProfilerNode
*
profiler
(
const
std
::
string
&
label
)
{
auto
itr
=
_cost_profiler_map
.
find
(
label
);
if
(
itr
!=
_cost_profiler_map
.
end
())
{
return
itr
->
second
.
get
();
}
return
NULL
;
}
private:
CostProfiler
()
{}
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
CostProfilerNode
>>
_cost_profiler_map
;
};
// Scoped timer: records the start time on construction and reports the elapsed
// milliseconds on destruction, either to a profiler node's recorder or to the
// log.
class CostTimer {
 public:
  // Looks `label` up in the CostProfiler registry.
  explicit CostTimer(const std::string& label) : _label(label) {
    _profiler_node = CostProfiler::instance().profiler(label);
    // If the label is not registered in the profiler, the elapsed time is
    // written to the log instead.
    _is_print_cost = (_profiler_node == NULL);
    _start_time_ms = butil::gettimeofday_ms();
  }

  // Reports to the given node; never logs.
  explicit CostTimer(CostProfilerNode& profiler_node) {  // NOLINT
    _is_print_cost = false;
    _profiler_node = &profiler_node;
    _start_time_ms = butil::gettimeofday_ms();
  }

  ~CostTimer() {
    if (!_is_print_cost) {
      *(_profiler_node->recorder) << butil::gettimeofday_ms() - _start_time_ms;
      return;
    }
    VLOG(3) << "CostTimer label:" << _label
            << ", cost:" << butil::gettimeofday_ms() - _start_time_ms << "ms";
  }

 private:
  std::string _label;
  bool _is_print_cost;
  uint64_t _start_time_ms;
  CostProfilerNode* _profiler_node;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/local_random.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <time.h>
#include <atomic>
#include <random>
namespace
paddle
{
namespace
distributed
{
// Returns the CLOCK_REALTIME reading in seconds, with the nanosecond part as
// the fraction.
inline double current_realtime() {
  struct timespec now;
  clock_gettime(CLOCK_REALTIME, &now);
  const double fraction = now.tv_nsec * 1e-9;
  return now.tv_sec + fraction;
}
// Returns this thread's random engine. The engine is created on first use in
// each thread and seeded from three consecutive values of a counter shared by
// all threads plus the current time in milliseconds.
inline std::default_random_engine& local_random_engine() {
  struct engine_wrapper_t {
    std::default_random_engine engine;
    engine_wrapper_t() {
      static std::atomic<unsigned long> seed_counter(0);  // NOLINT
      const unsigned long first = seed_counter++;         // NOLINT
      const unsigned long second = seed_counter++;        // NOLINT
      const unsigned long third = seed_counter++;         // NOLINT
      const unsigned long millis =
          (unsigned long)(current_realtime() * 1000);  // NOLINT
      std::seed_seq sseq = {first, second, third, millis};
      engine.seed(sseq);
    }
  };
  thread_local engine_wrapper_t per_thread;
  return per_thread.engine;
}
// Returns this thread's default-constructed uniform real distribution of T;
// the assertion checks that its bounds are 0 and 1.
template <class T = double>
std::uniform_real_distribution<T>& local_uniform_real_distribution() {
  thread_local std::uniform_real_distribution<T> unit_distr;
  assert(unit_distr.a() == 0.0 && unit_distr.b() == 1.0);
  return unit_distr;
}
// Draws one value from this thread's unit distribution using this thread's
// engine.
template <class T = double>
T uniform_real() {
  std::uniform_real_distribution<T>& distr =
      local_uniform_real_distribution<T>();
  return distr(local_random_engine());
}
// Draws a value by scaling a unit draw into the range starting at `a` with
// width `b - a`. Returns `a` without drawing when both bounds are equal.
template <class T = double>
T uniform_real(T a, T b) {
  if (a != b) {
    return (T)(a + uniform_real<T>() * (b - a));  // NOLINT
  }
  return a;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/registerer.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>
namespace
paddle
{
namespace
distributed
{
// Minimal type-erased value holder. Copies are deep: the held value is cloned.
class Any {
 public:
  // Creates an empty Any.
  Any() : content_(nullptr) {}

  // Stores a copy of `value`. Intentionally implicit so values convert to Any.
  template <typename ValueType>
  Any(const ValueType &value)  // NOLINT
      : content_(new Holder<ValueType>(value)) {}

  Any(const Any &other)
      : content_(other.content_ ? other.content_->clone() : nullptr) {}

  // Deep-copies `other`. Without this operator the implicitly generated
  // assignment would copy the raw pointer, leaking the old value and deleting
  // the shared one twice.
  Any &operator=(const Any &other) {
    if (this != &other) {
      // Clone first so *this is left untouched if cloning throws.
      PlaceHolder *cloned = other.content_ ? other.content_->clone() : nullptr;
      delete content_;
      content_ = cloned;
    }
    return *this;
  }

  ~Any() { delete content_; }

  // Returns a pointer to the held value, or a null pointer when empty.
  // The cast is unchecked: ValueType must be exactly the type that was stored.
  template <typename ValueType>
  ValueType *any_cast() {
    return content_ ? &static_cast<Holder<ValueType> *>(content_)->held_
                    : nullptr;
  }

 private:
  // Type-independent base of the stored value.
  class PlaceHolder {
   public:
    virtual ~PlaceHolder() {}
    virtual PlaceHolder *clone() const = 0;
  };

  // Concrete storage for a value of type ValueType.
  template <typename ValueType>
  class Holder : public PlaceHolder {
   public:
    explicit Holder(const ValueType &value) : held_(value) {}
    virtual PlaceHolder *clone() const { return new Holder(held_); }

    ValueType held_;
  };

  PlaceHolder *content_;
};
// Base class of the per-class factories stored in the factory map. The default
// NewInstance() yields an empty Any.
class ObjectFactory {
 public:
  ObjectFactory() {}
  virtual ~ObjectFactory() {}

  virtual Any NewInstance() {
    Any empty;
    return empty;
  }
};
typedef
std
::
map
<
std
::
string
,
ObjectFactory
*>
FactoryMap
;
typedef
std
::
map
<
std
::
string
,
FactoryMap
>
PsCoreClassMap
;
#ifdef __cplusplus
extern "C" {
#endif
// Returns the factory registry. The map is heap-allocated on first call and
// never deleted.
inline PsCoreClassMap &global_factory_map() {
  static PsCoreClassMap *registry = new PsCoreClassMap();
  return *registry;
}
#ifdef __cplusplus
}
#endif
// Forwards to global_factory_map(), which is declared inside extern "C".
inline PsCoreClassMap &global_factory_map_cpp() {
  PsCoreClassMap &registry = global_factory_map();
  return registry;
}
// typedef pa::Any Any;
// typedef ::FactoryMap FactoryMap;
// Declares `<base_class>Registerer` with a static CreateInstanceByName(name).
// It looks up the factory registered for `name` under `base_class` and returns
// the instance the factory creates. Returns NULL (after logging an error) when
// the base class or the name is not registered, and NULL when the factory
// produced an empty Any.
#define REGISTER_PSCORE_REGISTERER(base_class)                           \
  class base_class##Registerer {                                         \
   public:                                                               \
    static base_class *CreateInstanceByName(const ::std::string &name) { \
      PsCoreClassMap &classes = global_factory_map_cpp();                \
      PsCoreClassMap::iterator class_iter = classes.find(#base_class);   \
      if (class_iter == classes.end()) {                                 \
        LOG(ERROR) << "Can't Find BaseClass For CreateClass with:"       \
                   << #base_class;                                       \
        return NULL;                                                     \
      }                                                                  \
      FactoryMap &map = class_iter->second;                              \
      FactoryMap::iterator iter = map.find(name);                        \
      if (iter == map.end()) {                                           \
        LOG(ERROR) << "Can't Find Class For Create with:" << name;       \
        return NULL;                                                     \
      }                                                                  \
      Any object = iter->second->NewInstance();                          \
      base_class **held = object.any_cast<base_class *>();               \
      if (held == NULL) {                                                \
        return NULL;                                                     \
      }                                                                  \
      return *held;                                                      \
    }                                                                    \
  };
// Defines a factory for class `name` and a function, run at load time through
// __attribute__((constructor)), that registers the factory under base class
// `clazz` unless that name is already registered there.
// The new object is stored as `clazz *`, the pointer type that
// CreateInstanceByName() reads back from the Any; `name` must therefore be
// convertible to `clazz`.
#define REGISTER_PSCORE_CLASS(clazz, name)                  \
  class ObjectFactory##name : public ObjectFactory {        \
   public:                                                  \
    Any NewInstance() {                                     \
      return Any(static_cast<clazz *>(new name()));         \
    }                                                       \
  };                                                        \
  void register_factory_##name() {                          \
    FactoryMap &map = global_factory_map_cpp()[#clazz];     \
    if (map.find(#name) == map.end()) {                     \
      map[#name] = new ObjectFactory##name();               \
    }                                                       \
  }                                                         \
  void register_factory_##name() __attribute__((constructor));
// Expands to a call of `<base_class>Registerer::CreateInstanceByName(name)`.
// Note that the expansion already ends with a semicolon.
#define CREATE_PSCORE_CLASS(base_class, name) \
  base_class##Registerer::CreateInstanceByName(name);
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/topk_calculator.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <queue>
#include <unordered_map>
namespace
paddle
{
namespace
distributed
{
// Keeps, for every shard id in [0, shard_num), the largest values pushed so
// far, limited to `k / shard_num` values per shard (at least one).
class TopkCalculator {
 public:
  // shard_num: number of shards; k: total number of values to keep.
  // A non-positive shard_num creates a calculator without shards instead of
  // dividing by zero.
  TopkCalculator(int shard_num, size_t k)
      : _shard_num(shard_num), _total_max_size(k) {
    _shard_max_size = shard_num > 0 ? _total_max_size / shard_num : 0;
    if (_shard_max_size < 1) {
      _shard_max_size = 1;
    }
    for (int i = 0; i < shard_num; ++i) {
      _mpq.emplace(i, MinHeap());
    }
  }

  ~TopkCalculator() {}

  // Offers `value` to shard `shard_id`. Returns false when the shard does not
  // exist, true otherwise (whether or not the value was kept).
  bool push(int shard_id, double value) {
    auto found = _mpq.find(shard_id);
    if (found == _mpq.end()) {
      return false;
    }
    MinHeap &pq = found->second;
    if (pq.size() < _shard_max_size) {
      pq.push(value);
    } else if (pq.top() < value) {
      // The heap is full: replace its smallest value.
      pq.pop();
      pq.push(value);
    }
    return true;
  }

  // TODO: run one more heap sort to merge the results of all shards.
  // Returns the sum of the smallest kept value of every non-empty shard,
  // divided by the number of shards and truncated to int. Returns 0 when
  // there are no shards.
  int top() {
    if (_shard_num <= 0) {
      return 0;
    }
    double total = 0;
    for (const auto &item : _mpq) {
      const MinHeap &pq = item.second;
      if (!pq.empty()) {
        total += pq.top();
      }
    }
    return total / _shard_num;
  }

 private:
  // Min-heap: top() is the smallest value currently kept.
  using MinHeap =
      std::priority_queue<double, std::vector<double>, std::greater<double>>;

  std::unordered_map<int, MinHeap> _mpq;
  int _shard_num;
  size_t _total_max_size;
  size_t _shard_max_size;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/common/utils.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/time.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace
paddle
{
namespace
distributed
{
// Builds a CPU BLAS wrapper for element type T from a default-constructed
// CPU context.
// NOTE(review): the context is a local that is destroyed on return; if the
// returned BlasT keeps a reference to it, that reference dangles - confirm
// against phi::funcs::BlasT.
template <typename T>
inline phi::funcs::BlasT<phi::CPUContext, T> GetBlas() {
  phi::CPUContext local_ctx;
  return phi::funcs::GetBlas<phi::CPUContext, T>(local_ctx);
}
// Element-wise square root: z[i] = sqrt(x[i]) for the first n elements.
template <typename T>
inline void SQRT(int n, const T* x, T* z) {
  int idx = 0;
  while (idx < n) {
    z[idx] = sqrt(x[idx]);
    ++idx;
  }
}
// Adds the scalar y to each of the first n elements: z[i] = x[i] + y.
template <typename T>
inline void ADD(int n, const T* x, const T y, T* z) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] + y;
    ++idx;
  }
}
// Divides the scalar x by each of the first n elements: z[i] = x / y[i].
template <typename T>
inline void DIV(int n, const T x, const T* y, T* z) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x / y[idx];
    ++idx;
  }
}
// Element-wise product of the first n elements: z[i] = x[i] * y[i].
template <typename T>
inline void ELE_MUL(int n, const T* x, const T* y, T* z) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] * y[idx];
    ++idx;
  }
}
// Returns true when `str` begins with `substr` (an empty `substr` always
// matches). Only the leading characters are compared, so a mismatch does not
// scan the rest of `str`.
static bool StartWith(const std::string& str, const std::string& substr) {
  return str.compare(0, substr.size(), substr) == 0;
}
// Returns true when `str` ends with `substr` (an empty `substr` always
// matches). The length check comes first: subtracting a longer `substr` from
// `str.length()` would wrap around in unsigned arithmetic.
static bool EndWith(const std::string& str, const std::string& substr) {
  if (substr.size() > str.size()) {
    return false;
  }
  return str.compare(str.size() - substr.size(), substr.size(), substr) == 0;
}
// Splits v_size items into b_size consecutive buckets and returns the
// b_size + 1 boundaries: element i is the start offset of bucket i and the
// last element is the end offset of the last bucket. The first
// `v_size % b_size` buckets hold one item more than the others.
inline std::vector<int> bucket(const int v_size, const int b_size) {
  const int extra = v_size % b_size;
  std::vector<int> offsets(b_size, v_size / b_size);
  int start = 0;
  for (int idx = 0; idx < b_size; ++idx) {
    // offsets[idx] still holds the base width here; turn it into the start.
    const int width = offsets[idx] + (idx < extra ? 1 : 0);
    offsets[idx] = start;
    start += width;
  }
  offsets.push_back(start);
  return offsets;
}
// Streams every element of `vec` followed by a single space and returns the
// resulting text (so a non-empty result ends with a space).
template <typename T>
std::string to_string(const std::vector<T>& vec) {
  std::stringstream buffer;
  for (auto it = vec.begin(); it != vec.end(); ++it) {
    buffer << *it << " ";
  }
  return buffer.str();
}
// Returns the gettimeofday() reading in microseconds.
inline double GetCurrentUS() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const double seconds_in_us = 1e+6 * now.tv_sec;
  return seconds_in_us + now.tv_usec;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/dataset_utils/README.md
0 → 100644
View file @
d2d32668
# 目录说明
> 干掉原来的 index_dataset 目录
dataset 抽样工具类
用户自定义数据处理so
流式dataserver相关类
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
0 → 100644
View file @
d2d32668
# Protobuf targets used by the fleet executor.
proto_library(fleet_executor_desc_proto SRCS fleet_executor_desc.proto)
if(WITH_PYTHON)
  py_proto_compile(fleet_executor_desc_py_proto SRCS fleet_executor_desc.proto)
endif()
proto_library(interceptor_message_proto SRCS interceptor_message.proto)

# Extra link dependencies, depending on which brpc flavour is enabled.
if(WITH_ARM_BRPC)
  set(BRPC_DEPS arm_brpc snappy gflags glog)
elseif(WITH_DISTRIBUTE AND WITH_PSCORE)
  set(BRPC_DEPS
      brpc
      ssl
      crypto
      protobuf
      zlib
      leveldb
      snappy
      gflags
      glog)
else()
  set(BRPC_DEPS "")
endif()

cc_library(
  task_loop_thread_pool
  SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc
  DEPS enforce glog)

cc_library(
  fleet_executor
  SRCS fleet_executor.cc
       carrier.cc
       task_node.cc
       runtime_graph.cc
       dist_model.cc
       interceptor.cc
       compute_interceptor.cc
       amplifier_interceptor.cc
       source_interceptor.cc
       sink_interceptor.cc
       message_service.cc
       message_bus.cc
       dist_model_tensor_wrapper.cc
  DEPS proto_desc
       fleet_executor_desc_proto
       interceptor_message_proto
       task_loop_thread_pool
       collective_helper
       op_registry
       executor_gc_helper
       gflags
       glog
       ${BRPC_DEPS})

if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS
      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
  )
  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
    set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
  endif()
  # Every file listed here is built with the same extra warning flags.
  set_source_files_properties(
    interceptor.cc
    compute_interceptor.cc
    amplifier_interceptor.cc
    source_interceptor.cc
    sink_interceptor.cc
    message_bus.h
    message_bus.cc
    fleet_executor.cc
    carrier.cc
    message_service.h
    message_service.cc
    PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

  add_subdirectory(test)
endif()
paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/operator.h"
namespace
paddle
{
namespace
distributed
{
// Builds the base ComputeInterceptor and copies the four step settings from
// the task node.
AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id,
                                           TaskNode* node)
    : ComputeInterceptor(interceptor_id, node),
      run_per_steps_(node->run_per_steps()),
      run_at_offset_(node->run_at_offset()),
      reply_up_per_steps_(node->reply_up_per_steps()),
      send_down_per_steps_(node->send_down_per_steps()) {}
// Runs the ops only on steps whose remainder modulo run_per_steps_ equals
// run_at_offset_.
void AmplifierInterceptor::RunOps() {
  // run_per_steps_, run_at_offset_
  // 4, 0 --> run at step 0, 4, 8, 12
  // 4, 3 --> run at step 3, 7, 11, 15
  const bool is_run_step = (step_ % run_per_steps_) == run_at_offset_;
  if (!is_run_step) {
    return;
  }
  ComputeInterceptor::RunOps();
}
// Notifies downstream only on steps that are a multiple of
// send_down_per_steps_.
void AmplifierInterceptor::SendDataReadyToDownStream() {
  // run multi times, send ready one times to downstream, that is
  // input multi times, output one times
  if (step_ % send_down_per_steps_ != 0) {
    return;
  }
  ComputeInterceptor::SendDataReadyToDownStream();
}
// Replies to upstream only on steps that are a multiple of
// reply_up_per_steps_.
void AmplifierInterceptor::ReplyCompletedToUpStream() {
  // run multi times, reply one times to upstream, that is
  // input one times, output multi times
  if (step_ % reply_up_per_steps_ != 0) {
    return;
  }
  ComputeInterceptor::ReplyCompletedToUpStream();
}
// Registers AmplifierInterceptor under the interceptor type name "Amplifier".
REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor);
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
namespace
paddle
{
namespace
distributed
{
// ComputeInterceptor that runs its ops, notifies downstream and replies to
// upstream only on selected steps.
class AmplifierInterceptor : public ComputeInterceptor {
 public:
  AmplifierInterceptor(int64_t interceptor_id, TaskNode* node);

 private:
  void RunOps() override;
  void SendDataReadyToDownStream() override;
  void ReplyCompletedToUpStream() override;

  // Ops run when step % run_per_steps_ == run_at_offset_.
  int64_t run_per_steps_ = 1;
  int64_t run_at_offset_ = 0;

  // one input produces multi times output
  int64_t reply_up_per_steps_ = 1;
  // one output need multi times input
  int64_t send_down_per_steps_ = 1;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/fleet_executor/carrier.cc
0 → 100644
View file @
d2d32668
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include <algorithm>
#include "paddle/fluid/distributed/fleet_executor/global.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
namespace
paddle
{
namespace
distributed
{
// Interceptor types referenced from this translation unit.
USE_INTERCEPTOR(Source);
USE_INTERCEPTOR(Compute);
USE_INTERCEPTOR(Amplifier);
USE_INTERCEPTOR(Sink);
// Minimal initialisation: stores the rank and the interceptor-id -> rank map
// and starts a single-threaded task loop pool. It creates no scopes and no
// interceptors, and it does not set is_init_.
void Carrier::Init(
    int64_t rank,
    const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank) {
  rank_ = rank;
  interceptor_id_to_rank_ = interceptor_id_to_rank;

  // TODO(fleet_exe dev): thread pool
  thread_num_ = 1;
  thread_pool_.SetThreadNum(thread_num_);
  thread_pool_.Start();
}
// Full initialisation. Stores the arguments, creates one minibatch scope under
// `scope` and `num_micro_batches` microbatch scopes under it, creates the
// program's variables for every microbatch, starts the task loop pool, creates
// the interceptors and finally marks the carrier as initialised.
// `scope` must not be null.
void Carrier::Init(
    int64_t rank,
    const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
    const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node,
    const framework::ProgramDesc& program,
    framework::Scope* scope,
    int64_t num_micro_batches,
    const platform::Place& place,
    const std::vector<std::string>& inference_root_scope_vars) {
  rank_ = rank;
  interceptor_id_to_rank_ = interceptor_id_to_rank;
  interceptor_id_to_node_ = interceptor_id_to_node;
  place_ = place;
  root_scope_ = scope;
  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);

  PADDLE_ENFORCE_NOT_NULL(
      root_scope_,
      platform::errors::InvalidArgument("root_scope can not be nullptr"));
  minibatch_scope_ = &root_scope_->NewScope();
  microbatch_scopes_.resize(num_micro_batches);
  for (int i = 0; i < num_micro_batches; ++i) {
    // The scope must exist before CopyParameters creates variables in it.
    microbatch_scopes_[i] = &minibatch_scope_->NewScope();
    CopyParameters(i, program, inference_root_scope_vars);
  }

  // TODO(fleet_exe dev): thread pool
  thread_num_ = 1;
  thread_pool_.SetThreadNum(thread_num_);
  thread_pool_.Start();

  // Interceptors are bound to task loops, so the pool is started first.
  CreateInterceptors();
  is_init_ = true;
}
// Drops all child scopes of the root scope, if a root scope was set.
void Carrier::Release() {
  if (!root_scope_) {
    return;
  }
  root_scope_->DropKids();
}
// Only logs; the destructor body releases nothing explicitly.
Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; }
// Creates the variables of the program's block 0 for one microbatch.
// Persistable variables, and variables named in `inference_root_scope_vars`,
// are created in the root scope while handling microbatch 0. Every other
// non-persistable variable is created in the scope of `microbatch_id`.
void Carrier::CopyParameters(
    int microbatch_id,
    const framework::ProgramDesc& program,
    const std::vector<std::string>& inference_root_scope_vars) {
  auto& global_block = program.Block(0);

  // Used as a set: only the presence of a name matters.
  std::map<std::string, int> forced_root_vars;
  for (const auto& forced_name : inference_root_scope_vars) {
    forced_root_vars.insert({forced_name, 1});
  }

  const bool is_first_microbatch = (microbatch_id == 0);
  for (auto& var : global_block.AllVars()) {
    std::string var_name = var->Name();
    const bool force_root = forced_root_vars.count(var_name) > 0;
    if (force_root) {
      VLOG(4) << var_name << " will be forced to be created in the root scope.";
    }
    const bool persistable = var->Persistable();
    if ((persistable || force_root) && is_first_microbatch) {
      auto* ptr = root_scope_->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
      VLOG(5) << "Create persistable var: " << var->Name()
              << ", which pointer is " << ptr;
    } else if (!persistable) {
      auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
      VLOG(5) << "Create variable " << var->Name() << " for microbatch "
              << microbatch_id << ", which pointer is " << ptr << ".";
      InitializeVariable(ptr, var->GetType());
    }
  }
}
bool
Carrier
::
EnqueueInterceptorMessage
(
const
InterceptorMessage
&
interceptor_message
)
{
PADDLE_ENFORCE_EQ
(
interceptor_message
.
ctrl_message
(),
false
,
platform
::
errors
::
Fatal
(
"Control message should be only send inter rank using message bus."
));
int64_t
dst_id
=
interceptor_message
.
dst_id
();
Interceptor
*
dst_interceptor
=
GetInterceptor
(
dst_id
);
dst_interceptor
->
EnqueueRemoteInterceptorMessage
(
interceptor_message
);
return
true
;
}
// Returns the interceptor registered under `interceptor_id`; the enforce
// fails when no such interceptor exists.
Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
  auto found = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_NE(
      found,
      interceptor_idx_to_interceptor_.end(),
      platform::errors::InvalidArgument(
          "Cannot find interceptor instance for interceptor "
          "id %lld. Wrong dst? Call before init?",
          interceptor_id));
  Interceptor* interceptor = found->second.get();
  return interceptor;
}
// Blocks the caller on the condition variable until it is notified.
// NOTE(review): the wait has no predicate, so a notification sent before this
// call is missed and a spurious wakeup returns early - confirm that callers
// tolerate both.
void Carrier::Wait() {
  std::unique_lock<std::mutex> running_lock(running_mutex_);
  cond_var_.wait(running_lock);
}
// Wakes every thread currently blocked in Wait().
void Carrier::WakeUp() {
  // probably double notify, but ok for ut
  cond_var_.notify_all();
}
void
Carrier
::
Start
()
{
PADDLE_ENFORCE_EQ
(
is_init_
,
true
,
platform
::
errors
::
PreconditionNotMet
(
"Using carrier before initialized."
));
for
(
int64_t
id
:
source_interceptor_ids_
)
{
VLOG
(
3
)
<<
"Carrier Start is sending start to source interceptor "
<<
id
<<
"."
;
InterceptorMessage
start_msg
;
// source node data_is_ready is send by carrier, so set src_id=-1
start_msg
.
set_src_id
(
-
1
);
start_msg
.
set_dst_id
(
id
);
start_msg
.
set_message_type
(
DATA_IS_READY
);
Send
(
start_msg
);
}
// TODO(wangxi): async step
Wait
();
dev_ctx_
->
Wait
();
for
(
auto
*
micro_scope
:
microbatch_scopes_
)
{
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
micro_scope
->
DropKids
();
}
}
// Returns whether the full Init() overload has completed.
bool Carrier::IsInit() const { return is_init_; }
int64_t
Carrier
::
GetRank
(
int64_t
interceptor_id
)
const
{
PADDLE_ENFORCE_NE
(
interceptor_id_to_rank_
.
find
(
interceptor_id
),
interceptor_id_to_rank_
.
end
(),
platform
::
errors
::
NotFound
(
"Cannot find rank for interceptor id %lld."
,
interceptor_id
));
return
interceptor_id_to_rank_
.
at
(
interceptor_id
);
}
// Routes a message: delivered locally when source and destination are on the
// same rank, otherwise handed to the global MessageBus. The source must belong
// to this carrier's rank.
bool Carrier::Send(const InterceptorMessage& msg) {
  int64_t src_id = msg.src_id();
  // TODO(liyurui): compatible solution, will be removed completely in the
  // future
  const bool src_unknown =
      interceptor_id_to_rank_.find(src_id) == interceptor_id_to_rank_.end();
  if (src_unknown && src_id == SOURCE_ID) {
    // Treat the message as if it came from its own destination.
    src_id = msg.dst_id();
  }
  int64_t dst_id = msg.dst_id();
  int64_t src_rank = GetRank(src_id);
  int64_t dst_rank = GetRank(dst_id);
  PADDLE_ENFORCE_EQ(
      src_rank,
      rank_,
      platform::errors::Fatal("The source rank id %lld, which is not equal to "
                              "the carrier rank id %lld.",
                              src_rank,
                              rank_));
  if (src_rank != dst_rank) {
    VLOG(3) << "Send a message from interceptor " << src_id
            << " to interceptor " << dst_id
            << ", which are in different ranks.";
    return GlobalVal<MessageBus>::Get()->Send(dst_rank, msg);
  }
  VLOG(3) << "Send a message from interceptor " << src_id
          << " to interceptor " << dst_id << ", which are in the same ranks.";
  return EnqueueInterceptorMessage(msg);
}
// Takes ownership of `interceptor`, binds it to this carrier and to a task
// loop, stores it under `interceptor_id` and returns a non-owning pointer to
// it. The enforce fails when the id is already in use.
Interceptor* Carrier::SetInterceptor(int64_t interceptor_id,
                                     std::unique_ptr<Interceptor> interceptor) {
  auto existing = interceptor_idx_to_interceptor_.find(interceptor_id);
  PADDLE_ENFORCE_EQ(existing,
                    interceptor_idx_to_interceptor_.end(),
                    platform::errors::AlreadyExists(
                        "The interceptor id %lld has already been created! "
                        "The interceptor id should be unique.",
                        interceptor_id));
  interceptor->RegisterCarrier(this);

  // TODO(fleet_exe dev): get loop
  auto* task_loop = thread_pool_.GetLoop(interceptor_id % thread_num_);
  PADDLE_ENFORCE_NOT_NULL(
      task_loop,
      platform::errors::Fatal("thread task loop must not null"));
  interceptor->RegisterTaskLoop(task_loop);

  // Keep the raw pointer: `interceptor` is empty after the move below.
  Interceptor* raw = interceptor.get();
  interceptor_idx_to_interceptor_.insert(
      std::make_pair(interceptor_id, std::move(interceptor)));
  return raw;
}
// Returns a garbage collector for `place`, or an empty pointer. A collector is
// only created in CUDA/HIP builds, for a GPU place, when the eager deletion
// threshold is non-negative and fast eager deletion mode is enabled.
static std::shared_ptr<framework::GarbageCollector> GetGC(
    const platform::Place& place) {
  int64_t max_memory_size = framework::GetEagerDeletionThreshold();
  std::shared_ptr<framework::GarbageCollector> gc;
  if (max_memory_size < 0) {
    return gc;
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place) &&
      framework::IsFastEagerDeletionModeEnabled()) {
    gc.reset(
        new framework::UnsafeFastGPUGarbageCollector(place, max_memory_size));
  }
#endif
  return gc;
}
void
Carrier
::
CreateInterceptors
()
{
if
(
interceptor_id_to_node_
.
empty
())
return
;
auto
gc
=
GetGC
(
place_
);
// create each Interceptor
// no auto init since there is no config
for
(
const
auto
&
item
:
interceptor_id_to_node_
)
{
int64_t
interceptor_id
=
item
.
first
;
TaskNode
*
task_node
=
item
.
second
;
PADDLE_ENFORCE_LT
(
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
(),
platform
::
errors
::
InvalidArgument
(
"Interceptor's run_at_offset must < run_per_steps, must now "
"run_at_offset=%ld run_per_steps=%ld"
,
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
()));
std
::
unique_ptr
<
Interceptor
>
interceptor
;
PADDLE_ENFORCE_NE
(
task_node
->
type
().
empty
(),
true
,
platform
::
errors
::
NotFound
(
"Cannot found type for task node with id %lld"
,
task_node
->
task_id
()));
interceptor
=
InterceptorFactory
::
Create
(
task_node
->
type
(),
interceptor_id
,
task_node
);
interceptor
->
SetPlace
(
place_
);
interceptor
->
SetMiniBatchScope
(
minibatch_scope_
);
interceptor
->
SetMicroBatchScope
(
microbatch_scopes_
);
interceptor
->
SetRootScope
(
root_scope_
);
interceptor
->
SetGC
(
gc
);
SetInterceptor
(
interceptor_id
,
std
::
move
(
interceptor
));
VLOG
(
3
)
<<
"Create Interceptor with interceptor id: "
<<
interceptor_id
<<
" with type: "
<<
task_node
->
type
()
<<
"."
;
if
(
task_node
->
upstream
().
empty
())
{
source_interceptor_ids_
.
emplace_back
(
interceptor_id
);
}
}
}
}
// namespace distributed
}
// namespace paddle
Prev
1
…
3
4
5
6
7
8
9
10
11
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment