OpenDAS / Paddle · Commits

Commit f0ef3442, authored Apr 26, 2023 by yuguo960516yuguo

    2.3.2-dtk-22.10.1

Parent: ad08b8ce
Pipeline #227 failed in 0 seconds
Changes: 274 · Pipelines: 1

Showing 20 changed files with 3010 additions and 0 deletions (+3010 -0)
paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc     +150  -0
paddle/fluid/distributed/auto_parallel/test/dist_mapper_test.cc   +72   -0
paddle/fluid/distributed/auto_parallel/test/process_mesh_test.cc  +53   -0
paddle/fluid/distributed/auto_parallel/utils.h                    +114  -0
paddle/fluid/distributed/collective/CMakeLists.txt                +96   -0
paddle/fluid/distributed/collective/Common.cc                     +60   -0
paddle/fluid/distributed/collective/Common.h                      +35   -0
paddle/fluid/distributed/collective/CustomCCLTools.cc             +47   -0
paddle/fluid/distributed/collective/CustomCCLTools.h              +198  -0
paddle/fluid/distributed/collective/HCCLTools.cc                  +48   -0
paddle/fluid/distributed/collective/HCCLTools.h                   +184  -0
paddle/fluid/distributed/collective/MPITools.cc                   +56   -0
paddle/fluid/distributed/collective/MPITools.h                    +53   -0
paddle/fluid/distributed/collective/NCCLTools.cc                  +48   -0
paddle/fluid/distributed/collective/NCCLTools.h                   +264  -0
paddle/fluid/distributed/collective/ProcessGroup.cc               +64   -0
paddle/fluid/distributed/collective/ProcessGroup.h                +370  -0
paddle/fluid/distributed/collective/ProcessGroupCustom.cc         +400  -0
paddle/fluid/distributed/collective/ProcessGroupCustom.h          +139  -0
paddle/fluid/distributed/collective/ProcessGroupGloo.cc           +559  -0
Too many changes to show. To preserve performance only 274 of 274+ files are displayed.
paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <sstream>

#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/auto_parallel/dist_attr.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

TEST(DistAttr, ctor) {
  ProgramDesc program;
  auto* global_block = program.MutableBlock(0);

  auto* x = global_block->Var("X");
  x->SetType(framework::proto::VarType::LOD_TENSOR);
  x->SetLoDLevel(0);
  x->SetDataType(framework::proto::VarType::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
  y->SetType(framework::proto::VarType::LOD_TENSOR);
  y->SetLoDLevel(0);
  y->SetDataType(framework::proto::VarType::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
  op->SetType("mul");
  op->SetInput("X", {x->Name()});
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
  out->SetType(framework::proto::VarType::LOD_TENSOR);
  out->SetShape({1000, 100});
  op->SetOutput("Out", {out->Name()});

  std::vector<int64_t> shape = {2, 4};
  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<std::string> dim_names = {"x", "y"};
  ProcessMesh process_mesh(shape, process_ids, dim_names);

  std::vector<int64_t> shape2 = {2, 2};
  std::vector<int64_t> process_ids2 = {0, 1, 2, 3};
  std::vector<std::string> dim_names2 = {"a", "b"};
  ProcessMesh process_mesh2(shape2, process_ids2, dim_names2);

  TensorDistAttr x_dist_attr(*x), y_dist_attr(*y), out_dist_attr(*out);

  x_dist_attr.set_process_mesh(process_mesh);
  x_dist_attr.set_dims_mapping(std::vector<int64_t>({0, -1}));
  x_dist_attr.set_batch_dim(0);
  x_dist_attr.set_dynamic_dims(std::vector<bool>({true, false}));
  x_dist_attr.annotate("process_mesh");
  x_dist_attr.annotate("dims_mapping");
  EXPECT_EQ(x_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(x_dist_attr.dims_mapping(), std::vector<int64_t>({0, -1}));
  EXPECT_EQ(x_dist_attr.batch_dim(), 0);
  EXPECT_EQ(x_dist_attr.dynamic_dims(), std::vector<bool>({true, false}));
  EXPECT_EQ(x_dist_attr.is_annotated("process_mesh"), true);
  EXPECT_EQ(x_dist_attr.is_annotated("dims_mapping"), true);
  EXPECT_EQ(x_dist_attr.verify(), true);

  std::stringstream x_sstream;
  x_sstream << x_dist_attr;
  EXPECT_EQ(x_sstream.str(), x_dist_attr.to_string());
  auto x_proto = x_dist_attr.to_proto();
  TensorDistAttr new_x_dist_attr = TensorDistAttr::from_proto(x_proto);
  EXPECT_EQ(x_dist_attr, new_x_dist_attr);
  // new_x_dist_attr is not valid since it does not bind to a var_desc
  EXPECT_EQ(new_x_dist_attr.verify(), false);

  y_dist_attr.set_process_mesh(process_mesh);
  y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, 0}));
  y_dist_attr.set_batch_dim(-1);
  y_dist_attr.set_dynamic_dims(std::vector<bool>({false, true}));
  x_dist_attr.annotate("batch_dim");
  x_dist_attr.annotate("dynamic_dims");
  EXPECT_EQ(y_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(y_dist_attr.dims_mapping(), std::vector<int64_t>({-1, 0}));
  EXPECT_EQ(y_dist_attr.batch_dim(), 1);
  EXPECT_EQ(y_dist_attr.dynamic_dims(), std::vector<bool>({false, true}));
  EXPECT_EQ(x_dist_attr.is_annotated("batch_dim"), true);
  EXPECT_EQ(x_dist_attr.is_annotated("dynamic_dims"), true);
  EXPECT_EQ(x_dist_attr.verify(), true);

  out_dist_attr.set_process_mesh(process_mesh);
  out_dist_attr.set_dims_mapping(std::vector<int64_t>({0, 1}));
  out_dist_attr.set_batch_dim(1);
  out_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
  EXPECT_EQ(out_dist_attr.process_mesh(), process_mesh);
  EXPECT_EQ(out_dist_attr.dims_mapping(), std::vector<int64_t>({0, 1}));
  EXPECT_EQ(out_dist_attr.batch_dim(), 1);
  EXPECT_EQ(out_dist_attr.dynamic_dims(), std::vector<bool>({false, false}));
  EXPECT_EQ(out_dist_attr.verify(), true);

  OperatorDistAttr mul_dist_attr(*op);
  mul_dist_attr.set_input_dist_attr(x->Name(), x_dist_attr);
  mul_dist_attr.set_input_dist_attr(y->Name(), y_dist_attr);
  mul_dist_attr.set_output_dist_attr(out->Name(), out_dist_attr);
  mul_dist_attr.set_process_mesh(process_mesh2);
  mul_dist_attr.set_impl_type("dist_mul");
  mul_dist_attr.set_impl_idx(0);
  mul_dist_attr.annotate("process_mesh");
  mul_dist_attr.annotate("impl_type");
  mul_dist_attr.annotate("impl_idx");
  EXPECT_NE(mul_dist_attr.input_dist_attr(x->Name()), x_dist_attr);
  EXPECT_NE(mul_dist_attr.input_dist_attr(y->Name()), y_dist_attr);
  EXPECT_NE(mul_dist_attr.output_dist_attr(out->Name()), out_dist_attr);
  EXPECT_EQ(mul_dist_attr.process_mesh(), process_mesh2);
  EXPECT_EQ(mul_dist_attr.input_dist_attr(x->Name()).process_mesh(),
            process_mesh2);
  EXPECT_EQ(mul_dist_attr.input_dist_attr(y->Name()).process_mesh(),
            process_mesh2);
  EXPECT_EQ(mul_dist_attr.impl_type(), "dist_mul");
  EXPECT_EQ(mul_dist_attr.impl_idx(), 0);
  EXPECT_EQ(mul_dist_attr.is_annotated("process_mesh"), true);
  EXPECT_EQ(mul_dist_attr.is_annotated("impl_type"), true);
  EXPECT_EQ(mul_dist_attr.is_annotated("impl_idx"), true);
  EXPECT_EQ(mul_dist_attr.verify(), true);

  std::stringstream mul_sstream;
  mul_sstream << mul_dist_attr;
  EXPECT_EQ(mul_sstream.str(), mul_dist_attr.to_string());
  auto mul_proto = mul_dist_attr.to_proto();
  OperatorDistAttr new_mul_dist_attr = OperatorDistAttr::from_proto(mul_proto);
  EXPECT_EQ(mul_dist_attr, new_mul_dist_attr);
  // new_mul_dist_attr is not valid since it does not bind to an op_desc
  EXPECT_EQ(new_mul_dist_attr.verify(), false);
}

}  // namespace auto_parallel
}  // namespace distributed
}  // namespace paddle
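For context on the dims_mapping values exercised above: by the usual auto-parallel convention, entry i names the process-mesh axis that shards tensor axis i, and -1 leaves that axis replicated. A minimal standalone sketch (not part of this commit) of the resulting local shape, assuming even splitting:

#include <cassert>
#include <cstdint>
#include <vector>

// Local shape of a tensor under a dims_mapping, assuming the usual
// convention: dims_mapping[i] == -1 keeps axis i replicated, otherwise
// axis i is split evenly across mesh axis dims_mapping[i].
std::vector<int64_t> LocalShape(const std::vector<int64_t>& global_shape,
                                const std::vector<int64_t>& dims_mapping,
                                const std::vector<int64_t>& mesh_shape) {
  std::vector<int64_t> local = global_shape;
  for (size_t i = 0; i < local.size(); ++i) {
    if (dims_mapping[i] >= 0) local[i] /= mesh_shape[dims_mapping[i]];
  }
  return local;
}

int main() {
  // X is [1000, 784] with dims_mapping {0, -1} on a 2x4 mesh: rows split
  // across mesh axis "x" (size 2), columns stay replicated.
  auto local = LocalShape({1000, 784}, {0, -1}, {2, 4});
  assert(local[0] == 500 && local[1] == 784);
  return 0;
}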
paddle/fluid/distributed/auto_parallel/test/dist_mapper_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h"
#include <map>
#include <sstream>
#include "gtest/gtest.h"
namespace
paddle
{
namespace
distributed
{
namespace
auto_parallel
{
TEST
(
DistributedMapper
,
Ctor
)
{
std
::
vector
<
int64_t
>
shape
=
{
2
,
3
};
std
::
vector
<
int64_t
>
device_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
};
std
::
vector
<
std
::
string
>
dim_names
=
{
"x"
,
"y"
};
std
::
string
device_type
=
"GPU"
;
int64_t
size
=
shape
[
0
]
*
shape
[
1
];
DeviceMesh
device_mesh
(
"device_mesh"
,
shape
,
device_ids
,
dim_names
);
for
(
int64_t
i
=
0
;
i
<
shape
[
0
];
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
shape
[
1
];
++
j
)
{
int64_t
global_id
=
i
*
shape
[
1
]
+
j
;
int64_t
local_id
=
j
;
int64_t
machine_id
=
i
;
device_mesh
.
add_device
(
Device
(
global_id
,
local_id
,
machine_id
,
device_type
));
}
}
for
(
int64_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
size
;
++
j
)
{
device_mesh
.
add_link
(
Link
(
i
,
j
,
"NVL"
));
}
}
DistributedMapper
dist_mapper
;
dist_mapper
.
add_device_mesh
(
device_mesh
);
std
::
map
<
int64_t
,
std
::
pair
<
std
::
string
,
std
::
vector
<
int64_t
>>>
process_id_to_device_ids
;
process_id_to_device_ids
[
0
]
=
{
"device_mesh"
,
{
5
}};
process_id_to_device_ids
[
1
]
=
{
"device_mesh"
,
{
4
}};
process_id_to_device_ids
[
2
]
=
{
"device_mesh"
,
{
3
}};
process_id_to_device_ids
[
3
]
=
{
"device_mesh"
,
{
2
}};
process_id_to_device_ids
[
4
]
=
{
"device_mesh"
,
{
1
}};
process_id_to_device_ids
[
5
]
=
{
"device_mesh"
,
{
0
}};
dist_mapper
.
set_process_id_to_device_ids
(
process_id_to_device_ids
);
EXPECT_EQ
(
dist_mapper
.
device_meshes
().
at
(
"device_mesh"
),
device_mesh
);
EXPECT_EQ
(
dist_mapper
.
device_mesh
(
"device_mesh"
),
device_mesh
);
EXPECT_EQ
(
dist_mapper
.
process_id_to_device_ids
(),
process_id_to_device_ids
);
std
::
stringstream
sstream
;
sstream
<<
dist_mapper
;
EXPECT_EQ
(
sstream
.
str
(),
dist_mapper
.
to_string
());
auto
proto
=
dist_mapper
.
to_proto
();
DistributedMapper
new_dist_mapper
=
DistributedMapper
::
from_proto
(
proto
);
EXPECT_EQ
(
dist_mapper
,
new_dist_mapper
);
}
}
// namespace auto_parallel
}
// namespace distributed
}
// namespace paddle
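The population loops above lay devices out row-major over the 2x3 mesh: global_id = i * ncols + j, with machine_id = i and local_id = j. A standalone sketch (not part of this commit) of the inverse decomposition:

#include <cassert>
#include <cstdint>

// Inverse of the row-major device layout: a global id g decomposes back
// into machine_id = g / ncols and local_id = g % ncols, and the round
// trip reproduces g.
int main() {
  const int64_t ncols = 3;
  for (int64_t g = 0; g < 6; ++g) {
    int64_t machine_id = g / ncols;
    int64_t local_id = g % ncols;
    assert(machine_id * ncols + local_id == g);
  }
  return 0;
}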
paddle/fluid/distributed/auto_parallel/test/process_mesh_test.cc (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
#include <iostream>
#include <sstream>
#include "gtest/gtest.h"
namespace
paddle
{
namespace
distributed
{
namespace
auto_parallel
{
TEST
(
ProcessMesh
,
Ctor
)
{
std
::
vector
<
int64_t
>
shape
=
{
2
,
3
};
std
::
vector
<
int64_t
>
process_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
};
std
::
vector
<
std
::
string
>
dim_names
=
{
"x"
,
"y"
};
int64_t
size
=
shape
[
0
]
*
shape
[
1
];
ProcessMesh
process_mesh
(
shape
,
process_ids
,
dim_names
);
EXPECT_EQ
(
process_mesh
.
shape
(),
shape
);
EXPECT_EQ
(
process_mesh
.
process_ids
(),
process_ids
);
EXPECT_EQ
(
process_mesh
.
dim_names
()[
0
],
"x"
);
EXPECT_EQ
(
process_mesh
.
dim_names
()[
1
],
"y"
);
EXPECT_EQ
(
process_mesh
.
size
(),
size
);
EXPECT_EQ
(
process_mesh
.
ndim
(),
static_cast
<
int64_t
>
(
shape
.
size
()));
EXPECT_EQ
(
process_mesh
.
dim_size
(
0
),
shape
[
0
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
-
1
),
shape
[
1
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
"x"
),
shape
[
0
]);
EXPECT_EQ
(
process_mesh
.
dim_size
(
"y"
),
shape
[
1
]);
EXPECT_EQ
(
process_mesh
.
empty
(),
false
);
EXPECT_EQ
(
process_mesh
.
contains
(
0
),
true
);
EXPECT_EQ
(
process_mesh
.
contains
(
6
),
false
);
std
::
stringstream
sstream
;
sstream
<<
process_mesh
;
EXPECT_EQ
(
sstream
.
str
(),
process_mesh
.
to_string
());
auto
proto
=
process_mesh
.
to_proto
();
ProcessMesh
new_process_mesh
=
ProcessMesh
::
from_proto
(
proto
);
EXPECT_EQ
(
process_mesh
,
new_process_mesh
);
}
}
// namespace auto_parallel
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/auto_parallel/utils.h (new file, mode 100644)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <algorithm>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

// struct Indent {
//   Indent(int& level) : level(level) { ++level; }
//   ~Indent() { --level; }
//   int& level;
// };

// inline std::string str_indent(std::string& str, cur_indent) {
//   string spaces(cur_indent, " ");
//   return str + std::string(cur_indent, " ");
// }

template <class T>
bool has_duplicates(const std::vector<T>& vec) {
  std::unordered_map<T, int> map;
  for (const auto& i : vec) {
    ++map[i];
    if (map[i] > 1) return true;
  }
  return false;
}

inline int64_t canonical_dim(int dim, int ndim) {
  PADDLE_ENFORCE_EQ(
      dim >= -ndim && dim < ndim,
      true,
      platform::errors::InvalidArgument(
          "Dimension %d is outside of [-%d, %d).", dim, ndim, ndim));
  if (dim < 0) {
    return dim + ndim;
  }
  return dim;
}

// Refer to https://stackoverflow.com/a/5289170
template <typename Range, typename Value = typename Range::value_type>
std::string str_join(Range const& elements,
                     const std::string& delimiter = ",") {
  std::ostringstream os;
  auto b = std::begin(elements), e = std::end(elements);
  if (b != e) {
    std::copy(b, prev(e), std::ostream_iterator<Value>(os, delimiter.c_str()));
    b = prev(e);
  }
  if (b != e) {
    os << *b;
  }
  return os.str();
}

inline std::string str_join(std::map<std::string, bool> const& elements,
                            const std::string& delimiter = ",") {
  std::string str;
  for (const auto& item : elements) {
    str += item.first + ": " + std::to_string(item.second) + ",";
  }
  return str.substr(0, str.size() - 2);
}

// Refer to https://stackoverflow.com/a/46931770
inline std::vector<std::string> str_split(std::string const& input,
                                          const std::string& delimiter = ",") {
  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
  std::string token;
  std::vector<std::string> output;
  while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) {
    token = input.substr(pos_start, pos_end - pos_start);
    pos_start = pos_end + delim_len;
    output.push_back(token);
  }
  output.push_back(input.substr(pos_start));
  return output;
}

// Refer to https://stackoverflow.com/a/29200671/2358969
template <typename T>
std::string to_string_with_precision(const T a_value, const int n = 2) {
  std::ostringstream out;
  out.precision(n);
  out << std::fixed << a_value;
  return out.str();
}

}  // namespace auto_parallel
}  // namespace distributed
}  // namespace paddle
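A short usage sketch for these helpers (not part of this commit; it assumes the header is compiled inside a Paddle build, since it pulls in platform/enforce.h):

#include <cassert>
#include "paddle/fluid/distributed/auto_parallel/utils.h"

int main() {
  using namespace paddle::distributed::auto_parallel;

  // str_join with the default "," delimiter.
  std::vector<int64_t> dims = {2, 4, 8};
  assert(str_join(dims) == "2,4,8");

  // str_split is the rough inverse.
  auto parts = str_split("a,b,c");
  assert(parts.size() == 3 && parts[0] == "a" && parts[2] == "c");

  // Negative dims canonicalize by adding ndim (cf. dim_size(-1) in the
  // ProcessMesh test above).
  assert(!has_duplicates(dims));
  assert(canonical_dim(-1, 3) == 2);

  // std::fixed with precision 3 keeps three decimals, rounding.
  assert(to_string_with_precision(3.14159, 3) == "3.142");
  return 0;
}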
paddle/fluid/distributed/collective/CMakeLists.txt (new file, mode 100644)
cc_library(
  processgroup
  SRCS ProcessGroup.cc
  DEPS dense_tensor)
cc_library(
  processgroup_stream
  SRCS ProcessGroupStream.cc
  DEPS dense_tensor)
cc_library(
  eager_reducer
  SRCS reducer.cc
  DEPS eager_api processgroup processgroup_stream phi_api string_helper)

if(WITH_DISTRIBUTE)
  cc_library(
    processgroup_gloo
    SRCS ProcessGroupGloo.cc
    DEPS phi_api eager_api gloo_wrapper)
endif()

if(WITH_NCCL OR WITH_RCCL)
  cc_library(
    processgroup_nccl
    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
    DEPS processgroup
         processgroup_stream
         place
         enforce
         collective_helper
         device_context
         dense_tensor)
  if(WITH_DISTRIBUTE AND WITH_PSCORE)
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
      set_source_files_properties(ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
                                  ${DISTRIBUTE_COMPILE_FLAGS})
    endif()
    cc_library(
      processgroup_heter
      SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
      DEPS place enforce collective_helper device_context phi_api eager_api)
  endif()
endif()

if(WITH_MPI)
  cc_library(
    processgroup_mpi
    SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
    DEPS collective_helper device_context)
endif()

if(WITH_ASCEND_CL)
  cc_library(
    processgroup_hccl
    SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
    DEPS place
         npu_stream
         enforce
         collective_helper
         device_context
         phi_api
         eager_api)
  if(WITH_DISTRIBUTE AND WITH_PSCORE)
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
      set_source_files_properties(ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
                                  ${DISTRIBUTE_COMPILE_FLAGS})
    endif()
    cc_library(
      processgroup_heter
      SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
      DEPS place
           npu_stream
           enforce
           collective_helper
           device_context
           phi_api
           eager_api)
  endif()
endif()

if(WITH_CUSTOM_DEVICE)
  cc_library(
    processgroup_custom
    SRCS ProcessGroupCustom.cc CustomCCLTools.cc Common.cc
    DEPS phi_backends
         place
         enforce
         collective_helper
         device_context
         phi_api
         eager_api)
endif()
paddle/fluid/distributed/collective/Common.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/Common.h"
namespace
paddle
{
namespace
distributed
{
std
::
vector
<
Place
>
GetPlaceList
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
std
::
vector
<
Place
>
places
;
places
.
reserve
(
tensors
.
size
());
for
(
auto
&
tensor
:
tensors
)
{
places
.
push_back
(
tensor
.
place
());
}
return
places
;
}
std
::
string
GetKeyFromPlaces
(
const
std
::
vector
<
Place
>&
places
)
{
std
::
string
placeList
;
for
(
auto
&
place
:
places
)
{
std
::
stringstream
tmp
;
tmp
<<
place
;
if
(
placeList
.
empty
())
{
placeList
+=
tmp
.
str
();
}
else
{
placeList
+=
","
+
tmp
.
str
();
}
}
return
placeList
;
}
bool
CheckTensorsInCudaPlace
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
phi
::
DenseTensor
&
t
)
{
return
platform
::
is_gpu_place
(
t
.
place
());
});
}
bool
CheckTensorsInCustomPlace
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
,
const
std
::
string
&
dev_type
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
phi
::
DenseTensor
&
t
)
{
return
platform
::
places_are_same_class
(
t
.
place
(),
paddle
::
platform
::
CustomPlace
(
dev_type
));
});
}
}
// namespace distributed
}
// namespace paddle
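A small usage sketch (not part of this commit; assumes a CUDA build, and the exact key text depends on Place's operator<<):

#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/collective/Common.h"

// GetKeyFromPlaces joins the printed places with commas, giving a stable
// cache key for a set of devices, e.g. something like
// "Place(gpu:0),Place(gpu:1)".
int main() {
  std::vector<paddle::distributed::Place> places = {
      paddle::platform::CUDAPlace(0), paddle::platform::CUDAPlace(1)};
  std::cout << paddle::distributed::GetKeyFromPlaces(places) << std::endl;
  return 0;
}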
paddle/fluid/distributed/collective/Common.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;

// Get the list of devices from list of tensors
std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);

// Get the deviceList String from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places);

bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);

bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
                               const std::string& dev_type);

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/CustomCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
phi
::
ccl
::
CCLReduceOp
ToCustomCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
phi
::
ccl
::
CCLReduceOp
>
red_type
=
{
{
ReduceOp
::
MIN
,
phi
::
ccl
::
CCLReduceOp
::
MIN
},
{
ReduceOp
::
MAX
,
phi
::
ccl
::
CCLReduceOp
::
MAX
},
{
ReduceOp
::
SUM
,
phi
::
ccl
::
CCLReduceOp
::
SUM
},
{
ReduceOp
::
PRODUCT
,
phi
::
ccl
::
CCLReduceOp
::
PRODUCT
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"
));
return
it
->
second
;
}
std
::
string
SerializeCustomCCLUniqueId
(
const
phi
::
ccl
::
CCLRootId
&
ccl_id
)
{
const
uint8_t
*
bytes
=
ccl_id
.
data
();
std
::
ostringstream
oss
;
for
(
size_t
i
=
0
;
i
<
ccl_id
.
size
();
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
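A standalone sketch (not part of this commit) of the serialization loop above. Note that std::hex without zero-padding makes the encoding variable-width: it is fine as an opaque key, but not reversible byte-by-byte:

#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

// Same loop as SerializeCustomCCLUniqueId, on a plain byte vector.
// 0x0A prints as "a" (one char) and 0xB1 as "b1" (two chars), so the
// result "ab1" loses the original byte boundaries.
int main() {
  std::vector<uint8_t> id = {0x0A, 0xB1};
  std::ostringstream oss;
  for (size_t i = 0; i < id.size(); ++i) {
    oss << std::hex << static_cast<int>(id[i]);
  }
  std::cout << oss.str() << std::endl;  // prints "ab1"
  return 0;
}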
paddle/fluid/distributed/collective/CustomCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"

namespace paddle {
namespace distributed {

class CustomEventManager {
 public:
  CustomEventManager() = default;

  ~CustomEventManager() {
    if (is_created_) {
      event_->Destroy();
    }
  }

  CustomEventManager(const CustomEventManager&) = delete;
  CustomEventManager& operator=(const CustomEventManager&) = delete;

  CustomEventManager(CustomEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(device_type_, other.device_type_);
    std::swap(event_, other.event_);
  }

  CustomEventManager& operator=(CustomEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(device_type_, other.device_type_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  int8_t DeviceId() const { return device_index_; }
  std::string DeviceType() const { return device_type_; }
  phi::event::event_t GetRawCustomEvent() const { return event_->raw_event(); }
  phi::event::Event* GetCustomEvent() const { return event_.get(); }

  void Record(const paddle::platform::CustomDeviceContext& ctx) {
    auto place = ctx.GetPlace();
    auto device_type = place.GetDeviceType();
    auto device_index = place.GetDeviceId();
    if (!is_created_) {
      CreateEvent(place);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "CustomDeviceContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    PADDLE_ENFORCE_EQ(device_type,
                      device_type_,
                      platform::errors::PreconditionNotMet(
                          "CustomDeviceContext's device %d does not match"
                          "Event's device type %d",
                          device_type,
                          device_type_));
    phi::DeviceGuard guard(place);
    phi::stream::Stream stream(place, ctx.stream());
    event_->Record(&stream);
  }

  bool Query() const { return event_->Query(); }

  void Block(const paddle::platform::CustomDeviceContext& ctx) const {
    if (is_created_) {
      auto place = ctx.GetPlace();
      auto device_type = place.GetDeviceType();
      auto device_index = place.GetDeviceId();
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "CustomDeviceContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      PADDLE_ENFORCE_EQ(device_type,
                        device_type_,
                        platform::errors::PreconditionNotMet(
                            "CustomDeviceContext's device %d does not match"
                            "Event's device type %d",
                            device_type,
                            device_type_));
      phi::DeviceGuard guard(place);
      phi::stream::Stream stream(place, ctx.stream());
      stream.WaitEvent(event_.get());
    }
  }

 private:
  bool is_created_{false};
  std::shared_ptr<phi::event::Event> event_{nullptr};
  int8_t device_index_{0};
  std::string device_type_;

 private:
  void CreateEvent(const platform::Place& place) {
    device_index_ = place.GetDeviceId();
    device_type_ = place.GetDeviceType();
    event_.reset(new phi::event::Event);
    event_->Init(place);
    is_created_ = true;
  }
};

class CustomCCLCommManager {
 public:
  CustomCCLCommManager(const std::string& device_type,
                       phi::ccl::CCLComm ccl_comm)
      : device_type_(device_type), ccl_comm_(ccl_comm) {}

  CustomCCLCommManager() : CustomCCLCommManager("", nullptr) {}

  ~CustomCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (ccl_comm_) {
      phi::DeviceManager::CCLDestroyComm(device_type_, ccl_comm_);
    }
  }

  static std::shared_ptr<CustomCCLCommManager> Create(
      const std::string& device_type,
      int num_ranks,
      int rank,
      phi::ccl::CCLRootId* comm_id,
      phi::ccl::CCLComm* ccl_comm) {
    auto custom_ccl_manager = std::make_shared<CustomCCLCommManager>();
    phi::DeviceManager::CCLCommInitRank(
        device_type, num_ranks, comm_id, rank, ccl_comm);
    custom_ccl_manager->device_type_ = device_type;
    custom_ccl_manager->ccl_id_ = comm_id;
    custom_ccl_manager->rank_ = rank;
    custom_ccl_manager->ccl_comm_ = *ccl_comm;
    return custom_ccl_manager;
  }

  phi::ccl::CCLRootId* GetCustomCCLId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return ccl_id_;
  }

  phi::ccl::CCLComm GetCustomCCLComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return ccl_comm_;
  }

  CustomCCLCommManager(const CustomCCLCommManager&) = delete;
  CustomCCLCommManager& operator=(const CustomCCLCommManager&) = delete;
  CustomCCLCommManager& operator=(CustomCCLCommManager&& other) = delete;

  CustomCCLCommManager(CustomCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(ccl_comm_, other.ccl_comm_);
  }

 protected:
  std::string device_type_;
  phi::ccl::CCLComm ccl_comm_;
  phi::ccl::CCLRootId* ccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

phi::ccl::CCLReduceOp ToCustomCCLRedType(ReduceOp reduction);
std::string SerializeCustomCCLUniqueId(const phi::ccl::CCLRootId& ccl_id);

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/collective/HCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
HcclReduceOp
ToHCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
HcclReduceOp
>
red_type
=
{
{
ReduceOp
::
MIN
,
HCCL_REDUCE_MIN
},
{
ReduceOp
::
MAX
,
HCCL_REDUCE_MAX
},
{
ReduceOp
::
SUM
,
HCCL_REDUCE_SUM
},
{
ReduceOp
::
PRODUCT
,
HCCL_REDUCE_PROD
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"
));
return
it
->
second
;
}
std
::
string
SerializeHCCLUniqueId
(
const
HcclRootInfo
&
hcclID
)
{
const
uint8_t
*
bytes
=
reinterpret_cast
<
const
uint8_t
*>
(
&
hcclID
);
std
::
ostringstream
oss
;
for
(
size_t
i
=
0
;
i
<
sizeof
(
hcclID
);
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/HCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace distributed {

class NPUEventManager {
 public:
  NPUEventManager() = default;

  ~NPUEventManager() {
    if (is_created_) {
      platform::NPUDeviceGuard guard(device_index_);
      platform::NPUEventDestroy(event_);
    }
  }

  NPUEventManager(const NPUEventManager&) = delete;
  NPUEventManager& operator=(const NPUEventManager&) = delete;

  NPUEventManager(NPUEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }

  NPUEventManager& operator=(NPUEventManager&& other) {
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  bool DeviceId() const { return device_index_; }
  aclrtEvent GetRawNPUEvent() const { return event_; }

  void Record(const paddle::platform::NPUDeviceContext& ctx) {
    auto device_index = ctx.GetPlace().device;
    if (!is_created_) {
      CreateEvent(device_index);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "NPUDeviceContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    platform::NPUDeviceGuard guard(device_index_);
    platform::NPUEventRecord(event_, ctx.stream());
  }

  bool Query() const {
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    platform::NPUEventQuery(event_, &status);
    if (status == ACL_EVENT_STATUS_COMPLETE) {
      return true;
    }
    return false;
  }

  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
    if (is_created_) {
      auto device_index = ctx.GetPlace().device;
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "phi::GPUContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      platform::NPUDeviceGuard guard(device_index_);
      platform::NPUStreamWaitEvent(ctx.stream(), event_);
    }
  }

 private:
  bool is_created_{false};
  aclrtEvent event_{};
  int8_t device_index_{0};

 private:
  void CreateEvent(int device_index) {
    device_index_ = device_index;
    platform::NPUDeviceGuard guard(device_index);
    platform::NPUEventCreate(&event_);
    is_created_ = true;
  }
};

class HCCLCommManager {
 public:
  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}

  HCCLCommManager() : HCCLCommManager(nullptr) {}

  ~HCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (hccl_comm_) {
      platform::dynload::HcclCommDestroy(hccl_comm_);
    }
  }

  static std::shared_ptr<HCCLCommManager> Create(int num_ranks,
                                                 int rank,
                                                 HcclRootInfo* comm_id,
                                                 HcclComm hccl_comm) {
    auto hccl_manager = std::make_shared<HCCLCommManager>();
    auto ret = platform::dynload::HcclCommInitRootInfo(
        num_ranks, comm_id, rank, &hccl_comm);
    using __NPU_STATUS_TYPE__ = decltype(ret);
    constexpr auto __success_type__ =
        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
    if (UNLIKELY(ret != __success_type__)) {
      VLOG(0) << "Error: create hccl_id error.";
      exit(-1);
    }
    hccl_manager->hccl_id_ = comm_id;
    hccl_manager->rank_ = rank;
    hccl_manager->hccl_comm_ = hccl_comm;
    return hccl_manager;
  }

  HcclRootInfo* GetHcclId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return hccl_id_;
  }

  HcclComm GetHcclComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return hccl_comm_;
  }

  HCCLCommManager(const HCCLCommManager&) = delete;
  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;

  HCCLCommManager(HCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(hccl_comm_, other.hccl_comm_);
  }

 protected:
  HcclComm hccl_comm_;
  HcclRootInfo* hccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

HcclReduceOp ToHCCLRedType(ReduceOp reduction);
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID);

}  // namespace distributed
}  // namespace paddle
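A distilled sketch (not part of this commit) of the move-construction pattern shared by HCCLCommManager and its NCCL/CustomCCL counterparts: std::mutex is neither movable nor copyable, so the move constructor locks the source and swaps only the guarded handle, leaving the moved-from object empty but valid:

#include <mutex>
#include <utility>

// Minimal stand-in: `handle_` plays the role of hccl_comm_/nccl_comm_.
class HandleOwner {
 public:
  HandleOwner() = default;
  HandleOwner(HandleOwner&& other) {
    // Lock the source so a concurrent getter cannot observe a torn state;
    // the destination is not yet visible to other threads.
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(handle_, other.handle_);
  }
  HandleOwner(const HandleOwner&) = delete;
  HandleOwner& operator=(const HandleOwner&) = delete;

 private:
  void* handle_{nullptr};
  mutable std::mutex mutex_;  // each object keeps its own mutex
};

int main() {
  HandleOwner a;
  HandleOwner b(std::move(a));  // a's handle transfers to b
  return 0;
}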
paddle/fluid/distributed/collective/MPITools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/MPITools.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
namespace
mpi
{
MPI_Op
ToMPIType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
MPI_Op
>
red_type
=
{
{
ReduceOp
::
MIN
,
MPI_MIN
},
{
ReduceOp
::
MAX
,
MPI_MAX
},
{
ReduceOp
::
SUM
,
MPI_SUM
},
{
ReduceOp
::
PRODUCT
,
MPI_PROD
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
"MPI_PROD | MPI_SUM."
));
return
it
->
second
;
}
// NOTE: MPI dose not support CUDA aware now.
bool
CheckMpiCudaAware
()
{
return
false
;
}
void
CheckValidInputs
(
const
std
::
vector
<
phi
::
DenseTensor
>&
tensors
)
{
PADDLE_ENFORCE_EQ
(
tensors
.
size
()
==
1
,
true
,
platform
::
errors
::
InvalidArgument
(
"the inputs size of MPI must be 1!"
));
PADDLE_ENFORCE_EQ
(
CheckTensorsInCudaPlace
(
tensors
)
&&
!
CheckMpiCudaAware
(),
false
,
platform
::
errors
::
InvalidArgument
(
"Found CUDA Tensor. But CUDA-aware MPI not support!"
));
}
}
// namespace mpi
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/MPITools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <error.h>

#include <iostream>
#include <string>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"

#include "paddle/fluid/distributed/collective/Types.h"

#ifdef HOST
#undef HOST
#endif

#include <mpi.h>

namespace paddle {
namespace distributed {
namespace mpi {

#define MPI_CHECK(cmd)                                                     \
  do {                                                                     \
    int r = cmd;                                                           \
    if (r != MPI_SUCCESS) {                                                \
      LOG(FATAL) << "Failed, MPI error in" << __FILE__ << ":" << __LINE__  \
                 << "with error code: " << std::to_string(r) << std::endl; \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

MPI_Op ToMPIType(ReduceOp reduction);

bool CheckMpiCudaAware();

void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);

}  // namespace mpi
}  // namespace distributed
}  // namespace paddle
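A hypothetical usage sketch of the MPI_CHECK macro above (not part of this commit; assumes a Paddle build with WITH_MPI, so glog's LOG(FATAL) used by the macro is available):

#include <mpi.h>
#include "paddle/fluid/distributed/collective/MPITools.h"

// MPI_CHECK aborts the process on any non-MPI_SUCCESS return code, so
// every MPI call can be wrapped uniformly.
int main(int argc, char** argv) {
  MPI_CHECK(MPI_Init(&argc, &argv));
  int rank = 0;
  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  // ... collective work keyed on `rank` would go here ...
  MPI_CHECK(MPI_Finalize());
  return 0;
}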
paddle/fluid/distributed/collective/NCCLTools.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
ncclRedOp_t
ToNCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
ncclRedOp_t
>
red_type
=
{
{
ReduceOp
::
MIN
,
ncclMin
},
{
ReduceOp
::
MAX
,
ncclMax
},
{
ReduceOp
::
SUM
,
ncclSum
},
{
ReduceOp
::
PRODUCT
,
ncclProd
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid nccl reduction. Must be ncclMin | ncclMax | "
"ncclProd | ncclSum"
));
return
it
->
second
;
}
std
::
string
SerializeNCCLUniqueId
(
const
ncclUniqueId
&
ncclID
)
{
const
uint8_t
*
bytes
=
reinterpret_cast
<
const
uint8_t
*>
(
&
ncclID
);
std
::
ostringstream
oss
;
for
(
auto
i
=
0
;
i
<
NCCL_UNIQUE_ID_BYTES
;
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/NCCLTools.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif

#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace distributed {

#define NCCLCHECK(cmd)                                  \
  do {                                                  \
    ncclResult_t r = cmd;                               \
    if (r != ncclSuccess) {                             \
      printf("Failed, NCCL error %s:%d '%s'\n",         \
             __FILE__,                                  \
             __LINE__,                                  \
             platform::dynload::ncclGetErrorString(r)); \
      exit(EXIT_FAILURE);                               \
    }                                                   \
  } while (0)

// NOTE(shenliang03): EventManager is a movable, non-copyable wrapper of a
// CUDA event. EventManager is different from paddle::platform::CudaEvent.
// It uses lazy initialization and is only created when the
// Record() method is called for the first time; it also monitors
// device information to ensure that the recorded stream and event
// are on the same device.
class EventManager {
 public:
  EventManager() {}
  explicit EventManager(unsigned int flags) : flags_{flags} {}

  ~EventManager() {
    if (is_created_) {
      platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
      hipEventDestroy(event_);
#else
      cudaEventDestroy(event_);
#endif
    }
  }

  EventManager(const EventManager&) = delete;
  EventManager& operator=(const EventManager&) = delete;

  EventManager(EventManager&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
  }

  EventManager& operator=(EventManager&& other) {
    std::swap(flags_, other.flags_);
    std::swap(is_created_, other.is_created_);
    std::swap(device_index_, other.device_index_);
    std::swap(event_, other.event_);
    return *this;
  }

  bool IsCreated() const { return is_created_; }
  bool DeviceId() const { return device_index_; }
  gpuEvent_t GetRawCudaEvent() const { return event_; }

  void Record(const phi::GPUContext& ctx) {
    auto device_index = ctx.GetPlace().device;
    if (!is_created_) {
      CreateEvent(device_index);
    }
    PADDLE_ENFORCE_EQ(device_index,
                      device_index_,
                      platform::errors::PreconditionNotMet(
                          "phi::GPUContext's device %d does not match"
                          "Event's device %d",
                          device_index,
                          device_index_));
    platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
#endif
  }

  bool Query() const {
#ifdef PADDLE_WITH_HIP
    gpuError_t err = hipEventQuery(event_);
    if (err == hipSuccess) {
      return true;
    }
    if (err == hipErrorNotReady) {
      return false;
    }
#else
    gpuError_t err = cudaEventQuery(event_);
    if (err == cudaSuccess) {
      return true;
    }
    if (err == cudaErrorNotReady) {
      return false;
    }
#endif
    PADDLE_ENFORCE_GPU_SUCCESS(err);
    return false;
  }

  void Synchronize() const {
    if (is_created_) {
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
    }
  }

  void Block(const phi::GPUContext& ctx) const {
    if (is_created_) {
      auto device_index = ctx.GetPlace().device;
      PADDLE_ENFORCE_EQ(device_index,
                        device_index_,
                        platform::errors::PreconditionNotMet(
                            "phi::GPUContext's device %d does not match"
                            "Event's device %d",
                            device_index,
                            device_index_));
      platform::CUDADeviceGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
#endif
    }
  }

 private:
#ifdef PADDLE_WITH_HIP
  unsigned int flags_ = hipEventDefault;
#else
  unsigned int flags_ = cudaEventDefault;
#endif
  bool is_created_{false};
  gpuEvent_t event_{};
  int8_t device_index_{0};

 private:
  void CreateEvent(int device_index) {
    device_index_ = device_index;
    platform::CUDADeviceGuard guard(device_index);
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
#endif
    is_created_ = true;
  }
};

// NOTE(shenliang03): NCCLCommManager is more lightweight than
// platform::NCCLComm
class NCCLCommManager {
 public:
  explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}

  NCCLCommManager() : NCCLCommManager(nullptr) {}

  ~NCCLCommManager() noexcept {
    std::unique_lock<std::mutex> lock(mutex_);
    if (nccl_comm_) {
      platform::dynload::ncclCommDestroy(nccl_comm_);
    }
  }

  static std::shared_ptr<NCCLCommManager> Create(int num_ranks,
                                                 int rank,
                                                 ncclUniqueId comm_id) {
    auto nccl_manager = std::make_shared<NCCLCommManager>();
    NCCLCHECK(platform::dynload::ncclCommInitRank(
        &(nccl_manager->nccl_comm_), num_ranks, comm_id, rank));
    nccl_manager->nccl_id_ = comm_id;
    nccl_manager->rank_ = rank;
    return nccl_manager;
  }

  ncclUniqueId GetNcclId() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return nccl_id_;
  }

  ncclComm_t GetNcclComm() const {
    std::unique_lock<std::mutex> lock(mutex_);
    return nccl_comm_;
  }

  NCCLCommManager(const NCCLCommManager&) = delete;
  NCCLCommManager& operator=(const NCCLCommManager&) = delete;
  NCCLCommManager& operator=(NCCLCommManager&& other) = delete;

  NCCLCommManager(NCCLCommManager&& other) {
    std::unique_lock<std::mutex> lock(other.mutex_);
    std::swap(nccl_comm_, other.nccl_comm_);
  }

 protected:
  ncclComm_t nccl_comm_;
  ncclUniqueId nccl_id_;
  int rank_;
  mutable std::mutex mutex_;
};

ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);

}  // namespace distributed
}  // namespace paddle
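A bare-CUDA sketch (not part of this commit) of the synchronization pattern EventManager wraps: record an event on a producer stream and make a consumer stream wait on it, so work queued on the consumer after the wait cannot start before the producer reaches the record point. EventManager adds lazy creation and same-device checks on top of this; error handling is elided here:

#include <cuda_runtime.h>

int main() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDefault);

  // ... enqueue kernels on `producer` ...
  cudaEventRecord(event, producer);         // cf. EventManager::Record
  cudaStreamWaitEvent(consumer, event, 0);  // cf. EventManager::Block
  // ... enqueue dependent kernels on `consumer` ...

  cudaEventDestroy(event);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
  return 0;
}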
paddle/fluid/distributed/collective/ProcessGroup.cc (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace
paddle
{
namespace
distributed
{
ProcessGroup
::
Task
::
Task
(
int
rank
,
const
std
::
vector
<
phi
::
DenseTensor
>&
inputs
,
CommType
comm_type
)
:
rank_
(
rank
),
comm_type_
(
comm_type
)
{}
ProcessGroup
::
Task
::
Task
(
int
rank
,
const
std
::
vector
<
phi
::
DenseTensor
>&
inputs
,
CommType
comm_type
,
bool
sync_op
)
:
rank_
(
rank
),
comm_type_
(
comm_type
),
sync_op_
(
sync_op
)
{}
ProcessGroup
::
Task
::~
Task
()
=
default
;
bool
ProcessGroup
::
Task
::
IsCompleted
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
is_completed_
;
}
bool
ProcessGroup
::
Task
::
Wait
(
std
::
chrono
::
milliseconds
timeout
)
{
return
false
;
}
void
ProcessGroup
::
Task
::
Synchronize
()
{}
ProcessGroup
::
ProcessGroup
(
int
rank
,
int
size
,
const
platform
::
Place
&
place
,
int
gid
)
:
rank_
(
rank
),
size_
(
size
),
place_
(
place
),
gid_
(
gid
)
{
if
(
gid
!=
IGNORE_ID
)
{
auto
map
=
ProcessGroupMapFromGid
::
getInstance
();
map
->
insert
(
gid_
,
this
);
}
}
ProcessGroup
::
ProcessGroup
(
int
rank
,
int
size
,
int
gid
)
:
rank_
(
rank
),
size_
(
size
),
gid_
(
gid
)
{
if
(
gid
!=
IGNORE_ID
)
{
auto
map
=
ProcessGroupMapFromGid
::
getInstance
();
map
->
insert
(
gid_
,
this
);
}
}
}
// namespace distributed
}
// namespace paddle
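Callers drive the returned Task handle rather than the collective itself. A hypothetical caller-side sketch (not part of this commit); note that the base-class Wait above is a stub that always returns false, so concrete backends override it:

#include <chrono>
#include <memory>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"

// Hypothetical helper: block on a collective's Task, falling back to
// polling IsCompleted() if Wait() reports failure or timeout.
void WaitForCollective(
    const std::shared_ptr<paddle::distributed::ProcessGroup::Task>& task) {
  if (!task->Wait(std::chrono::milliseconds(30000))) {
    while (!task->IsCompleted()) {
      // busy-poll; a real caller would yield or sleep here
    }
  }
  task->Synchronize();
}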
paddle/fluid/distributed/collective/ProcessGroup.h (new file, mode 100644)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
constexpr auto kWaitTimeout = std::chrono::milliseconds(0);

namespace paddle {
namespace distributed {

constexpr int IGNORE_ID = -1;
using Tensor = paddle::experimental::Tensor;

enum class CommType : std::uint8_t {
  BROADCAST = 0,
  ALLREDUCE = 1,
  ALLREDUCE_SPARSE = 2,  // TODO(shenliang03): to support sparse in allreduce
  REDUCE = 3,
  ALLGATHER = 4,
  GATHER = 5,
  SCATTER = 6,
  REDUCE_SCATTER = 7,
  ALLTOALL = 8,
  SEND = 9,
  RECV = 10,
  BARRIER = 11,
  ALLTOALL_SINGLE = 12,
  UNKNOWN = 100,
};

class ProcessGroup {
 public:
  class Task {
   public:
    Task(int rank,
         const std::vector<phi::DenseTensor>& inputs,
         CommType comm_type);
    Task(int rank,
         const std::vector<phi::DenseTensor>& inputs,
         CommType comm_type,
         bool sync_op);

    virtual ~Task();
    virtual bool IsCompleted();
    virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
    virtual void Synchronize();
    bool IsSync() const { return sync_op_; }

   protected:
    const int rank_;
    CommType comm_type_{CommType::UNKNOWN};
    std::mutex mutex_;
    bool is_completed_{false};

   private:
    bool sync_op_{true};
  };

  explicit ProcessGroup(int rank,
                        int size,
                        const platform::Place& place,
                        int gid);
  explicit ProcessGroup(int rank, int size, int gid);
  virtual ~ProcessGroup() {}

  int GetRank() const { return rank_; }
  int GetSize() const { return size_; }

  virtual const std::string GetBackendName() const = 0;

  virtual phi::DeviceContext* GetDeviceContext(const Place& place) const {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Does not support to get device_context from ProcessGroup%s.",
        GetBackendName()));
  }

  // TODO(liyurui): This API will be moved later
  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const AllreduceOptions& = AllreduceOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support allreduce", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const AllreduceOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support allreduce with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const BroadcastOptions& = BroadcastOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support broadcast", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const BroadcastOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support broadcast with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support barrier", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int64_t,
      int64_t) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int64_t,
      int64_t) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>&,    // NOLINT
      std::vector<phi::DenseTensor>&) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support all_gather", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support all_gather with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      int64_t offset,
      int64_t length) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllGather_Partial",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
      int64_t offset,
      int64_t length,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllGather_Partial",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>&,    // NOLINT
      std::vector<phi::DenseTensor>&) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllToAll", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support alltoall", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<int64_t>&,
      std::vector<int64_t>&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<int64_t>&,
      std::vector<int64_t>&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support alltoall_single", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ReduceOptions& opts) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
      const ReduceOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ScatterOptions&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support scatter", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ScatterOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support scatter with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
      std::vector<phi::DenseTensor>&,  // NOLINT
      std::vector<phi::DenseTensor>&,  // NOLINT
      const ReduceScatterOptions&,
      bool) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support reduce_scatter with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
      phi::DenseTensor&,  // NOLINT
      phi::DenseTensor&,  // NOLINT
      const ReduceScatterOptions&) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support ReduceScatter", GetBackendName()));
  }

 protected:
  const int rank_;
  const int size_;
  const platform::Place place_;
  const int gid_;
};

class ProcessGroupMapFromGid {
 public:
  bool has(int gid) {
    auto it = map_.find(gid);
    return it != map_.end();
  }

  void insert(int gid, ProcessGroup* pg) {
    // TODO(sandyhouse): address ut and uncomment the following codes
    // PADDLE_ENFORCE_EQ(has(gid), false,
    //                   platform::errors::PreconditionNotMet(
    //                       "The process group with id %d doesnot exist.",
    //                       gid));
    map_[gid] = pg;
  }

  ProcessGroup* get(int gid) {
    // TODO(sandyhouse): address ut and uncomment the following codes
    // PADDLE_ENFORCE_EQ(has(gid), true,
    //                   platform::errors::PreconditionNotMet(
    //                       "The process group with id %d doesnot exist.",
    //                       gid));
    return map_.find(gid)->second;
  }

  static std::shared_ptr<ProcessGroupMapFromGid> getInstance() {
    static auto s_instance = std::make_shared<ProcessGroupMapFromGid>();
    return s_instance;
  }

  ProcessGroupMapFromGid() = default;
  ~ProcessGroupMapFromGid() = default;

 private:
  std::unordered_map<int, ProcessGroup*> map_;
};

}  // namespace distributed
}  // namespace paddle
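
Editorial note: a minimal usage sketch (not part of this commit) of the gid-keyed registry above. The function name `RegisterGroupForExample` and its call site are illustrative assumptions; only `ProcessGroupMapFromGid`'s own API comes from the header.

#include "paddle/fluid/distributed/collective/ProcessGroup.h"

namespace paddle {
namespace distributed {

// Hypothetical helper: make a backend's ProcessGroup discoverable by the
// integer group id that op kernels carry.
void RegisterGroupForExample(ProcessGroup* pg, int gid) {
  auto map = ProcessGroupMapFromGid::getInstance();
  if (!map->has(gid)) {
    map->insert(gid, pg);  // the registry stores a raw, non-owning pointer
  }
}

}  // namespace distributed
}  // namespace paddle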
paddle/fluid/distributed/collective/ProcessGroupCustom.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
DECLARE_bool(xccl_blocking_wait);

constexpr int64_t kWaitBlockTImeout = 10;

namespace paddle {
namespace distributed {

void SyncDefaultStream(
    const std::vector<Place>& places,
    std::vector<CustomEventManager>& cclEvents,                    // NOLINT
    std::vector<std::unique_ptr<CustomDeviceContext>>& dev_ctx) {  // NOLINT
  for (size_t i = 0; i < places.size(); ++i) {
    auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places[i]));
    cclEvents[i].Record(*dev_ctx[i]);
    cclEvents[i].Block(*default_ctx);
  }
}

std::shared_ptr<ProcessGroupCustom::CustomTask> ProcessGroupCustom::CreateTask(
    std::vector<Place> places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs) {
  return std::make_shared<ProcessGroupCustom::CustomTask>(
      places, rank, comm_type, inputs);
}

ProcessGroupCustom::CustomTask::CustomTask(
    const std::vector<Place>& places,
    int rank,
    CommType CommType,
    const std::vector<phi::DenseTensor>& inputs)
    : Task(rank, inputs, CommType), places_(places) {
  control_events_.resize(places.size());
  cclComms_.resize(places.size());
}

ProcessGroupCustom::CustomTask::~CustomTask() {}

void ProcessGroupCustom::CustomTask::SetOutputs(
    std::vector<phi::DenseTensor>& outputs) {  // NOLINT
  outputs_ = std::make_shared<std::vector<phi::DenseTensor>>(outputs);
}

void ProcessGroupCustom::CustomTask::SynchronizeStreams() {
  for (size_t i = 0; i < places_.size(); ++i) {
    auto* default_ctx = static_cast<platform::CustomDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(places_[i]));
    phi::DeviceGuard guard(default_ctx->GetPlace());
    phi::stream::Stream stream(default_ctx->GetPlace(), default_ctx->stream());
    stream.WaitEvent(control_events_[i].GetCustomEvent());
  }
}

bool ProcessGroupCustom::CustomTask::IsCompleted() {
  for (size_t i = 0; i < places_.size(); ++i) {
    if (!control_events_[i].Query()) {
      return false;
    }
  }
  return true;
}

bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) {
  SynchronizeStreams();
  while (!IsCompleted()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
  }
  return true;
}

// Same as Wait
void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); }

ProcessGroupCustom::ProcessGroupCustom(const std::shared_ptr<Store>& store,
                                       int rank,
                                       int size,
                                       const platform::Place& place,
                                       int gid)
    : ProcessGroup(rank, size, place, gid),
      store_(store),
      device_type_(place.GetDeviceType()) {
  phi::DeviceManager::SetDevice(place_);
}

void ProcessGroupCustom::BroadcastUniqueCustomID(
    std::vector<phi::ccl::CCLRootId>& ccl_ids) {  // NOLINT
  if (rank_ == 0) {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
      store_->set(key, ccl_ids[i]);
    }
  } else {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
      ccl_ids[i] = store_->get(key);
    }
  }
}

// create CustomCCLManager cache for places_key
void ProcessGroupCustom::CreateCustomManagerCache(
    const std::string& places_key, const std::vector<Place>& places) {
  PADDLE_ENFORCE_EQ(
      places_key.empty(),
      false,
      platform::errors::PreconditionNotMet(
          "Not able to create/get the HCCL Communicator since "
          "the NPU place are not known"));
  const std::string device_type = places.back().GetDeviceType();

  std::vector<std::shared_ptr<CustomCCLCommManager>> ccl_comms;
  ccl_comms.resize(places.size());

  // using vector just for broadcast
  std::vector<phi::ccl::CCLRootId> ccl_ids;
  ccl_ids.resize(1);
  auto& ccl_id = ccl_ids.front();

  if (rank_ == 0) {
    phi::DeviceManager::CCLGetUniqueId(device_type, &ccl_id);
  }
  BroadcastUniqueCustomID(ccl_ids);

  VLOG(3) << "init custom ccl rank: " << rank_ << ", nranks: " << size_
          << ", place: " << places_key
          << ", custom ccl uniqueid: " << SerializeCustomCCLUniqueId(ccl_id);

  std::vector<std::unique_ptr<CustomDeviceContext>> dev_ctx;
  dev_ctx.resize(places.size());

  std::unique_ptr<phi::ccl::CCLComm> comms(
      new phi::ccl::CCLComm[places.size()]);
  for (size_t i = 0; i < places.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    ccl_comms[i] = CustomCCLCommManager::Create(
        device_type, GetSize(), GetRank(), &ccl_id, comms.get() + i);
    dev_ctx[i].reset(new CustomDeviceContext(places[i]));
  }

  std::vector<CustomEventManager> events;
  events.resize(places.size());

  // These caches will be useful to process sync/wait/communicate
  places_to_events_.emplace(places_key, std::move(events));
  places_to_customcomm_.emplace(places_key, std::move(ccl_comms));
  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}

template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    Fn fn,
    CommType op_type) {
  const auto places = GetPlaceList(inputs);
  const auto key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_customcomm_.find(key) == places_to_customcomm_.end()) {
      CreateCustomManagerCache(key, places);
    }
  }

  auto& ccl_comms = places_to_customcomm_[key];
  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  auto task = CreateTask(places, rank_, op_type, inputs);
  task->SetOutputs(outputs);

  for (size_t i = 0; i < inputs.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    const auto& ccl_stream = places_to_ctx_[key][i]->stream();
    phi::stream::Stream stream(places[i], ccl_stream);
    fn(inputs[i], outputs[i], ccl_comms[i]->GetCustomCCLComm(), stream);
  }

  for (size_t i = 0; i < inputs.size(); ++i) {
    phi::DeviceGuard guard(places[i]);
    task->control_events_[i].Record(*places_to_ctx_[key][i]);
  }
  return task;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllGather(
            device_type_,
            input.data(),
            output.data(),
            input.numel(),
            phi::ccl::ToCCLDataType(input.dtype()),
            comm,
            stream);
      },
      CommType::ALLGATHER);
}

void* XcclGetPointerByOffset(void* raw_pointer,
                             size_t offset,
                             experimental::DataType type) {
  if (type == experimental::DataType::FLOAT32) {
    return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT64) {
    return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT32) {
    return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::INT64) {
    return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
                                   offset);
  } else if (type == experimental::DataType::FLOAT16) {
    return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
                                   offset);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "This datatype in xccl is not supported."));
  }
  return nullptr;
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    int64_t offset,
    int64_t length) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllGather(
            device_type_,
            XcclGetPointerByOffset(input.data(), offset, input.dtype()),
            output.data(),
            length,
            phi::ccl::ToCCLDataType(input.dtype()),
            comm,
            stream);
      },
      CommType::ALLGATHER);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const AllreduceOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        return phi::DeviceManager::CCLAllReduce(
            device_type_,
            input.data(),
            output.data(),
            input.numel(),
            phi::ccl::ToCCLDataType(input.dtype()),
            ToCustomCCLRedType(opts.reduce_op),
            comm,
            stream);
      },
      CommType::ALLREDUCE);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
    std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
    std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
    const BroadcastOptions& opts) {
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(in_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All inputs should be in CustomPlace(%s).", device_type_));
  PADDLE_ENFORCE_EQ(
      CheckTensorsInCustomPlace(out_tensors, device_type_),
      true,
      platform::errors::InvalidArgument(
          "All outputs should be in CustomPlace(%s).", device_type_));
  return Collective(
      in_tensors,
      out_tensors,
      [&](phi::DenseTensor& input,
          phi::DenseTensor& output,
          phi::ccl::CCLComm comm,
          const phi::stream::Stream& stream) {
        int root = opts.source_rank * in_tensors.size() + opts.source_root;
        if (rank_ == root) {
          return phi::DeviceManager::CCLBroadcast(
              device_type_,
              input.data(),
              input.numel(),
              phi::ccl::ToCCLDataType(input.dtype()),
              root,
              comm,
              stream);
        } else {
          return phi::DeviceManager::CCLBroadcast(
              device_type_,
              output.data(),
              output.numel(),
              phi::ccl::ToCCLDataType(output.dtype()),
              root,
              comm,
              stream);
        }
      },
      CommType::BROADCAST);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
    const BarrierOptions& opts) {
  // Only support single card single process
  std::vector<phi::CustomPlace> places = {place_};
  std::vector<phi::DenseTensor> barrierTensors;
  barrierTensors.reserve(places.size());
  for (auto& place : places) {
    phi::DeviceGuard guard(place);
    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
    barrierTensors.push_back(
        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
  }
  auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors);
  auto xccl_task = dynamic_cast<ProcessGroupCustom::CustomTask*>(task.get());
  xccl_task->barrierTensors_ = std::move(barrierTensors);
  return task;
}

}  // namespace distributed
}  // namespace paddle
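
Editorial note: a hedged sketch (not part of this commit) of how a caller might derive the `offset`/`length` arguments for `AllGather_Partial` above so that each rank contributes one contiguous shard. The helper name `AllGatherShardForExample` and the `dense_numel` parameter are illustrative assumptions; it assumes the element count divides evenly by the group size.

#include <memory>

#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"

namespace paddle {
namespace distributed {

// Gather one contiguous per-rank shard of a flattened tensor.
std::shared_ptr<ProcessGroup::Task> AllGatherShardForExample(
    ProcessGroupCustom* pg,
    std::vector<phi::DenseTensor>& in,   // NOLINT
    std::vector<phi::DenseTensor>& out,  // NOLINT
    int64_t dense_numel) {
  const int64_t shard_len = dense_numel / pg->GetSize();  // assumes exact split
  const int64_t my_offset = shard_len * pg->GetRank();
  return pg->AllGather_Partial(in, out, my_offset, shard_len);
}

}  // namespace distributed
}  // namespace paddle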
paddle/fluid/distributed/collective/ProcessGroupCustom.h
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/CustomCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;
using CustomDeviceContext = paddle::platform::CustomDeviceContext;

class ProcessGroupCustom : public ProcessGroup {
 public:
  class CustomTask : public ProcessGroup::Task,
                     public std::enable_shared_from_this<CustomTask> {
   public:
    CustomTask(const std::vector<Place>& places,
               int rank,
               CommType CommType,
               const std::vector<phi::DenseTensor>& inputs);

    bool IsCompleted();
    void SynchronizeStreams();
    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
    void Synchronize();
    void SetOutputs(std::vector<phi::DenseTensor>& outputs);  // NOLINT
    virtual ~CustomTask();

    std::vector<CustomEventManager> control_events_;
    std::vector<phi::DenseTensor> barrierTensors_;

   protected:
    std::vector<Place> places_;
    std::vector<std::shared_ptr<CustomCCLCommManager>> cclComms_;
    std::shared_ptr<std::vector<phi::DenseTensor>> outputs_;

   private:
    const std::string device_type_;
  };

  ProcessGroupCustom(const std::shared_ptr<Store>& store,
                     int rank,
                     int size,
                     const platform::Place& place,
                     int gid);

  const std::string GetBackendName() const override {
    return "XCCL_" + device_type_;
  }

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      int64_t offset,
      int64_t length) override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Barrier(
      const BarrierOptions& = BarrierOptions()) override;

 protected:
  virtual std::shared_ptr<ProcessGroupCustom::CustomTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType opType,
      const std::vector<phi::DenseTensor>& inputs);

  std::shared_ptr<Store> store_;
  std::shared_ptr<CustomCCLCommManager> custom_comm_;
  std::mutex mutex_;
  std::unordered_map<std::string,
                     std::vector<std::shared_ptr<CustomCCLCommManager>>>
      places_to_customcomm_;
  std::unordered_map<std::string, std::vector<CustomEventManager>>
      places_to_events_;
  std::unordered_map<std::string,
                     std::vector<std::unique_ptr<CustomDeviceContext>>>
      places_to_ctx_;
  std::set<int> used_place_ids_;

 private:
  void BcastCustomId(std::vector<phi::ccl::CCLRootId>& ccl_ids,  // NOLINT
                     int root,
                     int server_fd);

  void BroadcastUniqueCustomID(
      std::vector<phi::ccl::CCLRootId>& custom_ccl_ids);  // NOLINT

  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> Collective(
      std::vector<phi::DenseTensor>& inputs,   // NOLINT
      std::vector<phi::DenseTensor>& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  void CreateCustomManagerCache(const std::string& places_key,
                                const std::vector<Place>& places);

  const std::string device_type_;
};

}  // namespace distributed
}  // namespace paddle
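
Editorial note: a hedged sketch (not part of this commit) of the task lifecycle this header exposes. The helper name `AllReduceBlockingForExample` is an illustrative assumption; the calls themselves come from the declarations above.

#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"

namespace paddle {
namespace distributed {

// Launches an allreduce on the custom device and blocks until the recorded
// device events complete; CustomTask::Wait ignores its timeout argument and
// polls IsCompleted() until the events are reached.
void AllReduceBlockingForExample(ProcessGroupCustom* pg,
                                 std::vector<phi::DenseTensor>& ins,     // NOLINT
                                 std::vector<phi::DenseTensor>& outs) {  // NOLINT
  auto task = pg->AllReduce(ins, outs, AllreduceOptions());
  task->Wait();  // defaults to kWaitTimeout
}

}  // namespace distributed
}  // namespace paddle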
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
0 → 100644
View file @
f0ef3442
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include <gloo/reduce.h>
#include <gloo/scatter.h>
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
case experimental::DataType::INT8: \
func<int8_t>(args); \
break; \
case experimental::DataType::UINT8: \
func<uint8_t>(args); \
break; \
case experimental::DataType::BOOL: \
func<bool>(args); \
break; \
case experimental::DataType::BFLOAT16: \
func<bfloat16>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
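// Editorial note (hedged): GENERATE_FUNC turns a runtime
// experimental::DataType into a compile-time template instantiation.
// For example, a call such as
//   GENERATE_FUNC(dtype, set_output, opts, out);
// expands roughly to (FLOAT32 case shown):
//   switch (dtype) {
//     case experimental::DataType::FLOAT32:
//       set_output<float>(opts, out);
//       break;
//     // ... one case per supported dtype ...
//   }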
typedef void (*reduce_func)(void*, const void*, const void*, size_t);

template <typename T>
reduce_func get_function(const ReduceOp& r) {
  switch (r) {
    case ReduceOp::SUM:
      return reduce_func(&::gloo::sum<T>);
    case ReduceOp::PRODUCT:
      return reduce_func(&::gloo::product<T>);
    case ReduceOp::MIN:
      return reduce_func(&::gloo::min<T>);
    case ReduceOp::MAX:
      return reduce_func(&::gloo::max<T>);
    case ReduceOp::AVG:
      VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
      exit(-1);
  }
  VLOG(0) << "Error: Unknown ReduceOp.";
  exit(-1);
}

template <typename T>
T* get_data(phi::DenseTensor& tensor) {  // NOLINT
  return reinterpret_cast<T*>(tensor.data());
}

template <typename T>
std::vector<T*> get_multi_data(
    std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  std::vector<T*> ret;
  ret.reserve(tensors.size());
  for (size_t i = 0; i < tensors.size(); i++) {
    ret.push_back(get_data<T>(tensors[i]));
  }
  return ret;
}

template <typename T, typename P>
void set_output(P& opts, phi::DenseTensor& tensor) {  // NOLINT
  opts.setOutput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_input(P& opts, phi::DenseTensor& tensor) {  // NOLINT
  opts.setInput(get_data<T>(tensor), tensor.numel());
}

template <typename T, typename P>
void set_outputs(P& opts,                                   // NOLINT
                 std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}

template <typename T, typename P>
void set_inputs(P& opts,                                   // NOLINT
                std::vector<phi::DenseTensor>& tensors) {  // NOLINT
  opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}

template <typename T, typename P>
void set_inputs_for_scatter(P& opts,                   // NOLINT
                            phi::DenseTensor& tensor,  // NOLINT
                            int nranks) {
  std::vector<T*> ret;
  ret.reserve(nranks);
  T* raw_pointer = reinterpret_cast<T*>(tensor.data());
  size_t offset = 0;
  for (int i = 0; i < nranks; i++) {
    ret.push_back(raw_pointer + offset);
    offset += tensor.numel() / nranks;
  }
  opts.setInputs(ret, tensor.numel() / nranks);
}

ProcessGroupGloo::GlooTask::GlooTask(
    int rank, const std::vector<phi::DenseTensor>& inputs, CommType comm_type)
    : ProcessGroup::Task(rank, inputs, comm_type) {}

ProcessGroupGloo::ProcessGroupGloo(
    const std::shared_ptr<distributed::Store>& store,
    int rank,
    int world_size,
    const platform::Place& place,
    int gid,
    const std::shared_ptr<GlooOptions> options)
    : ProcessGroup(rank, world_size, place, gid),
      _tag(0),
      _store(new GlooStore(store)) {
  _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
  auto prefix_store =
      ::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store);
  _context->connectFullMesh(prefix_store, options->device);
}

class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    int rank,
                    int root,
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
        _context(context),
        _root(root),
        _inputs(inputs),
        _outputs(outputs),
        _tag(tag) {}

  void Run() override { _do_broadcast(_inputs[0], _outputs[0]); }

 private:
  std::shared_ptr<gloo::Context> _context;
  const int _root;
  std::vector<phi::DenseTensor> _inputs{};
  std::vector<phi::DenseTensor> _outputs{};
  const uint32_t _tag;

  void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) {  // NOLINT
    gloo::BroadcastOptions opts(_context);
    const auto& dtype = in.dtype();
    if (rank_ == _root) {
      GENERATE_FUNC(dtype, set_input, opts, in);
    }
    GENERATE_FUNC(dtype, set_output, opts, out);
    opts.setRoot(_root);
    opts.setTag(_tag);
    gloo::broadcast(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const BroadcastOptions& opts) {
  auto root = opts.source_rank;
  std::unique_ptr<BroadcastGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_unique<BroadcastGlooTask>(
      context, inputs, outputs, rank_, root, tag);
  task->Run();
  return task;
}

class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  AllreduceGlooTask(int rank,
                    const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    ReduceOp reduce_op,
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _reduce_op(reduce_op),
        _tag(tag) {}

  void Run() override { _do_allreduce(_inputs, _outputs); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  const ReduceOp _reduce_op;
  uint32_t _tag;

  gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
                                             const ReduceOp op) {
    gloo::AllreduceOptions::Func fn;
    GENERATE_FUNC(type, _get_function_impl, fn, op);
    return fn;
  }

  template <typename T>
  void _get_function_impl(gloo::AllreduceOptions::Func& fn,  // NOLINT
                          const ReduceOp op) {
    fn = get_function<T>(op);
  }

  void _do_allreduce(std::vector<phi::DenseTensor>& ins,     // NOLINT
                     std::vector<phi::DenseTensor>& outs) {  // NOLINT
    const auto& dtype = ins[0].dtype();
    gloo::AllreduceOptions opts(_context);
    GENERATE_FUNC(dtype, set_inputs, opts, ins);
    GENERATE_FUNC(dtype, set_outputs, opts, outs);
    opts.setReduceFunction(_get_function(dtype, _reduce_op));
    opts.setTag(_tag);
    gloo::allreduce(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const AllreduceOptions& opts) {
  return AllReduce(inputs, outputs, opts, true);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const AllreduceOptions& opts,
    bool sync_op) {
  auto tag = next_tag();
  std::shared_ptr<GlooTask> task;
  auto context = get_context();
  task = std::make_shared<AllreduceGlooTask>(
      rank_, context, inputs, outputs, opts.reduce_op, tag);
  task->Run();
  return task;
}

class BarrierGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  BarrierGlooTask(int rank, const std::shared_ptr<gloo::Context>& context)
      : ProcessGroupGloo::GlooTask(
            rank, std::vector<phi::DenseTensor>{}, CommType::BARRIER),
        _context(context) {}

  void Run() override { _do_barrier(); }

 private:
  std::shared_ptr<gloo::Context> _context;

  void _do_barrier() {
    gloo::BarrierOptions opts(_context);
    gloo::barrier(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Barrier(
    const BarrierOptions& opts) {
  std::shared_ptr<BarrierGlooTask> task;
  auto context = get_context();
  task = std::make_shared<BarrierGlooTask>(rank_, context);
  task->Run();
  return task;
}

class AllgatherGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  AllgatherGlooTask(int rank,
                    const std::shared_ptr<gloo::Context>& context,
                    std::vector<phi::DenseTensor>& inputs,   // NOLINT
                    std::vector<phi::DenseTensor>& outputs,  // NOLINT
                    uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _tag(tag) {}

  void Run() override { _do_allgather(_inputs, _outputs); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  uint32_t _tag;

  void _do_allgather(std::vector<phi::DenseTensor>& in,     // NOLINT
                     std::vector<phi::DenseTensor>& out) {  // NOLINT
    const auto& dtype = in[0].dtype();
    gloo::AllgatherOptions opts(_context);
    GENERATE_FUNC(dtype, set_input, opts, in[0]);
    GENERATE_FUNC(dtype, set_output, opts, out[0]);
    opts.setTag(_tag);
    gloo::allgather(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors) {
  std::shared_ptr<AllgatherGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<AllgatherGlooTask>(
      rank_, context, in_tensors, out_tensors, tag);
  task->Run();
  return task;
}

class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  ReduceGlooTask(int rank,
                 const std::shared_ptr<gloo::Context>& context,
                 std::vector<phi::DenseTensor>& inputs,   // NOLINT
                 std::vector<phi::DenseTensor>& outputs,  // NOLINT
                 ReduceOp reduce_op,
                 int dst,
                 uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _reduce_op(reduce_op),
        _dst(dst),
        _tag(tag) {}

  void Run() override { _do_reduce(_inputs, _outputs, _dst); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  const ReduceOp _reduce_op;
  int _dst;
  uint32_t _tag;

  gloo::ReduceOptions::Func _get_function(const experimental::DataType type,
                                          const ReduceOp op) {
    gloo::ReduceOptions::Func fn;
    GENERATE_FUNC(type, _get_function_impl, fn, op);
    return fn;
  }

  template <typename T>
  void _get_function_impl(gloo::ReduceOptions::Func& fn,  // NOLINT
                          const ReduceOp op) {
    fn = get_function<T>(op);
  }

  void _do_reduce(std::vector<phi::DenseTensor>& inputs,   // NOLINT
                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
                  int dst) {
    const auto& dtype = inputs[0].dtype();
    gloo::ReduceOptions opts(_context);
    GENERATE_FUNC(dtype, set_input, opts, inputs[0]);
    GENERATE_FUNC(dtype, set_output, opts, outputs[0]);
    opts.setReduceFunction(_get_function(dtype, _reduce_op));
    opts.setTag(_tag);
    opts.setRoot(dst);
    gloo::reduce(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const ReduceOptions& opts) {
  std::shared_ptr<ReduceGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<ReduceGlooTask>(
      rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
  task->Run();
  return task;
}

class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
 public:
  ScatterGlooTask(int rank,
                  const std::shared_ptr<gloo::Context>& context,
                  std::vector<phi::DenseTensor>& inputs,   // NOLINT
                  std::vector<phi::DenseTensor>& outputs,  // NOLINT
                  int src,
                  int size,
                  uint32_t tag)
      : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER),
        _context(context),
        _inputs(inputs),
        _outputs(outputs),
        _src(src),
        _size(size),
        _tag(tag) {}

  void Run() override { _do_scatter(_inputs, _outputs, _src); }

 private:
  std::shared_ptr<gloo::Context> _context;
  std::vector<phi::DenseTensor> _inputs;
  std::vector<phi::DenseTensor> _outputs;
  int _src;
  int _size;
  uint32_t _tag;

  void _do_scatter(std::vector<phi::DenseTensor>& in,   // NOLINT
                   std::vector<phi::DenseTensor>& out,  // NOLINT
                   int src) {
    const auto& dtype = in[0].dtype();
    gloo::ScatterOptions opts(_context);
    if (rank_ == src) {
      GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size);
    }
    GENERATE_FUNC(dtype, set_output, opts, out[0]);
    opts.setRoot(src);
    opts.setTag(_tag);
    gloo::scatter(opts);
  }
};

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  std::shared_ptr<ScatterGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  task = std::make_shared<ScatterGlooTask>(
      rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
  task->Run();
  return task;
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
  ::gloo::transport::tcp::attr attr;
  attr.iface = ifname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
  ::gloo::transport::tcp::attr attr;
  attr.hostname = hostname;
  return ::gloo::transport::tcp::CreateDevice(attr);
}

std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
  std::array<char, HOST_NAME_MAX> hostname{};
  auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
  PADDLE_ENFORCE_EQ(
      ret,
      0,
      platform::errors::Fatal("Get hostname error for createDefaultDevice."));
  ::addrinfo* result;
  result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
  ::addrinfo* cur;
  for (cur = result; cur != nullptr; cur = cur->ai_next) {
    SocketType socket =
        ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
    if (socket == -1) {
      continue;
    }
    ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
    closesocket(socket);
#else
    close(socket);
#endif
    if (ret == -1) {
      continue;
    }
    break;
  }
  freeaddrinfo(result);
  if (cur != nullptr) {
    return createDeviceForHostname(hostname.data());
  }
  return createDeviceForHostname("127.0.0.1");
}

}  // namespace distributed
}  // namespace paddle
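
Editorial note: a hedged sketch (not part of this commit) of picking a transport device with the factory helpers defined above. It assumes they are declared as static members in ProcessGroupGloo.h, which is not shown on this page, and the environment-variable name is an illustrative assumption rather than something this file defines.

#include <cstdlib>
#include <memory>
#include <string>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"

std::shared_ptr<::gloo::transport::Device> PickGlooDeviceForExample() {
  const char* ifname = std::getenv("EXAMPLE_GLOO_IFNAME");  // hypothetical knob
  if (ifname != nullptr && *ifname != '\0') {
    // Bind the TCP transport to an explicitly named network interface.
    return paddle::distributed::ProcessGroupGloo::createDeviceForInterface(
        ifname);
  }
  // Otherwise resolve the local hostname, falling back to 127.0.0.1.
  return paddle::distributed::ProcessGroupGloo::createDefaultDevice();
}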