OpenDAS / dgl, commit 5dfaf99e (unverified)

[Performance] Add NCCL support (#5929)

Authored Oct 12, 2023 by Israt Nisa; committed via GitHub on Oct 12, 2023
Co-authored-by: Israt Nisa <nisisrat@amazon.com>
Parent: e594b4a8
Showing 2 changed files with 95 additions and 15 deletions (+95 -15):

  python/dgl/distributed/optim/pytorch/sparse_optim.py  (+31 -13)
  python/dgl/distributed/optim/pytorch/utils.py         (+64 -2)
python/dgl/distributed/optim/pytorch/sparse_optim.py (file mode 100644 → 100755)

@@ -12,7 +12,7 @@ from .... import backend as F
 from ...dist_tensor import DistTensor
 from ...graph_partition_book import EDGE_PART_POLICY, NODE_PART_POLICY
 from ...nn.pytorch import DistEmbedding
-from .utils import alltoall_cpu, alltoallv_cpu
+from .utils import alltoall, alltoallv

 EMB_STATES = "emb_states"
 WORLD_SIZE = "world_size"
@@ -256,9 +256,13 @@ class DistSparseGradOptimizer(abc.ABC):
         of the embeddings involved in a mini-batch to DGL's servers and update the embeddings.
         """
         with th.no_grad():
+            device = (
+                th.device(f"cuda:{self._rank}")
+                if th.distributed.get_backend() == "nccl"
+                else th.device("cpu")
+            )
             local_indics = {emb.name: [] for emb in self._params}
             local_grads = {emb.name: [] for emb in self._params}
-            device = th.device("cpu")
             for emb in self._params:
                 name = emb.weight.name
                 kvstore = emb.weight.kvstore
@@ -310,7 +314,11 @@ class DistSparseGradOptimizer(abc.ABC):
                     if trainers_per_server <= 1:
                         idx_split_size.append(
-                            th.tensor([idx_i.shape[0]], dtype=th.int64)
+                            th.tensor(
+                                [idx_i.shape[0]],
+                                dtype=th.int64,
+                                device=device,
+                            )
                         )
                         idics_list.append(idx_i)
                         grad_list.append(grad_i)
@@ -323,7 +331,11 @@ class DistSparseGradOptimizer(abc.ABC):
                         idx_j = idx_i[mask]
                         grad_j = grad_i[mask]
                         idx_split_size.append(
-                            th.tensor([idx_j.shape[0]], dtype=th.int64)
+                            th.tensor(
+                                [idx_j.shape[0]],
+                                dtype=th.int64,
+                                device=device,
+                            )
                         )
                         idics_list.append(idx_j)
                         grad_list.append(grad_j)
@@ -336,39 +348,45 @@ class DistSparseGradOptimizer(abc.ABC):
                 # Note: If we have GPU nccl support, we can use all_to_all to
                 # sync information here
                 gather_list = list(
-                    th.empty([self._world_size], dtype=th.int64).chunk(
-                        self._world_size
-                    )
+                    th.empty(
+                        [self._world_size], dtype=th.int64, device=device
+                    ).chunk(self._world_size)
                 )
-                alltoall_cpu(
+                alltoall(
                     self._rank,
                     self._world_size,
                     gather_list,
                     idx_split_size,
+                    device,
                 )
-                # use cpu until we have GPU alltoallv
                 idx_gather_list = [
-                    th.empty((int(num_emb),), dtype=idics.dtype)
+                    th.empty(
+                        (int(num_emb),), dtype=idics.dtype, device=device
+                    )
                     for num_emb in gather_list
                 ]
-                alltoallv_cpu(
+                alltoallv(
                     self._rank,
                     self._world_size,
                     idx_gather_list,
                     idics_list,
+                    device,
                 )
                 local_indics[name] = idx_gather_list
                 grad_gather_list = [
                     th.empty(
-                        (int(num_emb), grads.shape[1]), dtype=grads.dtype
+                        (int(num_emb), grads.shape[1]),
+                        dtype=grads.dtype,
+                        device=device,
                     )
                     for num_emb in gather_list
                 ]
-                alltoallv_cpu(
+                alltoallv(
                     self._rank,
                     self._world_size,
                     grad_gather_list,
                     grad_list,
+                    device,
                 )
                 local_grads[name] = grad_gather_list
             else:
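The substance of the optimizer-side change is that the index and gradient exchange buffers in DistSparseGradOptimizer.step() are now staged on this rank's GPU whenever the trainer process group uses the NCCL backend, and stay on the CPU otherwise. A minimal standalone sketch of that selection logic (the helper name is illustrative, not part of the commit):

import torch as th
import torch.distributed as dist

def _staging_device(rank):
    # Mirrors the device choice added to step(): NCCL collectives operate on
    # CUDA tensors, so buffers are allocated on this rank's GPU; any other
    # backend (e.g. gloo) keeps the original CPU path.
    # Assumes dist.init_process_group() has already been called.
    if dist.get_backend() == "nccl":
        return th.device(f"cuda:{rank}")
    return th.device("cpu")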
python/dgl/distributed/optim/pytorch/utils.py

@@ -13,7 +13,7 @@ def alltoall_cpu(rank, world_size, output_tensor_list, input_tensor_list):
     rank : int
         The rank of current worker
     world_size : int
-        The size of the entire
+        The size of the entire communicator
     output_tensor_list : List of tensor
         The received tensors
     input_tensor_list : List of tensor
@@ -37,7 +37,7 @@ def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
     rank : int
         The rank of current worker
     world_size : int
-        The size of the entire
+        The size of the entire communicator
     output_tensor_list : List of tensor
         The received tensors
     input_tensor_list : List of tensor
@@ -60,3 +60,65 @@ def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list):
             dist.recv(output_tensor_list[i], src=i)
     th.distributed.barrier()
+
+
+def alltoall(rank, world_size, output_tensor_list, input_tensor_list, device):
+    """Each process scatters list of input tensors to all processes in a cluster
+    and return gathered list of tensors in output list. The tensors should have the same shape.
+
+    Parameters
+    ----------
+    rank : int
+        The rank of current worker
+    world_size : int
+        The size of the entire communicator
+    output_tensor_list : List of tensor
+        The received tensors
+    input_tensor_list : List of tensor
+        The tensors to exchange
+    device: th.device
+        Device of the tensors
+    """
+    if th.distributed.get_backend() == "nccl":
+        input_tensor_list = [
+            tensor.to(th.device(device)) for tensor in input_tensor_list
+        ]
+        th.distributed.all_to_all(output_tensor_list, input_tensor_list)
+    else:
+        alltoall_cpu(
+            rank,
+            world_size,
+            output_tensor_list,
+            input_tensor_list,
+        )
+
+
+def alltoallv(rank, world_size, output_tensor_list, input_tensor_list, device):
+    """Each process scatters list of input tensors to all processes in a cluster
+    and return gathered list of tensors in output list.
+
+    Parameters
+    ----------
+    rank : int
+        The rank of current worker
+    world_size : int
+        The size of the entire communicator
+    output_tensor_list : List of tensor
+        The received tensors
+    input_tensor_list : List of tensor
+        The tensors to exchange
+    device: th.device
+        Device of the tensors
+    """
+    if th.distributed.get_backend() == "nccl":
+        input_tensor_list = [
+            tensor.to(th.device(device)) for tensor in input_tensor_list
+        ]
+        th.distributed.all_to_all(output_tensor_list, input_tensor_list)
+    else:
+        alltoallv_cpu(
+            rank,
+            world_size,
+            output_tensor_list,
+            input_tensor_list,
+        )
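A hypothetical driver showing how the new backend-dispatching wrapper could be called; the module path matches the file changed above, but the surrounding setup (process-group initialization, tensor shapes, the exchange() helper) is illustrative only and not part of the commit:

import torch as th
import torch.distributed as dist
from dgl.distributed.optim.pytorch.utils import alltoall

def exchange(rank, world_size, device):
    # Assumes dist.init_process_group() was already called with "nccl" or "gloo";
    # for NCCL, `device` should be this rank's GPU, e.g. th.device(f"cuda:{rank}").
    # Each rank sends one 4-element tensor to every peer. On the NCCL path the
    # wrapper moves the inputs to `device` itself, but the receive buffers must
    # already live there, matching how step() pre-allocates its gather lists.
    inputs = [th.full((4,), rank, dtype=th.int64) for _ in range(world_size)]
    outputs = [
        th.empty((4,), dtype=th.int64, device=device)
        for _ in range(world_size)
    ]
    alltoall(rank, world_size, outputs, inputs, device)
    return outputs  # outputs[i] now holds the tensor sent by rank i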