OpenDAS / dgl — commit d3dd8e37

[GraphBolt] Add to_dgl datapipe wrapper (#6390)

Unverified commit d3dd8e37, authored Sep 28, 2023 by peizhou001, committed by GitHub on Sep 28, 2023. Parent: d8101fe4.

Showing 6 changed files with 93 additions and 28 deletions (+93 −28).
Files changed:

  examples/sampling/graphbolt/lightning/node_classification.py   +5  −5
  examples/sampling/graphbolt/link_prediction.py                 +16 −13
  examples/sampling/graphbolt/node_classification.py             +14 −6
  examples/sampling/graphbolt/rgcn/hetero_rgcn.py                +7  −4
  python/dgl/graphbolt/minibatch_transformer.py                  +18 −0
  tests/python/pytorch/graphbolt/test_minibatch_transformer.py   +33 −0
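The change repeated across the example files below: the MiniBatch-to-DGL conversion moves out of the training loop and into the datapipe itself, so each loop body receives a DGLMiniBatch and reads data.blocks instead of calling data.to_dgl_blocks() per batch. A sketch of the pattern (not any one file; names as used in the diffs):

    # Before: convert every batch by hand inside the loop.
    for data in dataloader:
        blocks = [block.to("cuda") for block in data.to_dgl_blocks()]
        ...

    # After: register the conversion once as a pipeline stage.
    datapipe = datapipe.to_dgl()
    for data in dataloader:
        blocks = [block.to("cuda") for block in data.blocks]
        ...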
examples/sampling/graphbolt/lightning/node_classification.py

@@ -93,9 +93,8 @@ class SAGE(LightningModule):
         )

     def training_step(self, batch, batch_idx):
-        # TODO: Move this to the data pipeline as a stage.
-        blocks = [block.to("cuda") for block in batch.to_dgl_blocks()]
-        x = blocks[0].srcdata["feat"]
+        blocks = [block.to("cuda") for block in batch.blocks]
+        x = batch.node_features["feat"]
         y = batch.labels.to("cuda")
         y_hat = self(blocks, x)
         loss = F.cross_entropy(y_hat, y)
@@ -111,8 +110,8 @@ class SAGE(LightningModule):
         return loss

     def validation_step(self, batch, batch_idx):
-        blocks = [block.to("cuda") for block in batch.to_dgl_blocks()]
-        x = blocks[0].srcdata["feat"]
+        blocks = [block.to("cuda") for block in batch.blocks]
+        x = batch.node_features["feat"]
         y = batch.labels.to("cuda")
         y_hat = self(blocks, x)
         self.val_acc(torch.argmax(y_hat, 1), y)
@@ -160,6 +159,7 @@ class DataModule(LightningDataModule):
         )
         datapipe = sampler(self.graph, self.fanouts)
         datapipe = datapipe.fetch_feature(self.feature_store, ["feat"])
+        datapipe = datapipe.to_dgl()
         dataloader = gb.MultiProcessDataLoader(
             datapipe, num_workers=self.num_workers
         )
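Taken together, the three hunks mean the Lightning DataModule now yields converted batches, and both step methods read features straight off the minibatch (batch.node_features["feat"]) rather than from blocks[0].srcdata["feat"]. A condensed sketch of the resulting dataloader construction; the enclosing method name is assumed for illustration and is not shown in the diff:

    def train_dataloader(self):  # method name assumed, not in the diff
        datapipe = sampler(self.graph, self.fanouts)
        datapipe = datapipe.fetch_feature(self.feature_store, ["feat"])
        datapipe = datapipe.to_dgl()  # the stage this commit adds
        return gb.MultiProcessDataLoader(datapipe, num_workers=self.num_workers)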
examples/sampling/graphbolt/link_prediction.py

@@ -167,6 +167,18 @@ def create_dataloader(args, graph, features, itemset, is_train=True):
     ############################################################################
     datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])

+    ############################################################################
+    # [Step-4]:
+    # datapipe.to_dgl()
+    # [Input]:
+    # 'datapipe': The previous datapipe object.
+    # [Output]:
+    # A DGLMiniBatch used for computing.
+    # [Role]:
+    # Convert a mini-batch to dgl-minibatch.
+    ############################################################################
+    datapipe = datapipe.to_dgl()
+
     ############################################################################
     # [Input]:
     # 'device': The device to copy the data to.
@@ -193,19 +205,10 @@ def create_dataloader(args, graph, features, itemset, is_train=True):
     return dataloader


+# TODO[Keli]: Remove this helper function later.
 def to_binary_link_dgl_computing_pack(data: gb.MiniBatch):
     """Convert the minibatch to a training pair and a label tensor."""
-    batch_size = data.compacted_node_pairs[0].shape[0]
-    neg_ratio = data.compacted_negative_dsts.shape[0] // batch_size
-    pos_src, pos_dst = data.compacted_node_pairs
-    if data.compacted_negative_srcs is None:
-        neg_src = pos_src.repeat_interleave(neg_ratio, dim=0)
-    else:
-        neg_src = data.compacted_negative_srcs
-    neg_dst = data.compacted_negative_dsts
+    pos_src, pos_dst = data.positive_node_pairs
+    neg_src, neg_dst = data.negative_node_pairs
     node_pairs = (
         torch.cat((pos_src, neg_src), dim=0),
         torch.cat((pos_dst, neg_dst), dim=0),
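For context on the removed branch: when only negative destinations were sampled (compacted_negative_srcs is None), each positive source had to be repeated once per negative sample. A self-contained illustration of that tensor arithmetic, with invented values:

    import torch

    pos_src = torch.tensor([10, 11])  # two positive source nodes
    neg_ratio = 3                     # negatives sampled per positive edge
    # Repeat each source consecutively, once per negative destination.
    neg_src = pos_src.repeat_interleave(neg_ratio, dim=0)
    print(neg_src)  # tensor([10, 10, 10, 11, 11, 11])

The new positive_node_pairs/negative_node_pairs properties take over this bookkeeping, which is why the helper shrinks to two lines.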
@@ -234,7 +237,7 @@ def evaluate(args, graph, features, itemset, model):
         # Unpack MiniBatch.
         compacted_pairs, _ = to_binary_link_dgl_computing_pack(data)
         node_feature = data.node_features["feat"].float()
-        blocks = data.to_dgl_blocks()
+        blocks = data.blocks
         # Get the embeddings of the input nodes.
         y = model(blocks, node_feature)
@@ -272,7 +275,7 @@ def train(args, graph, features, train_set, valid_set, model):
         compacted_pairs, labels = to_binary_link_dgl_computing_pack(data)
         node_feature = data.node_features["feat"].float()
         # Convert sampled subgraphs to DGL blocks.
-        blocks = data.to_dgl_blocks()
+        blocks = data.blocks
         # Get the embeddings of the input nodes.
         y = model(blocks, node_feature)
examples/sampling/graphbolt/node_classification.py

@@ -140,6 +140,18 @@ def create_dataloader(args, graph, features, itemset, is_train=True):
     ############################################################################
     # [Step-4]:
+    # datapipe.to_dgl()
+    # [Input]:
+    # 'datapipe': The previous datapipe object.
+    # [Output]:
+    # A DGLMiniBatch used for computing.
+    # [Role]:
+    # Convert a mini-batch to dgl-minibatch.
+    ############################################################################
+    datapipe = datapipe.to_dgl()
+
+    ############################################################################
+    # [Step-5]:
     # gb.MultiProcessDataLoader()
     # [Input]:
     # 'datapipe': The datapipe object to be used for data loading.
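Read as a whole, the numbered steps describe one linear pipeline. Assembled from stages that appear elsewhere in this commit (the lightning example above and the test at the bottom), it looks roughly like this; the stages before Step-4 are inferred, not part of this hunk:

    datapipe = gb.ItemSampler(itemset, batch_size=batch_size)   # Step-1: batch seed nodes.
    datapipe = gb.NeighborSampler(datapipe, graph, fanouts)     # Step-2: sample subgraphs.
    datapipe = gb.FeatureFetcher(datapipe, features, ["feat"])  # Step-3: fetch features.
    datapipe = datapipe.to_dgl()                                # Step-4: MiniBatch -> DGLMiniBatch.
    dataloader = gb.MultiProcessDataLoader(datapipe, num_workers=0)  # Step-5: build the loader.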
@@ -167,11 +179,9 @@ def evaluate(args, model, graph, features, itemset, num_classes):
     )
     for step, data in tqdm.tqdm(enumerate(dataloader)):
-        blocks = data.to_dgl_blocks()
         x = data.node_features["feat"]
         y.append(data.labels)
-        y_hats.append(model(blocks, x))
+        y_hats.append(model(data.blocks, x))
     res = MF.accuracy(
         torch.cat(y_hats),
@@ -201,9 +211,7 @@ def train(args, graph, features, train_set, valid_set, num_classes, model):
             # in the last layer's computation graph.
             y = data.labels
-            # TODO[Mingbang]: Move the to_dgl_blocks() to a datapipe stage later
-            # The predicted labels.
-            y_hat = model(data.to_dgl_blocks(), x)
+            y_hat = model(data.blocks, x)
             # Compute loss.
             loss = F.cross_entropy(y_hat, y)
examples/sampling/graphbolt/rgcn/hetero_rgcn.py

@@ -123,6 +123,9 @@ def create_dataloader(
         node_feature_keys["institution"] = ["feat"]
     datapipe = datapipe.fetch_feature(features, node_feature_keys)

+    # Convert a mini-batch to dgl mini-batch for computing.
+    datapipe = datapipe.to_dgl()
+
     # Move the mini-batch to the appropriate device.
     # `device`:
     # The device to move the mini-batch to.
@@ -435,7 +438,7 @@ def extract_node_features(name, block, data, node_embed, device):
         )
     else:
         node_features = {
-            ntype: block.srcnodes[ntype].data["feat"].to(device)
+            ntype: data.node_features[(ntype, "feat")]
             for ntype in block.srctypes
         }
     # Original feature data are stored in float16 while model weights are
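Note the lookup key: in this heterogeneous example the fetched features are indexed by (node type, feature name) tuples, while the homogeneous examples above use the bare feature name. A minimal contrast, using the "institution" node type from the create_dataloader hunk:

    # Homogeneous minibatch: features keyed by name.
    x = data.node_features["feat"]
    # Heterogeneous minibatch: features keyed by (node_type, feature_name).
    x_inst = data.node_features[("institution", "feat")]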
@@ -495,7 +498,7 @@ def evaluate(
     y_true = list()
     for data in tqdm(data_loader, desc="Inference"):
-        blocks = [block.to(device) for block in data.to_dgl_blocks()]
+        blocks = [block.to(device) for block in data.blocks]
         node_features = extract_node_features(
             name, blocks[0], data, node_embed, device
         )
@@ -563,10 +566,10 @@ def run(
     )
     for data in tqdm(data_loader, desc=f"Training~Epoch {epoch:02d}"):
         # Fetch the number of seed nodes in the batch.
-        num_seeds = data.seed_nodes[category].shape[0]
+        num_seeds = data.output_nodes[category].shape[0]
         # Convert MiniBatch to DGL Blocks.
-        blocks = [block.to(device) for block in data.to_dgl_blocks()]
+        blocks = [block.to(device) for block in data.blocks]
         # Extract the node features from embedding layer or raw features.
         node_features = extract_node_features(
python/dgl/graphbolt/minibatch_transformer.py

@@ -35,3 +35,21 @@ class MiniBatchTransformer(Mapper):
             minibatch, MiniBatch
         ), "The transformer output should be an instance of MiniBatch"
         return minibatch
+
+
+@functional_datapipe("to_dgl")
+class DGLMiniBatchConverter(Mapper):
+    """Convert a graphbolt mini-batch to a dgl mini-batch."""
+
+    def __init__(
+        self,
+        datapipe,
+    ):
+        """
+        Initialization for a subgraph transformer.
+
+        Parameters
+        ----------
+        datapipe : DataPipe
+            The datapipe.
+        """
+        super().__init__(datapipe, MiniBatch.to_dgl)
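Because of the @functional_datapipe("to_dgl") registration, the converter can be attached either by constructing the class directly or by calling the registered method on an upstream datapipe; the examples above use the functional spelling, and the test below uses the explicit one. Both produce the same Mapper, which applies MiniBatch.to_dgl to every element flowing through:

    # Explicit construction, as in the test below.
    datapipe = gb.DGLMiniBatchConverter(feature_fetcher)
    # Functional form enabled by the decorator, as in the examples above.
    datapipe = feature_fetcher.to_dgl()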
tests/python/pytorch/graphbolt/test_minibatch_transformer.py  (new file, mode 100644)

import dgl.graphbolt as gb
import gb_test_utils
import torch


def test_dgl_minibatch_converter():
    N = 32
    B = 4
    itemset = gb.ItemSet(torch.arange(N), names="seed_nodes")
    graph = gb_test_utils.rand_csc_graph(200, 0.15)

    features = {}
    keys = [("node", None, "a"), ("node", None, "b")]
    features[keys[0]] = gb.TorchBasedFeature(torch.randn(200, 4))
    features[keys[1]] = gb.TorchBasedFeature(torch.randn(200, 4))
    feature_store = gb.BasicFeatureStore(features)

    item_sampler = gb.ItemSampler(itemset, batch_size=B)
    subgraph_sampler = gb.NeighborSampler(
        item_sampler,
        graph,
        fanouts=[torch.LongTensor([2]) for _ in range(2)],
    )
    feature_fetcher = gb.FeatureFetcher(
        subgraph_sampler,
        feature_store,
        ["a"],
    )
    dgl_converter = gb.DGLMiniBatchConverter(feature_fetcher)
    dataloader = gb.SingleProcessDataLoader(dgl_converter)
    assert len(list(dataloader)) == N // B
    minibatch = next(iter(dataloader))
    assert isinstance(minibatch, gb.DGLMiniBatch)
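To see what the converter yields, the test body can be extended with a short inspection loop (the attribute names are the ones the updated examples rely on; exact shapes depend on the random graph):

    for minibatch in dataloader:
        # Each element is a gb.DGLMiniBatch, not a raw gb.MiniBatch.
        assert isinstance(minibatch, gb.DGLMiniBatch)
        print(len(minibatch.blocks))               # 2 blocks, one per fanout layer
        print(minibatch.node_features["a"].shape)  # the fetched feature "a"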