Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
d912947b
Unverified
Commit
d912947b
authored
Sep 22, 2023
by
Rhett Ying
Committed by
GitHub
Sep 22, 2023
Browse files
[GraphBolt] enable gpu train for ogbn-mag (#6373)
parent
cef5a14a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
27 additions
and
18 deletions
+27
-18
examples/sampling/graphbolt/rgcn/README.md
examples/sampling/graphbolt/rgcn/README.md
+14
-7
examples/sampling/graphbolt/rgcn/hetero_rgcn.py
examples/sampling/graphbolt/rgcn/hetero_rgcn.py
+13
-11
No files found.
examples/sampling/graphbolt/rgcn/README.md
View file @
d912947b
...
...
@@ -4,18 +4,25 @@ This example aims to demonstrate how to run node classification task on heteroge
## Run on `ogbn-mag` dataset
###
Command
###
Sample on CPU and train/infer on CPU
```
python3 hetero_rgcn.py
python3 hetero_rgcn.py
--dataset ogbn-mag
```
### Statistics of train/validation/test
Below results are run on AWS EC2 r6idn.metal, 1024GB RAM, 128 vCPUs(Ice Lake 8375C), 0 GPUs.
### Sample on CPU and train/infer on GPU
```
python3 hetero_rgcn.py --dataset ogbn-mag --num_gpus 1
```
| Dataset Size | Peak CPU RAM Usage | Time Per Epoch(Training) | Time Per Epoch(Inference: train/val/test set) |
| ------------ | ------------- | ------------------------ | --------------------------- |
| ~1.1GB | ~5GB | ~3min | ~1min40s + ~0min9s + ~0min7s |
### Resource usage and time cost
Below results are roughly collected from an AWS EC2
**g4dn.metal**
, 384GB RAM, 96 vCPUs(Cascade Lake P-8259L), 8 NVIDIA T4 GPUs.
| Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) | Time Per Epoch(Inference: train/val/test set) |
| ------------ | ------------- | ----------- | ---------- | --------- | --------------------------- |
| ~1.1GB | ~5GB | 0 | 0GB | ~4min5s | ~2min7s + ~0min12s + ~0min8s |
| ~1.1GB | ~4.3GB | 1 | 4.7GB | ~1min18s | ~1min54s + ~0min12s + ~0min8s |
### Accuracies
```
Final performance:
All runs:
...
...
examples/sampling/graphbolt/rgcn/hetero_rgcn.py
View file @
d912947b
...
...
@@ -76,7 +76,6 @@ def load_dataset(dataset_name):
valid_set
=
dataset
.
tasks
[
0
].
validation_set
test_set
=
dataset
.
tasks
[
0
].
test_set
num_classes
=
dataset
.
tasks
[
0
].
metadata
[
"num_classes"
]
print
(
len
(
train_set
),
len
(
valid_set
),
len
(
test_set
))
return
graph
,
features
,
train_set
,
valid_set
,
test_set
,
num_classes
...
...
@@ -127,6 +126,8 @@ def create_dataloader(
# Move the mini-batch to the appropriate device.
# `device`:
# The device to move the mini-batch to.
# [TODO] Moving `MiniBatch` to GPU is not supported yet.
device
=
th
.
device
(
"cpu"
)
datapipe
=
datapipe
.
copy_to
(
device
)
# Create a DataLoader from the datapipe.
...
...
@@ -424,11 +425,14 @@ class Logger(object):
def
extract_node_features
(
name
,
block
,
data
,
node_embed
,
device
):
"""Extract the node features from embedding layer or raw features."""
if
name
==
"ogbn-mag"
:
input_nodes
=
{
k
:
v
.
to
(
device
)
for
k
,
v
in
data
.
input_nodes
.
items
()}
# Extract node embeddings for the input nodes.
node_features
=
extract_embed
(
node_embed
,
data
.
input_nodes
)
node_features
=
extract_embed
(
node_embed
,
input_nodes
)
# Add the batch's raw "paper" features. Corresponds to the content
# in the function `rel_graph_embed` comment.
node_features
.
update
({
"paper"
:
data
.
node_features
[(
"paper"
,
"feat"
)]})
node_features
.
update
(
{
"paper"
:
data
.
node_features
[(
"paper"
,
"feat"
)].
to
(
device
)}
)
else
:
node_features
=
{
ntype
:
block
.
srcnodes
[
ntype
].
data
[
"feat"
]
...
...
@@ -491,7 +495,7 @@ def evaluate(
y_true
=
list
()
for
data
in
tqdm
(
data_loader
,
desc
=
"Inference"
):
blocks
=
data
.
to_dgl_blocks
()
blocks
=
[
block
.
to
(
device
)
for
block
in
data
.
to_dgl_blocks
()
]
node_features
=
extract_node_features
(
name
,
blocks
[
0
],
data
,
node_embed
,
device
)
...
...
@@ -503,7 +507,7 @@ def evaluate(
# argmax.
y_hat
=
logits
.
log_softmax
(
dim
=-
1
).
argmax
(
dim
=
1
,
keepdims
=
True
)
y_hats
.
append
(
y_hat
.
cpu
())
y_true
.
append
(
data
.
labels
[
category
].
long
()
.
cpu
()
)
y_true
.
append
(
data
.
labels
[
category
].
long
())
y_pred
=
th
.
cat
(
y_hats
,
dim
=
0
)
y_true
=
th
.
cat
(
y_true
,
dim
=
0
)
...
...
@@ -562,7 +566,7 @@ def run(
num_seeds
=
data
.
seed_nodes
[
category
].
shape
[
0
]
# Convert MiniBatch to DGL Blocks.
blocks
=
data
.
to_dgl_blocks
()
blocks
=
[
block
.
to
(
device
)
for
block
in
data
.
to_dgl_blocks
()
]
# Extract the node features from embedding layer or raw features.
node_features
=
extract_node_features
(
...
...
@@ -574,7 +578,7 @@ def run(
# Generate predictions.
logits
=
model
(
node_features
,
blocks
)[
category
]
y_hat
=
logits
.
log_softmax
(
dim
=-
1
)
y_hat
=
logits
.
log_softmax
(
dim
=-
1
)
.
cpu
()
loss
=
F
.
nll_loss
(
y_hat
,
data
.
labels
[
category
].
long
())
loss
.
backward
()
optimizer
.
step
()
...
...
@@ -625,9 +629,7 @@ def run(
def
main
(
args
):
if
args
.
gpu
>
0
:
raise
RuntimeError
(
"GPU training is not supported."
)
device
=
th
.
device
(
"cpu"
)
device
=
th
.
device
(
"cuda"
)
if
args
.
num_gpus
>
0
else
th
.
device
(
"cpu"
)
# Initialize a logger.
logger
=
Logger
(
args
.
runs
)
...
...
@@ -729,7 +731,7 @@ if __name__ == "__main__":
)
parser
.
add_argument
(
"--runs"
,
type
=
int
,
default
=
5
)
parser
.
add_argument
(
"--num_workers"
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
"--gpu"
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
"--
num_
gpu
s
"
,
type
=
int
,
default
=
0
)
args
=
parser
.
parse_args
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment