OpenDAS / dgl · Commit 704bcaf6 (unverified)

examples (#5323)

Authored Feb 19, 2023 by Hongzhi (Steve), Chen; committed by GitHub on Feb 19, 2023.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
Parent commit: 6bc82161
Changes: 332 · Showing 20 changed files with 921 additions and 485 deletions (+921 −485).
Changed files:

- examples/pytorch/gat/train.py (+2 −2)
- examples/pytorch/gat/train_ppi.py (+57 −25)
- examples/pytorch/gatv2/train.py (+8 −5)
- examples/pytorch/gcmc/data.py (+420 −176)
- examples/pytorch/gcmc/model.py (+95 −78)
- examples/pytorch/gcmc/train.py (+7 −2)
- examples/pytorch/gcmc/train_sampling.py (+271 −151)
- examples/pytorch/gcn/train.py (+3 −3)
- examples/pytorch/geniepath/model.py (+1 −1)
- examples/pytorch/geniepath/ppi.py (+2 −2)
- examples/pytorch/geniepath/pubmed.py (+2 −2)
- examples/pytorch/ggnn/data_utils.py (+3 −3)
- examples/pytorch/ggnn/ggnn_gc.py (+1 −1)
- examples/pytorch/ggnn/ggnn_ns.py (+2 −3)
- examples/pytorch/ggnn/ggsnn.py (+1 −1)
- examples/pytorch/gin/train.py (+2 −2)
- examples/pytorch/gnn_explainer/explain_main.py (+10 −6)
- examples/pytorch/gnn_explainer/models.py (+8 −6)
- examples/pytorch/gnn_explainer/train_main.py (+25 −14)
- examples/pytorch/grace/aug.py (+1 −2)
examples/pytorch/gat/train.py

```diff
 import argparse
+
+import dgl.nn as dglnn
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl.nn as dglnn
 from dgl import AddSelfLoop
 from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
...
```
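For readers unfamiliar with the AddSelfLoop transform imported above, here is a minimal sketch of how such a transform is typically attached to one of these citation datasets. The choice of Cora is illustrative only; the `transform` keyword is DGL's standard dataset hook.

```python
# Illustrative sketch: AddSelfLoop as a dataset transform, so that
# GAT/GCN layers never see zero-in-degree nodes.
from dgl import AddSelfLoop
from dgl.data import CoraGraphDataset

transform = AddSelfLoop()  # appends a self-loop to every node
data = CoraGraphDataset(transform=transform)
g = data[0]  # the transformed citation graph
```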
examples/pytorch/gat/train_ppi.py

```diff
+import dgl.nn as dglnn
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl.nn as dglnn
 from dgl.data.ppi import PPIDataset
 from dgl.dataloading import GraphDataLoader
 from sklearn.metrics import f1_score


 class GAT(nn.Module):
     def __init__(self, in_size, hid_size, out_size, heads):
         super().__init__()
         self.gat_layers = nn.ModuleList()
         # three-layer GAT
-        self.gat_layers.append(dglnn.GATConv(in_size, hid_size, heads[0], activation=F.elu))
-        self.gat_layers.append(dglnn.GATConv(hid_size * heads[0], hid_size, heads[1], residual=True, activation=F.elu))
-        self.gat_layers.append(dglnn.GATConv(hid_size * heads[1], out_size, heads[2], residual=True, activation=None))
+        self.gat_layers.append(
+            dglnn.GATConv(in_size, hid_size, heads[0], activation=F.elu)
+        )
+        self.gat_layers.append(
+            dglnn.GATConv(
+                hid_size * heads[0],
+                hid_size,
+                heads[1],
+                residual=True,
+                activation=F.elu,
+            )
+        )
+        self.gat_layers.append(
+            dglnn.GATConv(
+                hid_size * heads[1],
+                out_size,
+                heads[2],
+                residual=True,
+                activation=None,
+            )
+        )

     def forward(self, g, inputs):
         h = inputs
         for i, layer in enumerate(self.gat_layers):
             h = layer(g, h)
             if i == 2:  # last layer
                 h = h.mean(1)
             else:  # other layer(s)
                 h = h.flatten(1)
         return h


 def evaluate(g, features, labels, model):
     model.eval()
     with torch.no_grad():
         output = model(g, features)
         pred = np.where(output.data.cpu().numpy() >= 0, 1, 0)
-        score = f1_score(labels.data.cpu().numpy(), pred, average='micro')
+        score = f1_score(labels.data.cpu().numpy(), pred, average="micro")
         return score


 def evaluate_in_batches(dataloader, device, model):
     total_score = 0
     for batch_id, batched_graph in enumerate(dataloader):
         batched_graph = batched_graph.to(device)
-        features = batched_graph.ndata['feat']
-        labels = batched_graph.ndata['label']
+        features = batched_graph.ndata["feat"]
+        labels = batched_graph.ndata["label"]
         score = evaluate(batched_graph, features, labels, model)
         total_score += score
     return total_score / (batch_id + 1)  # return average score


 def train(train_dataloader, val_dataloader, device, model):
     # define loss function and optimizer
...

@@ -57,44 +79,54 @@ def train(train_dataloader, val_dataloader, device, model):
         # mini-batch loop
         for batch_id, batched_graph in enumerate(train_dataloader):
             batched_graph = batched_graph.to(device)
-            features = batched_graph.ndata['feat'].float()
-            labels = batched_graph.ndata['label'].float()
+            features = batched_graph.ndata["feat"].float()
+            labels = batched_graph.ndata["label"].float()
             logits = model(batched_graph, features)
             loss = loss_fcn(logits, labels)
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
             total_loss += loss.item()
-        print("Epoch {:05d} | Loss {:.4f} |".format(epoch, total_loss / (batch_id + 1)))
+        print(
+            "Epoch {:05d} | Loss {:.4f} |".format(
+                epoch, total_loss / (batch_id + 1)
+            )
+        )
         if (epoch + 1) % 5 == 0:
-            avg_score = evaluate_in_batches(val_dataloader, device, model)  # evaluate F1-score instead of loss
+            avg_score = evaluate_in_batches(
+                val_dataloader, device, model
+            )  # evaluate F1-score instead of loss
             print(" Acc. (F1-score) {:.4f} ".format(avg_score))


-if __name__ == '__main__':
-    print(f'Training PPI Dataset with DGL built-in GATConv module.')
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+if __name__ == "__main__":
+    print(f"Training PPI Dataset with DGL built-in GATConv module.")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

     # load and preprocess datasets
-    train_dataset = PPIDataset(mode='train')
-    val_dataset = PPIDataset(mode='valid')
-    test_dataset = PPIDataset(mode='test')
-    features = train_dataset[0].ndata['feat']
+    train_dataset = PPIDataset(mode="train")
+    val_dataset = PPIDataset(mode="valid")
+    test_dataset = PPIDataset(mode="test")
+    features = train_dataset[0].ndata["feat"]

     # create GAT model
     in_size = features.shape[1]
     out_size = train_dataset.num_labels
     model = GAT(in_size, 256, out_size, heads=[4, 4, 6]).to(device)

     # model training
-    print('Training...')
+    print("Training...")
     train_dataloader = GraphDataLoader(train_dataset, batch_size=2)
     val_dataloader = GraphDataLoader(val_dataset, batch_size=2)
     train(train_dataloader, val_dataloader, device, model)

     # test the model
-    print('Testing...')
+    print("Testing...")
     test_dataloader = GraphDataLoader(test_dataset, batch_size=2)
     avg_score = evaluate_in_batches(test_dataloader, device, model)
     print("Test Accuracy (F1-score) {:.4f}".format(avg_score))
```
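A quick way to sanity-check the reformatted three-layer GAT is a forward pass on a random graph. This is a minimal sketch, assuming the `GAT` class above is in scope; the feature size (50) and graph shape are arbitrary, and 121 matches the PPI label count.

```python
import dgl
import torch

g = dgl.add_self_loop(dgl.rand_graph(100, 500))  # avoid 0-in-degree nodes
feats = torch.randn(100, 50)                     # toy features, in_size=50
model = GAT(in_size=50, hid_size=256, out_size=121, heads=[4, 4, 6])
out = model(g, feats)  # heads are averaged in the last layer
print(out.shape)       # -> torch.Size([100, 121])
```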
examples/pytorch/gatv2/train.py

```diff
...
@@ -6,15 +6,19 @@ Multiple heads are also batched together for faster training.
 import argparse
 import time
+
+import dgl
 import numpy as np
 import torch
 import torch.nn.functional as F
+from dgl.data import (
+    CiteseerGraphDataset,
+    CoraGraphDataset,
+    PubmedGraphDataset,
+    register_data_args,
+)
 from gatv2 import GATv2
-import dgl
-from dgl.data import (
-    CiteseerGraphDataset,
-    CoraGraphDataset,
-    PubmedGraphDataset,
-    register_data_args
-)


 class EarlyStopping:
     def __init__(self, patience=10):
...

@@ -180,7 +184,6 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="GAT")
     register_data_args(parser)
     parser.add_argument(
...
```
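The hunk above shows only the `EarlyStopping` signature (`patience=10`); its body is collapsed. As a reading aid, a generic patience-based stopper looks roughly like the sketch below. The method name and the improvement test are assumptions for illustration, not the file's actual code.

```python
class EarlyStoppingSketch:
    """Generic patience-based early stopping (illustrative sketch)."""

    def __init__(self, patience=10):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def step(self, score):
        # Reset the counter on improvement; otherwise count toward patience.
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return self.early_stop
```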
examples/pytorch/gcmc/data.py

```diff
 """MovieLens dataset"""
-import numpy as np
 import os
 import re
+
+import dgl
+import numpy as np
 import pandas as pd
 import scipy.sparse as sp
 import torch as th
-import dgl
 from dgl.data.utils import download, extract_archive, get_download_dir
 from utils import to_etype_name

 _urls = {
-    'ml-100k': 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
-    'ml-1m': 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-    'ml-10m': 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
+    "ml-100k": "http://files.grouplens.org/datasets/movielens/ml-100k.zip",
+    "ml-1m": "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
+    "ml-10m": "http://files.grouplens.org/datasets/movielens/ml-10m.zip",
 }

 READ_DATASET_PATH = get_download_dir()
-GENRES_ML_100K = \
-    ['unknown', 'Action', 'Adventure', 'Animation',
-     'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
-     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
-     'Thriller', 'War', 'Western']
+GENRES_ML_100K = [
+    "unknown",
+    "Action",
+    "Adventure",
+    "Animation",
+    "Children",
+    "Comedy",
+    "Crime",
+    "Documentary",
+    "Drama",
+    "Fantasy",
+    "Film-Noir",
+    "Horror",
+    "Musical",
+    "Mystery",
+    "Romance",
+    "Sci-Fi",
+    "Thriller",
+    "War",
+    "Western",
+]
 GENRES_ML_1M = GENRES_ML_100K[1:]
-GENRES_ML_10M = GENRES_ML_100K + ['IMAX']
+GENRES_ML_10M = GENRES_ML_100K + ["IMAX"]


 class MovieLens(object):
     """MovieLens dataset used by GCMC model
...

@@ -97,9 +114,17 @@ class MovieLens(object):
         Ratio of validation data
     """

-    def __init__(self, name, device, mix_cpu_gpu=False,
-                 use_one_hot_fea=False, symm=True,
-                 test_ratio=0.1, valid_ratio=0.1):
+    def __init__(
+        self,
+        name,
+        device,
+        mix_cpu_gpu=False,
+        use_one_hot_fea=False,
+        symm=True,
+        test_ratio=0.1,
+        valid_ratio=0.1,
+    ):
         self._name = name
         self._device = device
         self._symm = symm
...

@@ -107,57 +132,106 @@ class MovieLens(object):
         self._valid_ratio = valid_ratio

         # download and extract
         download_dir = get_download_dir()
-        zip_file_path = '{}/{}.zip'.format(download_dir, name)
+        zip_file_path = "{}/{}.zip".format(download_dir, name)
         download(_urls[name], path=zip_file_path)
-        extract_archive(zip_file_path, '{}/{}'.format(download_dir, name))
-        if name == 'ml-10m':
-            root_folder = 'ml-10M100K'
+        extract_archive(zip_file_path, "{}/{}".format(download_dir, name))
+        if name == "ml-10m":
+            root_folder = "ml-10M100K"
         else:
             root_folder = name
         self._dir = os.path.join(download_dir, name, root_folder)
         print("Starting processing {} ...".format(self._name))
         self._load_raw_user_info()
         self._load_raw_movie_info()
-        print('......')
-        if self._name == 'ml-100k':
-            self.all_train_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t')
-            self.test_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t')
-            self.all_rating_info = pd.concat([self.all_train_rating_info, self.test_rating_info])
-        elif self._name == 'ml-1m' or self._name == 'ml-10m':
-            self.all_rating_info = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::')
-            num_test = int(np.ceil(self.all_rating_info.shape[0] * self._test_ratio))
+        print("......")
+        if self._name == "ml-100k":
+            self.all_train_rating_info = self._load_raw_rates(
+                os.path.join(self._dir, "u1.base"), "\t"
+            )
+            self.test_rating_info = self._load_raw_rates(
+                os.path.join(self._dir, "u1.test"), "\t"
+            )
+            self.all_rating_info = pd.concat(
+                [self.all_train_rating_info, self.test_rating_info]
+            )
+        elif self._name == "ml-1m" or self._name == "ml-10m":
+            self.all_rating_info = self._load_raw_rates(
+                os.path.join(self._dir, "ratings.dat"), "::"
+            )
+            num_test = int(
+                np.ceil(self.all_rating_info.shape[0] * self._test_ratio)
+            )
             shuffled_idx = np.random.permutation(self.all_rating_info.shape[0])
-            self.test_rating_info = self.all_rating_info.iloc[shuffled_idx[:num_test]]
-            self.all_train_rating_info = self.all_rating_info.iloc[shuffled_idx[num_test:]]
+            self.test_rating_info = self.all_rating_info.iloc[
+                shuffled_idx[:num_test]
+            ]
+            self.all_train_rating_info = self.all_rating_info.iloc[
+                shuffled_idx[num_test:]
+            ]
         else:
             raise NotImplementedError
-        print('......')
-        num_valid = int(np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio))
-        shuffled_idx = np.random.permutation(self.all_train_rating_info.shape[0])
-        self.valid_rating_info = self.all_train_rating_info.iloc[shuffled_idx[:num_valid]]
-        self.train_rating_info = self.all_train_rating_info.iloc[shuffled_idx[num_valid:]]
-        self.possible_rating_values = np.unique(self.train_rating_info["rating"].values)
+        print("......")
+        num_valid = int(
+            np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio)
+        )
+        shuffled_idx = np.random.permutation(
+            self.all_train_rating_info.shape[0]
+        )
+        self.valid_rating_info = self.all_train_rating_info.iloc[
+            shuffled_idx[:num_valid]
+        ]
+        self.train_rating_info = self.all_train_rating_info.iloc[
+            shuffled_idx[num_valid:]
+        ]
+        self.possible_rating_values = np.unique(
+            self.train_rating_info["rating"].values
+        )

         print("All rating pairs : {}".format(self.all_rating_info.shape[0]))
-        print("\tAll train rating pairs : {}".format(self.all_train_rating_info.shape[0]))
-        print("\t\tTrain rating pairs : {}".format(self.train_rating_info.shape[0]))
-        print("\t\tValid rating pairs : {}".format(self.valid_rating_info.shape[0]))
-        print("\tTest rating pairs : {}".format(self.test_rating_info.shape[0]))
+        print(
+            "\tAll train rating pairs : {}".format(
+                self.all_train_rating_info.shape[0]
+            )
+        )
+        print(
+            "\t\tTrain rating pairs : {}".format(
+                self.train_rating_info.shape[0]
+            )
+        )
+        print(
+            "\t\tValid rating pairs : {}".format(
+                self.valid_rating_info.shape[0]
+            )
+        )
+        print(
+            "\tTest rating pairs : {}".format(self.test_rating_info.shape[0])
+        )

-        self.user_info = self._drop_unseen_nodes(orign_info=self.user_info,
-                                                 cmp_col_name="id",
-                                                 reserved_ids_set=set(self.all_rating_info["user_id"].values),
-                                                 label="user")
-        self.movie_info = self._drop_unseen_nodes(orign_info=self.movie_info,
-                                                  cmp_col_name="id",
-                                                  reserved_ids_set=set(self.all_rating_info["movie_id"].values),
-                                                  label="movie")
+        self.user_info = self._drop_unseen_nodes(
+            orign_info=self.user_info,
+            cmp_col_name="id",
+            reserved_ids_set=set(self.all_rating_info["user_id"].values),
+            label="user",
+        )
+        self.movie_info = self._drop_unseen_nodes(
+            orign_info=self.movie_info,
+            cmp_col_name="id",
+            reserved_ids_set=set(self.all_rating_info["movie_id"].values),
+            label="movie",
+        )

         # Map user/movie to the global id
-        self.global_user_id_map = {ele: i for i, ele in enumerate(self.user_info['id'])}
-        self.global_movie_id_map = {ele: i for i, ele in enumerate(self.movie_info['id'])}
-        print('Total user number = {}, movie number = {}'.format(len(self.global_user_id_map),
-                                                                 len(self.global_movie_id_map)))
+        self.global_user_id_map = {
+            ele: i for i, ele in enumerate(self.user_info["id"])
+        }
+        self.global_movie_id_map = {
+            ele: i for i, ele in enumerate(self.movie_info["id"])
+        }
+        print(
+            "Total user number = {}, movie number = {}".format(
+                len(self.global_user_id_map), len(self.global_movie_id_map)
+            )
+        )
         self._num_user = len(self.global_user_id_map)
         self._num_movie = len(self.global_movie_id_map)
...

@@ -171,8 +245,12 @@ class MovieLens(object):
                 self.user_feature = th.FloatTensor(self._process_user_fea())
                 self.movie_feature = th.FloatTensor(self._process_movie_fea())
             else:
-                self.user_feature = th.FloatTensor(self._process_user_fea()).to(self._device)
-                self.movie_feature = th.FloatTensor(self._process_movie_fea()).to(self._device)
+                self.user_feature = th.FloatTensor(
+                    self._process_user_fea()
+                ).to(self._device)
+                self.movie_feature = th.FloatTensor(
+                    self._process_movie_fea()
+                ).to(self._device)

         if self.user_feature is None:
             self.user_feature_shape = (self.num_user, self.num_user)
             self.movie_feature_shape = (self.num_movie, self.num_movie)
...

@@ -184,16 +262,29 @@ class MovieLens(object):
         info_line += "\nmovie: {}".format(self.movie_feature_shape)
         print(info_line)

-        all_train_rating_pairs, all_train_rating_values = self._generate_pair_value(self.all_train_rating_info)
-        train_rating_pairs, train_rating_values = self._generate_pair_value(self.train_rating_info)
-        valid_rating_pairs, valid_rating_values = self._generate_pair_value(self.valid_rating_info)
-        test_rating_pairs, test_rating_values = self._generate_pair_value(self.test_rating_info)
+        (
+            all_train_rating_pairs,
+            all_train_rating_values,
+        ) = self._generate_pair_value(self.all_train_rating_info)
+        train_rating_pairs, train_rating_values = self._generate_pair_value(
+            self.train_rating_info
+        )
+        valid_rating_pairs, valid_rating_values = self._generate_pair_value(
+            self.valid_rating_info
+        )
+        test_rating_pairs, test_rating_values = self._generate_pair_value(
+            self.test_rating_info
+        )

         def _make_labels(ratings):
-            labels = th.LongTensor(np.searchsorted(self.possible_rating_values, ratings)).to(device)
+            labels = th.LongTensor(
+                np.searchsorted(self.possible_rating_values, ratings)
+            ).to(device)
             return labels

-        self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True)
+        self.train_enc_graph = self._generate_enc_graph(
+            train_rating_pairs, train_rating_values, add_support=True
+        )
         self.train_dec_graph = self._generate_dec_graph(train_rating_pairs)
         self.train_labels = _make_labels(train_rating_values)
         self.train_truths = th.FloatTensor(train_rating_values).to(device)
...

@@ -203,7 +294,9 @@ class MovieLens(object):
         self.valid_labels = _make_labels(valid_rating_values)
         self.valid_truths = th.FloatTensor(valid_rating_values).to(device)

-        self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True)
+        self.test_enc_graph = self._generate_enc_graph(
+            all_train_rating_pairs, all_train_rating_values, add_support=True
+        )
         self.test_dec_graph = self._generate_dec_graph(test_rating_pairs)
         self.test_labels = _make_labels(test_rating_values)
         self.test_truths = th.FloatTensor(test_rating_values).to(device)
...

@@ -215,71 +308,118 @@ class MovieLens(object):
                 rst += graph.number_of_edges(str(r))
             return rst

-        print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.train_enc_graph.number_of_nodes('user'),
-            self.train_enc_graph.number_of_nodes('movie'),
-            _npairs(self.train_enc_graph)))
-        print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.train_dec_graph.number_of_nodes('user'),
-            self.train_dec_graph.number_of_nodes('movie'),
-            self.train_dec_graph.number_of_edges()))
-        print("Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.valid_enc_graph.number_of_nodes('user'),
-            self.valid_enc_graph.number_of_nodes('movie'),
-            _npairs(self.valid_enc_graph)))
-        print("Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.valid_dec_graph.number_of_nodes('user'),
-            self.valid_dec_graph.number_of_nodes('movie'),
-            self.valid_dec_graph.number_of_edges()))
-        print("Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.test_enc_graph.number_of_nodes('user'),
-            self.test_enc_graph.number_of_nodes('movie'),
-            _npairs(self.test_enc_graph)))
-        print("Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
-            self.test_dec_graph.number_of_nodes('user'),
-            self.test_dec_graph.number_of_nodes('movie'),
-            self.test_dec_graph.number_of_edges()))
+        print(
+            "Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.train_enc_graph.number_of_nodes("user"),
+                self.train_enc_graph.number_of_nodes("movie"),
+                _npairs(self.train_enc_graph),
+            )
+        )
+        print(
+            "Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.train_dec_graph.number_of_nodes("user"),
+                self.train_dec_graph.number_of_nodes("movie"),
+                self.train_dec_graph.number_of_edges(),
+            )
+        )
+        print(
+            "Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.valid_enc_graph.number_of_nodes("user"),
+                self.valid_enc_graph.number_of_nodes("movie"),
+                _npairs(self.valid_enc_graph),
+            )
+        )
+        print(
+            "Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.valid_dec_graph.number_of_nodes("user"),
+                self.valid_dec_graph.number_of_nodes("movie"),
+                self.valid_dec_graph.number_of_edges(),
+            )
+        )
+        print(
+            "Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.test_enc_graph.number_of_nodes("user"),
+                self.test_enc_graph.number_of_nodes("movie"),
+                _npairs(self.test_enc_graph),
+            )
+        )
+        print(
+            "Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+                self.test_dec_graph.number_of_nodes("user"),
+                self.test_dec_graph.number_of_nodes("movie"),
+                self.test_dec_graph.number_of_edges(),
+            )
+        )

     def _generate_pair_value(self, rating_info):
-        rating_pairs = (np.array([self.global_user_id_map[ele] for ele in rating_info["user_id"]],
-                                 dtype=np.int64),
-                        np.array([self.global_movie_id_map[ele] for ele in rating_info["movie_id"]],
-                                 dtype=np.int64))
+        rating_pairs = (
+            np.array(
+                [
+                    self.global_user_id_map[ele]
+                    for ele in rating_info["user_id"]
+                ],
+                dtype=np.int64,
+            ),
+            np.array(
+                [
+                    self.global_movie_id_map[ele]
+                    for ele in rating_info["movie_id"]
+                ],
+                dtype=np.int64,
+            ),
+        )
         rating_values = rating_info["rating"].values.astype(np.float32)
         return rating_pairs, rating_values

-    def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
-        user_movie_R = np.zeros((self._num_user, self._num_movie), dtype=np.float32)
+    def _generate_enc_graph(
+        self, rating_pairs, rating_values, add_support=False
+    ):
+        user_movie_R = np.zeros(
+            (self._num_user, self._num_movie), dtype=np.float32
+        )
         user_movie_R[rating_pairs] = rating_values

         data_dict = dict()
-        num_nodes_dict = {'user': self._num_user, 'movie': self._num_movie}
+        num_nodes_dict = {"user": self._num_user, "movie": self._num_movie}
         rating_row, rating_col = rating_pairs
         for rating in self.possible_rating_values:
             ridx = np.where(rating_values == rating)
             rrow = rating_row[ridx]
             rcol = rating_col[ridx]
             rating = to_etype_name(rating)
-            data_dict.update({
-                ('user', str(rating), 'movie'): (rrow, rcol),
-                ('movie', 'rev-%s' % str(rating), 'user'): (rcol, rrow)
-            })
+            data_dict.update(
+                {
+                    ("user", str(rating), "movie"): (rrow, rcol),
+                    ("movie", "rev-%s" % str(rating), "user"): (rcol, rrow),
+                }
+            )
         graph = dgl.heterograph(data_dict, num_nodes_dict=num_nodes_dict)

         # sanity check
-        assert len(rating_pairs[0]) == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2
+        assert (
+            len(rating_pairs[0])
+            == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2
+        )

         if add_support:
+
             def _calc_norm(x):
-                x = x.numpy().astype('float32')
-                x[x == 0.] = np.inf
-                x = th.FloatTensor(1. / np.sqrt(x))
+                x = x.numpy().astype("float32")
+                x[x == 0.0] = np.inf
+                x = th.FloatTensor(1.0 / np.sqrt(x))
                 return x.unsqueeze(1)

             user_ci = []
             user_cj = []
             movie_ci = []
             movie_cj = []
             for r in self.possible_rating_values:
                 r = to_etype_name(r)
-                user_ci.append(graph['rev-%s' % r].in_degrees())
+                user_ci.append(graph["rev-%s" % r].in_degrees())
                 movie_ci.append(graph[r].in_degrees())
                 if self._symm:
                     user_cj.append(graph[r].out_degrees())
-                    movie_cj.append(graph['rev-%s' % r].out_degrees())
+                    movie_cj.append(graph["rev-%s" % r].out_degrees())
                 else:
                     user_cj.append(th.zeros((self.num_user,)))
                     movie_cj.append(th.zeros((self.num_movie,)))
...

@@ -289,10 +429,14 @@ class MovieLens(object):
             user_cj = _calc_norm(sum(user_cj))
             movie_cj = _calc_norm(sum(movie_cj))
         else:
-            user_cj = th.ones(self.num_user,)
-            movie_cj = th.ones(self.num_movie,)
-        graph.nodes['user'].data.update({'ci': user_ci, 'cj': user_cj})
-        graph.nodes['movie'].data.update({'ci': movie_ci, 'cj': movie_cj})
+            user_cj = th.ones(
+                self.num_user,
+            )
+            movie_cj = th.ones(
+                self.num_movie,
+            )
+        graph.nodes["user"].data.update({"ci": user_ci, "cj": user_cj})
+        graph.nodes["movie"].data.update({"ci": movie_ci, "cj": movie_cj})

         return graph
...

@@ -300,10 +444,16 @@ class MovieLens(object):
         ones = np.ones_like(rating_pairs[0])
         user_movie_ratings_coo = sp.coo_matrix(
             (ones, rating_pairs),
-            shape=(self.num_user, self.num_movie), dtype=np.float32)
-        g = dgl.bipartite_from_scipy(user_movie_ratings_coo, utype='_U', etype='_E', vtype='_V')
-        return dgl.heterograph({('user', 'rate', 'movie'): g.edges()},
-                               num_nodes_dict={'user': self.num_user, 'movie': self.num_movie})
+            shape=(self.num_user, self.num_movie),
+            dtype=np.float32,
+        )
+        g = dgl.bipartite_from_scipy(
+            user_movie_ratings_coo, utype="_U", etype="_E", vtype="_V"
+        )
+        return dgl.heterograph(
+            {("user", "rate", "movie"): g.edges()},
+            num_nodes_dict={"user": self.num_user, "movie": self.num_movie},
+        )

     @property
     def num_links(self):
...

@@ -317,15 +467,24 @@ class MovieLens(object):
     def num_movie(self):
         return self._num_movie

-    def _drop_unseen_nodes(self, orign_info, cmp_col_name, reserved_ids_set, label):
+    def _drop_unseen_nodes(
+        self, orign_info, cmp_col_name, reserved_ids_set, label
+    ):
         # print(" -----------------")
         # print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set),
         #                                                    len(set(orign_info[cmp_col_name].values))))
         if reserved_ids_set != set(orign_info[cmp_col_name].values):
-            pd_rating_ids = pd.DataFrame(list(reserved_ids_set), columns=["id_graph"])
+            pd_rating_ids = pd.DataFrame(
+                list(reserved_ids_set), columns=["id_graph"]
+            )
             # print("\torign_info: ({}, {})".format(orign_info.shape[0], orign_info.shape[1]))
-            data_info = orign_info.merge(pd_rating_ids, left_on=cmp_col_name, right_on='id_graph', how='outer')
-            data_info = data_info.dropna(subset=[cmp_col_name, 'id_graph'])
+            data_info = orign_info.merge(
+                pd_rating_ids,
+                left_on=cmp_col_name,
+                right_on="id_graph",
+                how="outer",
+            )
+            data_info = data_info.dropna(subset=[cmp_col_name, "id_graph"])
             data_info = data_info.drop(columns=["id_graph"])
             data_info = data_info.reset_index(drop=True)
             # print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1]))
...

@@ -354,10 +513,18 @@ class MovieLens(object):
         rating_info : pd.DataFrame
         """
-        rating_info = pd.read_csv(
-            file_path, sep=sep, header=None,
-            names=['user_id', 'movie_id', 'rating', 'timestamp'],
-            dtype={'user_id': np.int32, 'movie_id': np.int32,
-                   'ratings': np.float32, 'timestamp': np.int64}, engine='python')
+        rating_info = pd.read_csv(
+            file_path,
+            sep=sep,
+            header=None,
+            names=["user_id", "movie_id", "rating", "timestamp"],
+            dtype={
+                "user_id": np.int32,
+                "movie_id": np.int32,
+                "ratings": np.float32,
+                "timestamp": np.int64,
+            },
+            engine="python",
+        )
         return rating_info

     def _load_raw_user_info(self):
...

@@ -379,20 +546,40 @@ class MovieLens(object):
         -------
         user_info : pd.DataFrame
         """
-        if self._name == 'ml-100k':
-            self.user_info = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None,
-                                         names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
-        elif self._name == 'ml-1m':
-            self.user_info = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None,
-                                         names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
-        elif self._name == 'ml-10m':
+        if self._name == "ml-100k":
+            self.user_info = pd.read_csv(
+                os.path.join(self._dir, "u.user"),
+                sep="|",
+                header=None,
+                names=["id", "age", "gender", "occupation", "zip_code"],
+                engine="python",
+            )
+        elif self._name == "ml-1m":
+            self.user_info = pd.read_csv(
+                os.path.join(self._dir, "users.dat"),
+                sep="::",
+                header=None,
+                names=["id", "gender", "age", "occupation", "zip_code"],
+                engine="python",
+            )
+        elif self._name == "ml-10m":
             rating_info = pd.read_csv(
-                os.path.join(self._dir, 'ratings.dat'), sep='::', header=None,
-                names=['user_id', 'movie_id', 'rating', 'timestamp'],
-                dtype={'user_id': np.int32, 'movie_id': np.int32, 'ratings': np.float32,
-                       'timestamp': np.int64}, engine='python')
-            self.user_info = pd.DataFrame(np.unique(rating_info['user_id'].values.astype(np.int32)),
-                                          columns=['id'])
+                os.path.join(self._dir, "ratings.dat"),
+                sep="::",
+                header=None,
+                names=["user_id", "movie_id", "rating", "timestamp"],
+                dtype={
+                    "user_id": np.int32,
+                    "movie_id": np.int32,
+                    "ratings": np.float32,
+                    "timestamp": np.int64,
+                },
+                engine="python",
+            )
+            self.user_info = pd.DataFrame(
+                np.unique(rating_info["user_id"].values.astype(np.int32)),
+                columns=["id"],
+            )
         else:
             raise NotImplementedError
...

@@ -412,20 +599,36 @@ class MovieLens(object):
         user_features : np.ndarray
         """
-        if self._name == 'ml-100k' or self._name == 'ml-1m':
-            ages = self.user_info['age'].values.astype(np.float32)
-            gender = (self.user_info['gender'] == 'F').values.astype(np.float32)
-            all_occupations = set(self.user_info['occupation'])
+        if self._name == "ml-100k" or self._name == "ml-1m":
+            ages = self.user_info["age"].values.astype(np.float32)
+            gender = (self.user_info["gender"] == "F").values.astype(
+                np.float32
+            )
+            all_occupations = set(self.user_info["occupation"])
             occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
-            occupation_one_hot = np.zeros(shape=(self.user_info.shape[0], len(all_occupations)),
-                                          dtype=np.float32)
-            occupation_one_hot[np.arange(self.user_info.shape[0]),
-                               np.array([occupation_map[ele] for ele in self.user_info['occupation']])] = 1
-            user_features = np.concatenate([ages.reshape((self.user_info.shape[0], 1)) / 50.0,
-                                            gender.reshape((self.user_info.shape[0], 1)),
-                                            occupation_one_hot], axis=1)
-        elif self._name == 'ml-10m':
-            user_features = np.zeros(shape=(self.user_info.shape[0], 1), dtype=np.float32)
+            occupation_one_hot = np.zeros(
+                shape=(self.user_info.shape[0], len(all_occupations)),
+                dtype=np.float32,
+            )
+            occupation_one_hot[
+                np.arange(self.user_info.shape[0]),
+                np.array(
+                    [
+                        occupation_map[ele]
+                        for ele in self.user_info["occupation"]
+                    ]
+                ),
+            ] = 1
+            user_features = np.concatenate(
+                [
+                    ages.reshape((self.user_info.shape[0], 1)) / 50.0,
+                    gender.reshape((self.user_info.shape[0], 1)),
+                    occupation_one_hot,
+                ],
+                axis=1,
+            )
+        elif self._name == "ml-10m":
+            user_features = np.zeros(
+                shape=(self.user_info.shape[0], 1), dtype=np.float32
+            )
         else:
             raise NotImplementedError
         return user_features
...

@@ -453,35 +656,57 @@ class MovieLens(object):
        For ml-100k, the column name is ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]]
        For ml-1m and ml-10m, the column name is ['id', 'title'] + [GENRES (18/20)]]
        """
-        if self._name == 'ml-100k':
+        if self._name == "ml-100k":
             GENRES = GENRES_ML_100K
-        elif self._name == 'ml-1m':
+        elif self._name == "ml-1m":
             GENRES = GENRES_ML_1M
-        elif self._name == 'ml-10m':
+        elif self._name == "ml-10m":
             GENRES = GENRES_ML_10M
         else:
             raise NotImplementedError

-        if self._name == 'ml-100k':
-            file_path = os.path.join(self._dir, 'u.item')
-            self.movie_info = pd.read_csv(file_path, sep='|', header=None,
-                                          names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
-                                          encoding='iso-8859-1')
-        elif self._name == 'ml-1m' or self._name == 'ml-10m':
-            file_path = os.path.join(self._dir, 'movies.dat')
-            movie_info = pd.read_csv(file_path, sep='::', header=None,
-                                     names=['id', 'title', 'genres'], encoding='iso-8859-1')
+        if self._name == "ml-100k":
+            file_path = os.path.join(self._dir, "u.item")
+            self.movie_info = pd.read_csv(
+                file_path,
+                sep="|",
+                header=None,
+                names=[
+                    "id",
+                    "title",
+                    "release_date",
+                    "video_release_date",
+                    "url",
+                ]
+                + GENRES,
+                encoding="iso-8859-1",
+            )
+        elif self._name == "ml-1m" or self._name == "ml-10m":
+            file_path = os.path.join(self._dir, "movies.dat")
+            movie_info = pd.read_csv(
+                file_path,
+                sep="::",
+                header=None,
+                names=["id", "title", "genres"],
+                encoding="iso-8859-1",
+            )
             genre_map = {ele: i for i, ele in enumerate(GENRES)}
-            genre_map['Children\'s'] = genre_map['Children']
-            genre_map['Childrens'] = genre_map['Children']
-            movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32)
-            for i, genres in enumerate(movie_info['genres']):
-                for ele in genres.split('|'):
+            genre_map["Children's"] = genre_map["Children"]
+            genre_map["Childrens"] = genre_map["Children"]
+            movie_genres = np.zeros(
+                shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32
+            )
+            for i, genres in enumerate(movie_info["genres"]):
+                for ele in genres.split("|"):
                     if ele in genre_map:
                         movie_genres[i, genre_map[ele]] = 1.0
                     else:
-                        print('genres not found, filled with unknown: {}'.format(genres))
-                        movie_genres[i, genre_map['unknown']] = 1.0
+                        print(
+                            "genres not found, filled with unknown: {}".format(
+                                genres
+                            )
+                        )
+                        movie_genres[i, genre_map["unknown"]] = 1.0
             for idx, genre_name in enumerate(GENRES):
                 assert idx == genre_map[genre_name]
                 movie_info[genre_name] = movie_genres[:, idx]
...

@@ -506,39 +731,58 @@ class MovieLens(object):
         import torchtext
         from torchtext.data.utils import get_tokenizer

-        if self._name == 'ml-100k':
+        if self._name == "ml-100k":
             GENRES = GENRES_ML_100K
-        elif self._name == 'ml-1m':
+        elif self._name == "ml-1m":
             GENRES = GENRES_ML_1M
-        elif self._name == 'ml-10m':
+        elif self._name == "ml-10m":
             GENRES = GENRES_ML_10M
         else:
             raise NotImplementedError

         # Old torchtext-legacy API commented below
         # TEXT = torchtext.legacy.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
-        tokenizer = get_tokenizer('spacy', language='en_core_web_sm')  # new API (torchtext 0.9+)
-        embedding = torchtext.vocab.GloVe(name='840B', dim=300)
-        title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32)
-        release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32)
-        p = re.compile(r'(.+)\s*\((\d+)\)')
-        for i, title in enumerate(self.movie_info['title']):
+        tokenizer = get_tokenizer(
+            "spacy", language="en_core_web_sm"
+        )  # new API (torchtext 0.9+)
+        embedding = torchtext.vocab.GloVe(name="840B", dim=300)
+        title_embedding = np.zeros(
+            shape=(self.movie_info.shape[0], 300), dtype=np.float32
+        )
+        release_years = np.zeros(
+            shape=(self.movie_info.shape[0], 1), dtype=np.float32
+        )
+        p = re.compile(r"(.+)\s*\((\d+)\)")
+        for i, title in enumerate(self.movie_info["title"]):
             match_res = p.match(title)
             if match_res is None:
-                print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
+                print(
+                    "{} cannot be matched, index={}, name={}".format(
+                        title, i, self._name
+                    )
+                )
                 title_context, year = title, 1950
             else:
                 title_context, year = match_res.groups()
             # We use average of glove
             # Upgraded torchtext API: TEXT.tokenize(title_context) --> tokenizer(title_context)
-            title_embedding[i, :] = embedding.get_vecs_by_tokens(tokenizer(title_context)).numpy().mean(axis=0)
+            title_embedding[i, :] = (
+                embedding.get_vecs_by_tokens(tokenizer(title_context))
+                .numpy()
+                .mean(axis=0)
+            )
             release_years[i] = float(year)
-        movie_features = np.concatenate((title_embedding,
-                                         (release_years - 1950.0) / 100.0,
-                                         self.movie_info[GENRES]),
-                                        axis=1)
+        movie_features = np.concatenate(
+            (
+                title_embedding,
+                (release_years - 1950.0) / 100.0,
+                self.movie_info[GENRES],
+            ),
+            axis=1,
+        )
         return movie_features


-if __name__ == '__main__':
-    MovieLens("ml-100k", device=th.device('cpu'), symm=True)
+if __name__ == "__main__":
+    MovieLens("ml-100k", device=th.device("cpu"), symm=True)
```
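The core of `_generate_enc_graph` above is the per-rating edge-type layout: one forward and one reverse edge type per observed rating value. A self-contained toy version of that construction (the array values are made up, and `str(int(r))` stands in for `to_etype_name`):

```python
import dgl
import numpy as np
import torch as th

users = th.tensor([0, 0, 1])        # toy user indices
movies = th.tensor([0, 1, 1])       # toy movie indices
ratings = np.array([1.0, 5.0, 5.0])  # toy rating per (user, movie) pair

data_dict = {}
for r in np.unique(ratings):
    idx = th.tensor(np.where(ratings == r)[0])
    name = str(int(r))  # stand-in for to_etype_name(r)
    data_dict[("user", name, "movie")] = (users[idx], movies[idx])
    data_dict[("movie", "rev-%s" % name, "user")] = (movies[idx], users[idx])

g = dgl.heterograph(data_dict, num_nodes_dict={"user": 2, "movie": 2})
print(g.etypes)  # per-rating edge types, e.g. '1', '5', 'rev-1', 'rev-5'
```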
examples/pytorch/gcmc/model.py

```diff
 """NN modules"""
+import dgl.function as fn
+import dgl.nn.pytorch as dglnn
 import torch as th
 import torch.nn as nn
 from torch.nn import init
-import dgl.function as fn
-import dgl.nn.pytorch as dglnn
 from utils import get_activation, to_etype_name


 class GCMCGraphConv(nn.Module):
     """Graph convolution module used in the GCMC model.
...

@@ -23,12 +24,10 @@ class GCMCGraphConv(nn.Module):
         Which device to put data in. Useful in mix_cpu_gpu training and
         multi-gpu training
     """
-    def __init__(self,
-                 in_feats,
-                 out_feats,
-                 weight=True,
-                 device=None,
-                 dropout_rate=0.0):
+
+    def __init__(
+        self, in_feats, out_feats, weight=True, device=None, dropout_rate=0.0
+    ):
         super(GCMCGraphConv, self).__init__()
         self._in_feats = in_feats
         self._out_feats = out_feats
...

@@ -38,7 +37,7 @@ class GCMCGraphConv(nn.Module):
         if weight:
             self.weight = nn.Parameter(th.Tensor(in_feats, out_feats))
         else:
-            self.register_parameter('weight', None)
+            self.register_parameter("weight", None)
         self.reset_parameters()

     def reset_parameters(self):
...

@@ -70,17 +69,19 @@ class GCMCGraphConv(nn.Module):
         """
         with graph.local_scope():
             if isinstance(feat, tuple):
                 feat, _ = feat  # dst feature not used
-            cj = graph.srcdata['cj']
-            ci = graph.dstdata['ci']
+            cj = graph.srcdata["cj"]
+            ci = graph.dstdata["ci"]
             if self.device is not None:
                 cj = cj.to(self.device)
                 ci = ci.to(self.device)
             if weight is not None:
                 if self.weight is not None:
-                    raise DGLError('External weight is provided while at the same time the'
-                                   ' module has defined its own weight parameter. Please'
-                                   ' create the module with flag weight=False.')
+                    raise DGLError(
+                        "External weight is provided while at the same time the"
+                        " module has defined its own weight parameter. Please"
+                        " create the module with flag weight=False."
+                    )
             else:
                 weight = self.weight
...

@@ -88,14 +89,16 @@ class GCMCGraphConv(nn.Module):
                 feat = dot_or_identity(feat, weight, self.device)

             feat = feat * self.dropout(cj)
-            graph.srcdata['h'] = feat
-            graph.update_all(fn.copy_u(u='h', out='m'), fn.sum(msg='m', out='h'))
-            rst = graph.dstdata['h']
+            graph.srcdata["h"] = feat
+            graph.update_all(
+                fn.copy_u(u="h", out="m"), fn.sum(msg="m", out="h")
+            )
+            rst = graph.dstdata["h"]
             rst = rst * ci

         return rst


 class GCMCLayer(nn.Module):
     r"""GCMC layer
...

@@ -144,18 +147,21 @@ class GCMCLayer(nn.Module):
         Which device to put data in. Useful in mix_cpu_gpu training and
         multi-gpu training
     """
-    def __init__(self,
-                 rating_vals,
-                 user_in_units,
-                 movie_in_units,
-                 msg_units,
-                 out_units,
-                 dropout_rate=0.0,
-                 agg='stack',  # or 'sum'
-                 agg_act=None,
-                 out_act=None,
-                 share_user_item_param=False,
-                 device=None):
+
+    def __init__(
+        self,
+        rating_vals,
+        user_in_units,
+        movie_in_units,
+        msg_units,
+        out_units,
+        dropout_rate=0.0,
+        agg="stack",  # or 'sum'
+        agg_act=None,
+        out_act=None,
+        share_user_item_param=False,
+        device=None,
+    ):
         super(GCMCLayer, self).__init__()
         self.rating_vals = rating_vals
         self.agg = agg
...

@@ -165,7 +171,7 @@ class GCMCLayer(nn.Module):
             self.ifc = self.ufc
         else:
             self.ifc = nn.Linear(msg_units, out_units)
-        if agg == 'stack':
+        if agg == "stack":
             # divide the original msg unit size by number of ratings to keep
             # the dimensionality
             assert msg_units % len(rating_vals) == 0
...

@@ -176,32 +182,42 @@ class GCMCLayer(nn.Module):
         for rating in rating_vals:
             # PyTorch parameter name can't contain "."
             rating = to_etype_name(rating)
-            rev_rating = 'rev-%s' % rating
+            rev_rating = "rev-%s" % rating
             if share_user_item_param and user_in_units == movie_in_units:
-                self.W_r[rating] = nn.Parameter(th.randn(user_in_units, msg_units))
-                self.W_r['rev-%s' % rating] = self.W_r[rating]
-                subConv[rating] = GCMCGraphConv(user_in_units, msg_units, weight=False, device=device, dropout_rate=dropout_rate)
-                subConv[rev_rating] = GCMCGraphConv(user_in_units, msg_units, weight=False, device=device, dropout_rate=dropout_rate)
+                self.W_r[rating] = nn.Parameter(
+                    th.randn(user_in_units, msg_units)
+                )
+                self.W_r["rev-%s" % rating] = self.W_r[rating]
+                subConv[rating] = GCMCGraphConv(
+                    user_in_units,
+                    msg_units,
+                    weight=False,
+                    device=device,
+                    dropout_rate=dropout_rate,
+                )
+                subConv[rev_rating] = GCMCGraphConv(
+                    user_in_units,
+                    msg_units,
+                    weight=False,
+                    device=device,
+                    dropout_rate=dropout_rate,
+                )
             else:
                 self.W_r = None
-                subConv[rating] = GCMCGraphConv(user_in_units, msg_units, weight=True, device=device, dropout_rate=dropout_rate)
-                subConv[rev_rating] = GCMCGraphConv(movie_in_units, msg_units, weight=True, device=device, dropout_rate=dropout_rate)
+                subConv[rating] = GCMCGraphConv(
+                    user_in_units,
+                    msg_units,
+                    weight=True,
+                    device=device,
+                    dropout_rate=dropout_rate,
+                )
+                subConv[rev_rating] = GCMCGraphConv(
+                    movie_in_units,
+                    msg_units,
+                    weight=True,
+                    device=device,
+                    dropout_rate=dropout_rate,
+                )
         self.conv = dglnn.HeteroGraphConv(subConv, aggregate=agg)
         self.agg_act = get_activation(agg_act)
         self.out_act = get_activation(out_act)
...

@@ -248,16 +264,20 @@ class GCMCLayer(nn.Module):
         new_ifeat : torch.Tensor
             New movie features
         """
-        in_feats = {'user': ufeat, 'movie': ifeat}
+        in_feats = {"user": ufeat, "movie": ifeat}
         mod_args = {}
         for i, rating in enumerate(self.rating_vals):
             rating = to_etype_name(rating)
-            rev_rating = 'rev-%s' % rating
-            mod_args[rating] = (self.W_r[rating] if self.W_r is not None else None,)
-            mod_args[rev_rating] = (self.W_r[rev_rating] if self.W_r is not None else None,)
+            rev_rating = "rev-%s" % rating
+            mod_args[rating] = (
+                self.W_r[rating] if self.W_r is not None else None,
+            )
+            mod_args[rev_rating] = (
+                self.W_r[rev_rating] if self.W_r is not None else None,
+            )
         out_feats = self.conv(graph, in_feats, mod_args=mod_args)
-        ufeat = out_feats['user']
-        ifeat = out_feats['movie']
+        ufeat = out_feats["user"]
+        ifeat = out_feats["movie"]
         ufeat = ufeat.view(ufeat.shape[0], -1)
         ifeat = ifeat.view(ifeat.shape[0], -1)
...

@@ -270,6 +290,7 @@ class GCMCLayer(nn.Module):
         ifeat = self.ifc(ifeat)
         return self.out_act(ufeat), self.out_act(ifeat)


 class BiDecoder(nn.Module):
     r"""Bi-linear decoder.
...

@@ -296,17 +317,14 @@ class BiDecoder(nn.Module):
     dropout_rate : float, optional
         Dropout rate (Default: 0.0)
     """
-    def __init__(self,
-                 in_units,
-                 num_classes,
-                 num_basis=2,
-                 dropout_rate=0.0):
+
+    def __init__(self, in_units, num_classes, num_basis=2, dropout_rate=0.0):
         super(BiDecoder, self).__init__()
         self._num_basis = num_basis
         self.dropout = nn.Dropout(dropout_rate)
-        self.Ps = nn.ParameterList(
-            nn.Parameter(th.randn(in_units, in_units))
-            for _ in range(num_basis))
+        self.Ps = nn.ParameterList(
+            nn.Parameter(th.randn(in_units, in_units))
+            for _ in range(num_basis)
+        )
         self.combine_basis = nn.Linear(self._num_basis, num_classes, bias=False)
         self.reset_parameters()
...

@@ -335,16 +353,17 @@ class BiDecoder(nn.Module):
         with graph.local_scope():
             ufeat = self.dropout(ufeat)
             ifeat = self.dropout(ifeat)
-            graph.nodes['movie'].data['h'] = ifeat
+            graph.nodes["movie"].data["h"] = ifeat
             basis_out = []
             for i in range(self._num_basis):
-                graph.nodes['user'].data['h'] = ufeat @ self.Ps[i]
-                graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
-                basis_out.append(graph.edata['sr'])
+                graph.nodes["user"].data["h"] = ufeat @ self.Ps[i]
+                graph.apply_edges(fn.u_dot_v("h", "h", "sr"))
+                basis_out.append(graph.edata["sr"])
             out = th.cat(basis_out, dim=1)
             out = self.combine_basis(out)
         return out


 class DenseBiDecoder(nn.Module):
     r"""Dense bi-linear decoder.
...

@@ -363,11 +382,8 @@ class DenseBiDecoder(nn.Module):
     dropout_rate : float, optional
         Dropout rate (Default: 0.0)
     """
-    def __init__(self,
-                 in_units,
-                 num_classes,
-                 num_basis=2,
-                 dropout_rate=0.0):
+
+    def __init__(self, in_units, num_classes, num_basis=2, dropout_rate=0.0):
         super().__init__()
         self._num_basis = num_basis
         self.dropout = nn.Dropout(dropout_rate)
...

@@ -399,10 +415,11 @@ class DenseBiDecoder(nn.Module):
         """
         ufeat = self.dropout(ufeat)
         ifeat = self.dropout(ifeat)
-        out = th.einsum('ai,bij,aj->ab', ufeat, self.P, ifeat)
+        out = th.einsum("ai,bij,aj->ab", ufeat, self.P, ifeat)
         out = self.combine_basis(out)
         return out


 def dot_or_identity(A, B, device=None):
     # if A is None, treat as identity matrix
     if A is None:
...
```
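The `th.einsum("ai,bij,aj->ab", ...)` in DenseBiDecoder computes a bilinear score uᵀPᵦv per (user, item) pair and per basis matrix Pᵦ. A small sketch verifying that reading against an explicit loop; the shapes are arbitrary and the variable names are for illustration only:

```python
import torch as th

num_pairs, units, num_basis = 4, 8, 2
ufeat = th.randn(num_pairs, units)   # user features, one row per pair
ifeat = th.randn(num_pairs, units)   # item features, one row per pair
P = th.randn(num_basis, units, units)  # stacked basis matrices

out = th.einsum("ai,bij,aj->ab", ufeat, P, ifeat)  # (num_pairs, num_basis)
# Loop-based equivalent: for each basis b, score u^T P_b v per pair.
manual = th.stack(
    [((ufeat @ P[b]) * ifeat).sum(dim=1) for b in range(num_basis)], dim=1
)
assert th.allclose(out, manual, atol=1e-5)
```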
examples/pytorch/gcmc/train.py

```diff
...
@@ -14,8 +14,13 @@ import torch as th
 import torch.nn as nn
 from data import MovieLens
 from model import BiDecoder, GCMCLayer
-from utils import (MetricLogger, get_activation, get_optimizer,
-                   torch_net_info, torch_total_param_num)
+from utils import (
+    get_activation,
+    get_optimizer,
+    MetricLogger,
+    torch_net_info,
+    torch_total_param_num,
+)


 class Net(nn.Module):
...
```
examples/pytorch/gcmc/train_sampling.py
View file @
704bcaf6
...
...
@@ -4,38 +4,49 @@ The script loads the full graph in CPU and samples subgraphs for computing
gradients on the training device. The script also supports multi-GPU for
further acceleration.
"""
import
os
,
time
import
argparse
import
logging
import
os
,
time
import
random
import
string
import
traceback
import
dgl
import
numpy
as
np
import
tqdm
import
torch
as
th
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
from
torch.utils.data
import
DataLoader
from
torch.nn.parallel
import
DistributedDataParallel
import
tqdm
from
data
import
MovieLens
from
model
import
GCMCLayer
,
DenseBiDecoder
,
BiDecoder
from
utils
import
get_activation
,
get_optimizer
,
torch_total_param_num
,
torch_net_info
,
MetricLogger
,
to_etype_name
import
dgl
import
torch.multiprocessing
as
mp
from
model
import
BiDecoder
,
DenseBiDecoder
,
GCMCLayer
from
torch.nn.parallel
import
DistributedDataParallel
from
torch.utils.data
import
DataLoader
from
utils
import
(
get_activation
,
get_optimizer
,
MetricLogger
,
to_etype_name
,
torch_net_info
,
torch_total_param_num
,
)
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
args
,
dev_id
):
super
(
Net
,
self
).
__init__
()
self
.
_act
=
get_activation
(
args
.
model_activation
)
self
.
encoder
=
GCMCLayer
(
args
.
rating_vals
,
args
.
src_in_units
,
args
.
dst_in_units
,
args
.
gcn_agg_units
,
args
.
gcn_out_units
,
args
.
gcn_dropout
,
args
.
gcn_agg_accum
,
agg_act
=
self
.
_act
,
share_user_item_param
=
args
.
share_param
,
device
=
dev_id
)
self
.
encoder
=
GCMCLayer
(
args
.
rating_vals
,
args
.
src_in_units
,
args
.
dst_in_units
,
args
.
gcn_agg_units
,
args
.
gcn_out_units
,
args
.
gcn_dropout
,
args
.
gcn_agg_accum
,
agg_act
=
self
.
_act
,
share_user_item_param
=
args
.
share_param
,
device
=
dev_id
,
)
if
args
.
mix_cpu_gpu
and
args
.
use_one_hot_fea
:
# if use_one_hot_fea, user and movie feature is None
# W can be extremely large, with mix_cpu_gpu W should be stored in CPU
...
...
@@ -43,45 +54,63 @@ class Net(nn.Module):
else
:
self
.
encoder
.
to
(
dev_id
)
self
.
decoder
=
BiDecoder
(
in_units
=
args
.
gcn_out_units
,
num_classes
=
len
(
args
.
rating_vals
),
num_basis
=
args
.
gen_r_num_basis_func
)
self
.
decoder
=
BiDecoder
(
in_units
=
args
.
gcn_out_units
,
num_classes
=
len
(
args
.
rating_vals
),
num_basis
=
args
.
gen_r_num_basis_func
,
)
self
.
decoder
.
to
(
dev_id
)
def
forward
(
self
,
compact_g
,
frontier
,
ufeat
,
ifeat
,
possible_rating_values
):
def
forward
(
self
,
compact_g
,
frontier
,
ufeat
,
ifeat
,
possible_rating_values
):
user_out
,
movie_out
=
self
.
encoder
(
frontier
,
ufeat
,
ifeat
)
pred_ratings
=
self
.
decoder
(
compact_g
,
user_out
,
movie_out
)
return
pred_ratings
def
load_subtensor
(
input_nodes
,
pair_graph
,
blocks
,
dataset
,
parent_graph
):
output_nodes
=
pair_graph
.
ndata
[
dgl
.
NID
]
head_feat
=
input_nodes
[
'user'
]
if
dataset
.
user_feature
is
None
else
\
dataset
.
user_feature
[
input_nodes
[
'user'
]]
tail_feat
=
input_nodes
[
'movie'
]
if
dataset
.
movie_feature
is
None
else
\
dataset
.
movie_feature
[
input_nodes
[
'movie'
]]
head_feat
=
(
input_nodes
[
"user"
]
if
dataset
.
user_feature
is
None
else
dataset
.
user_feature
[
input_nodes
[
"user"
]]
)
tail_feat
=
(
input_nodes
[
"movie"
]
if
dataset
.
movie_feature
is
None
else
dataset
.
movie_feature
[
input_nodes
[
"movie"
]]
)
for
block
in
blocks
:
block
.
dstnodes
[
'user'
].
data
[
'ci'
]
=
\
parent_graph
.
nodes
[
'user'
].
data
[
'ci'
][
block
.
dstnodes
[
'user'
].
data
[
dgl
.
NID
]]
block
.
srcnodes
[
'user'
].
data
[
'cj'
]
=
\
parent_graph
.
nodes
[
'user'
].
data
[
'cj'
][
block
.
srcnodes
[
'user'
].
data
[
dgl
.
NID
]]
block
.
dstnodes
[
'movie'
].
data
[
'ci'
]
=
\
parent_graph
.
nodes
[
'movie'
].
data
[
'ci'
][
block
.
dstnodes
[
'movie'
].
data
[
dgl
.
NID
]]
block
.
srcnodes
[
'movie'
].
data
[
'cj'
]
=
\
parent_graph
.
nodes
[
'movie'
].
data
[
'cj'
][
block
.
srcnodes
[
'movie'
].
data
[
dgl
.
NID
]]
block
.
dstnodes
[
"user"
].
data
[
"ci"
]
=
parent_graph
.
nodes
[
"user"
].
data
[
"ci"
][
block
.
dstnodes
[
"user"
].
data
[
dgl
.
NID
]]
block
.
srcnodes
[
"user"
].
data
[
"cj"
]
=
parent_graph
.
nodes
[
"user"
].
data
[
"cj"
][
block
.
srcnodes
[
"user"
].
data
[
dgl
.
NID
]]
block
.
dstnodes
[
"movie"
].
data
[
"ci"
]
=
parent_graph
.
nodes
[
"movie"
].
data
[
"ci"
][
block
.
dstnodes
[
"movie"
].
data
[
dgl
.
NID
]]
block
.
srcnodes
[
"movie"
].
data
[
"cj"
]
=
parent_graph
.
nodes
[
"movie"
].
data
[
"cj"
][
block
.
srcnodes
[
"movie"
].
data
[
dgl
.
NID
]]
return
head_feat
,
tail_feat
,
blocks
def
flatten_etypes
(
pair_graph
,
dataset
,
segment
):
n_users
=
pair_graph
.
number_of_nodes
(
'
user
'
)
n_movies
=
pair_graph
.
number_of_nodes
(
'
movie
'
)
n_users
=
pair_graph
.
number_of_nodes
(
"
user
"
)
n_movies
=
pair_graph
.
number_of_nodes
(
"
movie
"
)
src
=
[]
dst
=
[]
labels
=
[]
ratings
=
[]
for
rating
in
dataset
.
possible_rating_values
:
src_etype
,
dst_etype
=
pair_graph
.
edges
(
order
=
'eid'
,
etype
=
to_etype_name
(
rating
))
src_etype
,
dst_etype
=
pair_graph
.
edges
(
order
=
"eid"
,
etype
=
to_etype_name
(
rating
)
)
src
.
append
(
src_etype
)
dst
.
append
(
dst_etype
)
label
=
np
.
searchsorted
(
dataset
.
possible_rating_values
,
rating
)
...
...
@@ -92,85 +121,117 @@ def flatten_etypes(pair_graph, dataset, segment):
ratings
=
th
.
cat
(
ratings
)
labels
=
th
.
cat
(
labels
)
flattened_pair_graph
=
dgl
.
heterograph
({
(
'user'
,
'rate'
,
'movie'
):
(
src
,
dst
)},
num_nodes_dict
=
{
'user'
:
n_users
,
'movie'
:
n_movies
})
flattened_pair_graph
.
edata
[
'rating'
]
=
ratings
flattened_pair_graph
.
edata
[
'label'
]
=
labels
flattened_pair_graph
=
dgl
.
heterograph
(
{(
"user"
,
"rate"
,
"movie"
):
(
src
,
dst
)},
num_nodes_dict
=
{
"user"
:
n_users
,
"movie"
:
n_movies
},
)
flattened_pair_graph
.
edata
[
"rating"
]
=
ratings
flattened_pair_graph
.
edata
[
"label"
]
=
labels
return
flattened_pair_graph
def
evaluate
(
args
,
dev_id
,
net
,
dataset
,
dataloader
,
segment
=
'valid'
):
def
evaluate
(
args
,
dev_id
,
net
,
dataset
,
dataloader
,
segment
=
"valid"
):
possible_rating_values
=
dataset
.
possible_rating_values
nd_possible_rating_values
=
th
.
FloatTensor
(
possible_rating_values
).
to
(
dev_id
)
nd_possible_rating_values
=
th
.
FloatTensor
(
possible_rating_values
).
to
(
dev_id
)
real_pred_ratings
=
[]
true_rel_ratings
=
[]
for
input_nodes
,
pair_graph
,
blocks
in
dataloader
:
head_feat
,
tail_feat
,
blocks
=
load_subtensor
(
input_nodes
,
pair_graph
,
blocks
,
dataset
,
dataset
.
valid_enc_graph
if
segment
==
'valid'
else
dataset
.
test_enc_graph
)
input_nodes
,
pair_graph
,
blocks
,
dataset
,
dataset
.
valid_enc_graph
if
segment
==
"valid"
else
dataset
.
test_enc_graph
,
)
frontier
=
blocks
[
0
]
true_relation_ratings
=
\
dataset
.
valid_truths
[
pair_graph
.
edata
[
dgl
.
EID
]]
if
segment
==
'valid'
else
\
dataset
.
test_truths
[
pair_graph
.
edata
[
dgl
.
EID
]]
true_relation_ratings
=
(
dataset
.
valid_truths
[
pair_graph
.
edata
[
dgl
.
EID
]]
if
segment
==
"valid"
else
dataset
.
test_truths
[
pair_graph
.
edata
[
dgl
.
EID
]]
)
frontier
=
frontier
.
to
(
dev_id
)
head_feat
=
head_feat
.
to
(
dev_id
)
tail_feat
=
tail_feat
.
to
(
dev_id
)
pair_graph
=
pair_graph
.
to
(
dev_id
)
with
th
.
no_grad
():
pred_ratings
=
net
(
pair_graph
,
frontier
,
head_feat
,
tail_feat
,
possible_rating_values
)
batch_pred_ratings
=
(
th
.
softmax
(
pred_ratings
,
dim
=
1
)
*
nd_possible_rating_values
.
view
(
1
,
-
1
)).
sum
(
dim
=
1
)
pred_ratings
=
net
(
pair_graph
,
frontier
,
head_feat
,
tail_feat
,
possible_rating_values
,
)
batch_pred_ratings
=
(
th
.
softmax
(
pred_ratings
,
dim
=
1
)
*
nd_possible_rating_values
.
view
(
1
,
-
1
)
).
sum
(
dim
=
1
)
real_pred_ratings
.
append
(
batch_pred_ratings
)
true_rel_ratings
.
append
(
true_relation_ratings
)
real_pred_ratings
=
th
.
cat
(
real_pred_ratings
,
dim
=
0
)
true_rel_ratings
=
th
.
cat
(
true_rel_ratings
,
dim
=
0
).
to
(
dev_id
)
rmse
=
((
real_pred_ratings
-
true_rel_ratings
)
**
2.
).
mean
().
item
()
rmse
=
((
real_pred_ratings
-
true_rel_ratings
)
**
2.
0
).
mean
().
item
()
rmse
=
np
.
sqrt
(
rmse
)
return
rmse
def config():
    parser = argparse.ArgumentParser(description="GCMC")
    parser.add_argument("--seed", default=123, type=int)
    parser.add_argument("--gpu", type=str, default="0")
    parser.add_argument("--save_dir", type=str, help="The saving directory")
    parser.add_argument("--save_id", type=int, help="The saving log id")
    parser.add_argument("--silent", action="store_true")
    parser.add_argument(
        "--data_name",
        default="ml-1m",
        type=str,
        help="The dataset name: ml-100k, ml-1m, ml-10m",
    )
    parser.add_argument("--data_test_ratio", type=float, default=0.1)
    ## for ml-100k the test ratio is 0.2
    parser.add_argument("--data_valid_ratio", type=float, default=0.1)
    parser.add_argument("--use_one_hot_fea", action="store_true", default=False)
    parser.add_argument("--model_activation", type=str, default="leaky")
    parser.add_argument("--gcn_dropout", type=float, default=0.7)
    parser.add_argument("--gcn_agg_norm_symm", type=bool, default=True)
    parser.add_argument("--gcn_agg_units", type=int, default=500)
    parser.add_argument("--gcn_agg_accum", type=str, default="sum")
    parser.add_argument("--gcn_out_units", type=int, default=75)
    parser.add_argument("--gen_r_num_basis_func", type=int, default=2)
    parser.add_argument("--train_max_epoch", type=int, default=1000)
    parser.add_argument("--train_log_interval", type=int, default=1)
    parser.add_argument("--train_valid_interval", type=int, default=1)
    parser.add_argument("--train_optimizer", type=str, default="adam")
    parser.add_argument("--train_grad_clip", type=float, default=1.0)
    parser.add_argument("--train_lr", type=float, default=0.01)
    parser.add_argument("--train_min_lr", type=float, default=0.0001)
    parser.add_argument("--train_lr_decay_factor", type=float, default=0.5)
    parser.add_argument("--train_decay_patience", type=int, default=25)
    parser.add_argument("--train_early_stopping_patience", type=int, default=50)
    parser.add_argument("--share_param", default=False, action="store_true")
    parser.add_argument("--mix_cpu_gpu", default=False, action="store_true")
    parser.add_argument("--minibatch_size", type=int, default=20000)
    parser.add_argument("--num_workers_per_gpu", type=int, default=8)

    args = parser.parse_args()
    ### configure save_dir to save all the info
    if args.save_dir is None:
        args.save_dir = (
            args.data_name
            + "_"
            + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=2)
            )
        )
    if args.save_id is None:
        args.save_id = np.random.randint(20)
    args.save_dir = os.path.join("log", args.save_dir)
...
...
@@ -179,16 +240,20 @@ def config():
    return args
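Note that only --save_dir and --save_id lack defaults, and both are auto-filled above, so the script runs with no flags at all (ml-1m is the default dataset); per the inline comment, ml-100k runs conventionally also pass --data_test_ratio 0.2.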
def run(proc_id, n_gpus, args, devices, dataset):
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = "tcp://{master_ip}:{master_port}".format(
            master_ip="127.0.0.1", master_port="12345"
        )
        world_size = n_gpus
        th.distributed.init_process_group(
            backend="nccl",
            init_method=dist_init_method,
            world_size=world_size,
            rank=dev_id,
        )
    if n_gpus > 0:
        th.cuda.set_device(dev_id)
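The (proc_id, n_gpus) signature implies run() is started once per GPU by a multiprocessing driver that is elided from this excerpt. A minimal sketch of what such a launcher could look like, inferred from the signature rather than taken from the file:

import torch.multiprocessing as mp

# Hypothetical launcher: one worker process per device; each worker calls
# run(proc_id, n_gpus, ...) and joins the NCCL group with rank=dev_id.
def launch(args, devices, dataset):
    n_gpus = len(devices)
    procs = []
    for proc_id in range(n_gpus):
        p = mp.Process(
            target=run, args=(proc_id, n_gpus, args, devices, dataset)
        )
        p.start()
        procs.append(p)
    for p in procs:
        p.join()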
...
...
@@ -196,21 +261,29 @@ def run(proc_id, n_gpus, args, devices, dataset):
    train_truths = dataset.train_truths
    num_edges = train_truths.shape[0]

    reverse_types = {
        to_etype_name(k): "rev-" + to_etype_name(k)
        for k in dataset.possible_rating_values
    }
    reverse_types.update({v: k for k, v in reverse_types.items()})
    sampler = dgl.dataloading.MultiLayerNeighborSampler(
        [None], return_eids=True
    )
    sampler = dgl.dataloading.as_edge_prediction_sampler(sampler)
    dataloader = dgl.dataloading.DataLoader(
        dataset.train_enc_graph,
        {
            to_etype_name(k): th.arange(
                dataset.train_enc_graph.number_of_edges(
                    etype=to_etype_name(k)
                )
            )
            for k in dataset.possible_rating_values
        },
        sampler,
        use_ddp=n_gpus > 1,
        batch_size=args.minibatch_size,
        shuffle=True,
        drop_last=False,
    )
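as_edge_prediction_sampler wraps the node-wise neighbor sampler so that each minibatch yields an (input_nodes, pair_graph, blocks) triple for a set of seed edges, the same triple unpacked in evaluate() and in the training loop below. A minimal sketch of the pattern on a toy homogeneous graph (the toy graph and batch size are illustrative; assumes a DGL version >= 0.8 where this API exists):

import dgl
import torch as th

toy_g = dgl.rand_graph(100, 500)  # illustrative: 100 nodes, 500 edges
toy_sampler = dgl.dataloading.MultiLayerNeighborSampler([None])  # full 1-hop
toy_sampler = dgl.dataloading.as_edge_prediction_sampler(toy_sampler)
toy_loader = dgl.dataloading.DataLoader(
    toy_g, th.arange(toy_g.number_of_edges()), toy_sampler, batch_size=64
)
input_nodes, pair_graph, blocks = next(iter(toy_loader))
# pair_graph holds the 64 seed edges; blocks[0] is their sampled frontier.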
    if proc_id == 0:
        valid_dataloader = dgl.dataloading.DataLoader(
...
...
@@ -220,7 +293,8 @@ def run(proc_id, n_gpus, args, devices, dataset):
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False,
        )
        test_dataloader = dgl.dataloading.DataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
...
...
@@ -228,19 +302,23 @@ def run(proc_id, n_gpus, args, devices, dataset):
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False,
        )

    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(
            net, device_ids=[dev_id], output_device=dev_id
        )
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(
        net.parameters(), lr=learning_rate
    )
    print("Loading network finished ...\n")

    ### declare the loss information
...
...
@@ -263,18 +341,33 @@ def run(proc_id, n_gpus, args, devices, dataset):
        with tqdm.tqdm(dataloader) as tq:
            for step, (input_nodes, pair_graph, blocks) in enumerate(tq):
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes,
                    pair_graph,
                    blocks,
                    dataset,
                    dataset.train_enc_graph,
                )
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset, "train").to(
                    dev_id
                )
                true_relation_labels = compact_g.edata["label"]
                true_relation_ratings = compact_g.edata["rating"]

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(
                    compact_g,
                    frontier,
                    head_feat,
                    tail_feat,
                    dataset.possible_rating_values,
                )
                loss = rating_loss_net(
                    pred_ratings, true_relation_labels.to(dev_id)
                ).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
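Note the dual use of the same logits here: training minimizes cross-entropy over the discrete rating classes (true_relation_labels), while the reported metric converts the logits into an expected rating via the softmax-weighted sum and measures RMSE against true_relation_ratings, exactly as evaluate() does.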
...
...
@@ -282,17 +375,27 @@ def run(proc_id, n_gpus, args, devices, dataset):
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print(
                        "Total #Param of net: %d"
                        % (torch_total_param_num(net))
                    )
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1)
                    * nd_possible_rating_values.view(1, -1)
                ).sum(dim=1)
                rmse = (
                    (real_pred_ratings - true_relation_ratings.to(dev_id))
                    ** 2
                ).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]
                tq.set_postfix(
                    {
                        "loss": "{:.4f}".format(count_loss / iter_idx),
                        "rmse": "{:.4f}".format(count_rmse / count_num),
                    },
                    refresh=False,
                )
                iter_idx += 1
...
...
@@ -304,39 +407,50 @@ def run(proc_id, n_gpus, args, devices, dataset):
        if n_gpus > 1:
            th.distributed.barrier()
        if proc_id == 0:
            valid_rmse = evaluate(
                args=args,
                dev_id=dev_id,
                net=net,
                dataset=dataset,
                dataloader=valid_dataloader,
                segment="valid",
            )
            logging_str = "Val RMSE={:.4f}".format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_epoch = epoch
                test_rmse = evaluate(
                    args=args,
                    dev_id=dev_id,
                    net=net,
                    dataset=dataset,
                    dataloader=test_dataloader,
                    segment="test",
                )
                best_test_rmse = test_rmse
                logging_str += ", Test RMSE={:.4f}".format(test_rmse)
            else:
                no_better_valid += 1
                if (
                    no_better_valid > args.train_early_stopping_patience
                    and learning_rate <= args.train_min_lr
                ):
                    logging.info(
                        "Early stopping threshold reached. Stop training."
                    )
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(
                        learning_rate * args.train_lr_decay_factor,
                        args.train_min_lr,
                    )
                    if new_lr < learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        learning_rate = new_lr
                        for p in optimizer.param_groups:
                            p["lr"] = learning_rate
                        no_better_valid = 0
                        print("Change the LR to %g" % new_lr)
        # sync on evaluation
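With the defaults above, this schedule halves the learning rate whenever validation RMSE stalls for more than 25 checks, from 0.01 down to the 0.0001 floor (the clamp takes effect on the seventh decay, since 0.01 * 0.5**7 is below the floor), and training stops once the floor is reached and a further 50 checks bring no improvement.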
...
...
@@ -346,24 +460,30 @@ def run(proc_id, n_gpus, args, devices, dataset):
        if proc_id == 0:
            print(logging_str)
    if proc_id == 0:
        print(
            "Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".format(
                best_epoch, best_valid_rmse, best_test_rmse
            )
        )


if __name__ == "__main__":
    args = config()
    devices = list(map(int, args.gpu.split(",")))
    n_gpus = len(devices)
    # For GCMC based on sampling, we require each node to have its own
    # features. Otherwise (i.e., with the node id as the only feature),
    # the model cannot scale.
    dataset = MovieLens(
        args.data_name,
        "cpu",
        mix_cpu_gpu=args.mix_cpu_gpu,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
...
...
@@ -372,7 +492,7 @@ if __name__ == '__main__':
    # cpu
    if devices[0] == -1:
        run(0, 0, args, ["cpu"], dataset)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, args, devices, dataset)
...
...
examples/pytorch/gcn/train.py View file @ 704bcaf6
import argparse

import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import AddSelfLoop
from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
...
...
examples/pytorch/geniepath/model.py View file @ 704bcaf6
import torch as th
import torch.nn as nn
from dgl.nn import GATConv
from torch.nn import LSTM


class GeniePathConv(nn.Module):
...
...
examples/pytorch/geniepath/ppi.py View file @ 704bcaf6
...
...
@@ -3,11 +3,11 @@ import argparse
import numpy as np
import torch as th
import torch.optim as optim
from dgl.data import PPIDataset
from dgl.dataloading import GraphDataLoader
from model import GeniePath, GeniePathLazy
from sklearn.metrics import f1_score


def evaluate(model, loss_fn, dataloader, device="cpu"):
...
...
examples/pytorch/geniepath/pubmed.py View file @ 704bcaf6
...
...
@@ -2,10 +2,10 @@ import argparse
import torch as th
import torch.optim as optim
from dgl.data import PubmedGraphDataset
from model import GeniePath, GeniePathLazy
from sklearn.metrics import accuracy_score


def main(args):
...
...
examples/pytorch/ggnn/data_utils.py View file @ 704bcaf6
...
...
@@ -5,16 +5,16 @@ Data utils for processing bAbI datasets
import os
import string

import dgl
import torch
from dgl.data.utils import (
    _get_dgl_url,
    download,
    extract_archive,
    get_download_dir,
)
from torch.utils.data import DataLoader


def get_babi_dataloaders(batch_size, train_size=50, task_id=4, q_type=0):
...
...
examples/pytorch/ggnn/ggnn_gc.py View file @ 704bcaf6
...
...
@@ -2,9 +2,9 @@
Gated Graph Neural Network module for graph classification tasks
"""
import torch
from dgl.nn.pytorch import GatedGraphConv, GlobalAttentionPooling
from torch import nn


class GraphClsGGNN(nn.Module):
...
...
examples/pytorch/ggnn/ggnn_ns.py View file @ 704bcaf6
"""
Gated Graph Neural Network module for node selection tasks
"""
import
torch
from
torch
import
nn
import
dgl
import
torch
from
dgl.nn.pytorch
import
GatedGraphConv
from
torch
import
nn
class
NodeSelectionGGNN
(
nn
.
Module
):
...
...
examples/pytorch/ggnn/ggsnn.py View file @ 704bcaf6
...
...
@@ -4,9 +4,9 @@ Gated Graph Sequence Neural Network for sequence outputs
import torch
import torch.nn.functional as F
from dgl.nn.pytorch import GatedGraphConv, GlobalAttentionPooling
from torch import nn


class GGSNN(nn.Module):
...
...
examples/pytorch/gin/train.py View file @ 704bcaf6
...
...
@@ -5,13 +5,13 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from dgl.data import GINDataset
from dgl.dataloading import GraphDataLoader
from dgl.nn.pytorch.conv import GINConv
from dgl.nn.pytorch.glob import SumPooling
from sklearn.model_selection import StratifiedKFold
from torch.utils.data.sampler import SubsetRandomSampler


class MLP(nn.Module):
...
...
examples/pytorch/gnn_explainer/explain_main.py View file @ 704bcaf6
import argparse
import os

import dgl
import torch as th
from dgl import load_graphs
from dgl.data import (
    BACommunityDataset,
    BAShapeDataset,
    TreeCycleDataset,
    TreeGridDataset,
)
from dgl.nn import GNNExplainer
from gnnlens import Writer
from models import Model


def main(args):
...
...
examples/pytorch/gnn_explainer/models.py View file @ 704bcaf6
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F


class Layer(nn.Module):
    def __init__(self, in_dim, out_dim):
...
...
@@ -10,18 +11,19 @@ class Layer(nn.Module):
    def forward(self, graph, feat, eweight=None):
        with graph.local_scope():
            graph.ndata["h"] = feat
            if eweight is None:
                graph.update_all(fn.copy_u("h", "m"), fn.mean("m", "h"))
            else:
                graph.edata["ew"] = eweight
                graph.update_all(
                    fn.u_mul_e("h", "ew", "m"), fn.mean("m", "h")
                )
            h = self.layer(th.cat([graph.ndata["h"], feat], dim=-1))
        return h
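The eweight branch is what lets GNNExplainer probe this model: each message is scaled by a per-edge weight before the mean aggregation. A standalone sketch of that primitive on a toy graph (the graph, features, and weights are illustrative):

import dgl
import dgl.function as fn
import torch as th

g = dgl.graph(([0, 1, 2], [2, 2, 0]))  # edges: 0->2, 1->2, 2->0
g.ndata["h"] = th.ones(3, 4)
g.edata["ew"] = th.tensor([[0.1], [0.9], [0.5]])  # one weight per edge
g.update_all(fn.u_mul_e("h", "ew", "m"), fn.mean("m", "h"))
# Node 2 now averages 0.1*h_0 and 0.9*h_1; node 1, with no in-edges,
# receives zeros from the mean reducer.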
class Model(nn.Module):
    def __init__(self, in_dim, out_dim, hid_dim=40):
        super().__init__()
...
...
examples/pytorch/gnn_explainer/train_main.py View file @ 704bcaf6
import argparse
import os

import torch as th
import torch.nn as nn
from dgl import save_graphs
from dgl.data import (
    BACommunityDataset,
    BAShapeDataset,
    TreeCycleDataset,
    TreeGridDataset,
)
from models import Model


def main(args):
    if args.dataset == "BAShape":
        dataset = BAShapeDataset(seed=0)
    elif args.dataset == "BACommunity":
        dataset = BACommunityDataset(seed=0)
    elif args.dataset == "TreeCycle":
        dataset = TreeCycleDataset(seed=0)
    elif args.dataset == "TreeGrid":
        dataset = TreeGridDataset(seed=0)

    graph = dataset[0]
    labels = graph.ndata["label"]
    n_feats = graph.ndata["feat"]
    num_classes = dataset.num_classes

    model = Model(n_feats.shape[-1], num_classes)
...
...
@@ -40,16 +46,21 @@ def main(args):
        loss.backward()
        optim.step()
        print(f"In Epoch: {epoch}; Acc: {acc}; Loss: {loss.item()}")

    model_stat_dict = model.state_dict()
    model_path = os.path.join("./", f"model_{args.dataset}.pth")
    th.save(model_stat_dict, model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Dummy model training")
    parser.add_argument(
        "--dataset",
        type=str,
        default="BAShape",
        choices=["BAShape", "BACommunity", "TreeCycle", "TreeGrid"],
    )
    args = parser.parse_args()
    print(args)
...
...
examples/pytorch/grace/aug.py View file @ 704bcaf6
# Data augmentation on graphs via edge dropping and feature masking
import dgl
import numpy as np
import torch as th


def aug(graph, x, feat_drop_rate, edge_mask_rate):
    n_node = graph.num_nodes()
...
...
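As the header comment says, aug() combines edge dropping with feature masking; the function body is truncated here, but the feature-masking half typically looks like the sketch below (a hedged illustration of the idea, not this file's own implementation):

import torch as th

def drop_feature(x, drop_prob):
    # Zero a random subset of feature columns, the same columns for every node.
    drop_mask = th.rand(x.size(1)) < drop_prob
    x = x.clone()
    x[:, drop_mask] = 0.0
    return x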