Commit bef99307 in OpenDAS/dgl (unverified), authored May 28, 2022 by Mufei Li, committed by GitHub on May 28, 2022. Parent: 7a065a9c

[Bugfix] Fix PinSAGE Benchmark (#4058)

* Update
* Update
* Update dgl.data.rst
* CI
Showing 7 changed files with 40 additions and 39 deletions (+40 / -39):

* benchmarks/benchmarks/utils.py (+8 / -22)
* docs/source/api/python/dgl.data.rst (+1 / -1)
* examples/pytorch/pinsage/README.md (+6 / -6)
* examples/pytorch/pinsage/model.py (+6 / -1)
* examples/pytorch/pinsage/model_sparse.py (+6 / -1)
* examples/pytorch/pinsage/process_movielens1m.py (+6 / -4)
* examples/pytorch/pinsage/process_nowplaying_rs.py (+7 / -4)
benchmarks/benchmarks/utils.py

@@ -253,24 +253,16 @@ class PinsageDataset:
 def load_nowplaying_rs():
-    import torchtext.legacy as torchtext
-    # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
-    name = 'nowplaying_rs.pkl'
+    # follow examples/pytorch/pinsage/README to create train_g.bin
+    name = 'train_g.bin'
     dataset_dir = os.path.join(os.getcwd(), 'dataset')
     os.symlink('/tmp/dataset/', dataset_dir)
     dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
-    # Load dataset
-    with open(dataset_path, 'rb') as f:
-        dataset = pickle.load(f)
-    g = dataset['train-graph']
-    val_matrix = dataset['val-matrix'].tocsr()
-    test_matrix = dataset['test-matrix'].tocsr()
-    item_texts = dataset['item-texts']
-    user_ntype = dataset['user-type']
-    item_ntype = dataset['item-type']
-    user_to_item_etype = dataset['user-to-item-type']
-    timestamp = dataset['timestamp-edge-column']
+    g_list, _ = dgl.load_graphs(dataset_path)
+    g = g_list[0]
+    user_ntype = 'user'
+    item_ntype = 'track'
     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
@@ -282,17 +274,11 @@ def load_nowplaying_rs():
     # Prepare torchtext dataset and vocabulary
     fields = {}
     examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
     for i in range(g.number_of_nodes(item_ntype)):
         example = torchtext.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
+            [], [])
         examples.append(example)
     textset = torchtext.data.Dataset(examples, fields)
     for key, field in fields.items():
         field.build_vocab(getattr(textset, key))
     return PinsageDataset(g, user_ntype, item_ntype, textset)
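For context on the API these hunks switch to: `dgl.save_graphs` writes one or more graphs in DGL's binary format, and `dgl.load_graphs` returns a `(graph_list, label_dict)` pair, which is why the new code unpacks `g_list, _` and takes `g_list[0]`. A minimal round-trip sketch (the toy graph and file name are illustrative only, not the benchmark's data):

```python
import dgl
import torch

# Toy bipartite user/track graph, only to illustrate the round trip.
g = dgl.heterograph({
    ('user', 'listened', 'track'): (torch.tensor([0, 1]), torch.tensor([1, 0])),
    ('track', 'listened-by', 'user'): (torch.tensor([1, 0]), torch.tensor([0, 1])),
})

# Save in DGL's binary format, as the processing scripts now do.
dgl.save_graphs('train_g.bin', [g])

# load_graphs returns (list_of_graphs, label_dict); the graph is element 0.
g_list, _ = dgl.load_graphs('train_g.bin')
g = g_list[0]
print(g.num_nodes('user'), g.num_nodes('track'))
```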
docs/source/api/python/dgl.data.rst

@@ -51,7 +51,6 @@ Datasets for node classification/regression tasks
     BACommunityDataset
     TreeCycleDataset
     TreeGridDataset
-    BA2MotifDataset

 Edge Prediction Datasets
 ---------------------------------------
@@ -88,6 +87,7 @@ Datasets for graph classification/regression tasks
     LegacyTUDataset
     GINDataset
     FakeNewsDataset
+    BA2MotifDataset

 Dataset adapters
 -------------------
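The documentation change above moves `BA2MotifDataset` from the node classification list to the graph classification list, which matches its intended use: each item is a synthetic graph with a graph-level label. A hedged sketch, assuming it follows DGL's usual graph-classification dataset interface:

```python
from dgl.data import BA2MotifDataset

# Assumption: indexing follows the standard DGL graph-classification
# convention and yields a (graph, graph-level label) pair.
dataset = BA2MotifDataset()
g, label = dataset[0]
print(g.num_nodes(), label)
```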
examples/pytorch/pinsage/README.md

@@ -12,15 +12,15 @@
 1. Download and extract the MovieLens-1M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip
    into the current directory.
-2. Run `python process_movielens1m.py ./ml-1m ./data.pkl`. Replace `ml-1m` with the directory you put the `.dat` files, and replace `data.pkl` to any path you wish to put the output pickle file.
+2. Run `python process_movielens1m.py ./ml-1m ./data_processed`. Replace `ml-1m` with the directory you put the `.dat` files, and replace `data_processed` with any path you wish to put the output files.

 ### Nowplaying-rs

 1. Download and extract the Nowplaying-rs dataset from https://zenodo.org/record/3248543/files/nowplayingrs.zip?download=1
    into the current directory.
-2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data.pkl`
+2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data_processed`

 ## Run model
@@ -31,7 +31,7 @@ interacted. The distance between two items are measured by Euclidean distance of
 item embeddings, which are learned as outputs of PinSAGE.

 ```
-python model.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
+python model.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
 ```

 The implementation here also assigns a learnable vector to each item. If your hidden
@@ -40,7 +40,7 @@ for sparse embedding update (written with `torch.optim.SparseAdam`) instead:
 ```
-python model_sparse.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
+python model_sparse.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
 ```

 Note that since the embedding update is done on CPU, it will be significantly slower than doing
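Since the second argument is now an output directory rather than a single pickle path, a quick way to inspect what the processing step produced (file names as in the diffs; `./data_processed` stands for whatever directory you passed on the command line):

```python
import os
import pickle

import dgl

out_dir = './data_processed'  # the directory passed to the processing script

# The training graph is stored separately in DGL's binary format...
g_list, _ = dgl.load_graphs(os.path.join(out_dir, 'train_g.bin'))
print(g_list[0])

# ...while the remaining metadata stays in a plain pickle.
with open(os.path.join(out_dir, 'data.pkl'), 'rb') as f:
    info = pickle.load(f)
print(sorted(info.keys()))
```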
examples/pytorch/pinsage/model.py

@@ -6,6 +6,7 @@ import torch.nn as nn
 from torch.utils.data import DataLoader
 import torchtext
 import dgl
+import os
 import tqdm

 import layers
@@ -137,6 +138,10 @@ if __name__ == '__main__':
     args = parser.parse_args()

     # Load dataset
-    with open(args.dataset_path, 'rb') as f:
+    data_info_path = os.path.join(args.dataset_path, 'data.pkl')
+    with open(data_info_path, 'rb') as f:
         dataset = pickle.load(f)
+    train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
+    g_list, _ = dgl.load_graphs(train_g_path)
+    dataset['train-graph'] = g_list[0]
     train(dataset, args)
examples/pytorch/pinsage/model_sparse.py

@@ -6,6 +6,7 @@ import torch.nn as nn
 from torch.utils.data import DataLoader
 import torchtext
 import dgl
+import os
 import tqdm

 import layers
@@ -142,6 +143,10 @@ if __name__ == '__main__':
     args = parser.parse_args()

     # Load dataset
-    with open(args.dataset_path, 'rb') as f:
+    data_info_path = os.path.join(args.dataset_path, 'data.pkl')
+    with open(data_info_path, 'rb') as f:
         dataset = pickle.load(f)
+    train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
+    g_list, _ = dgl.load_graphs(train_g_path)
+    dataset['train-graph'] = g_list[0]
     train(dataset, args)
examples/pytorch/pinsage/process_movielens1m.py

@@ -28,10 +28,11 @@ from data_utils import *
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('directory', type=str)
-    parser.add_argument('output_path', type=str)
+    parser.add_argument('out_directory', type=str)
     args = parser.parse_args()
     directory = args.directory
-    output_path = args.output_path
+    out_directory = args.out_directory
+    os.makedirs(out_directory, exist_ok=True)

     ## Build heterogeneous graph
@@ -139,8 +140,9 @@ if __name__ == '__main__':
     ## Dump the graph and the datasets
+    dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)

     dataset = {
-        'train-graph': train_g,
         'val-matrix': val_matrix,
         'test-matrix': test_matrix,
         'item-texts': movie_textual_dataset,
@@ -151,5 +153,5 @@ if __name__ == '__main__':
         'item-to-user-type': 'watched-by',
         'timestamp-edge-column': 'timestamp'}

-    with open(output_path, 'wb') as f:
+    with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
         pickle.dump(dataset, f)
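The dump pattern both processing scripts switch to, boiled down to a self-contained sketch (the toy graph and field values are placeholders, not MovieLens data): the DGL graph goes into `train_g.bin` via `dgl.save_graphs`, and only plain-Python metadata is pickled into `data.pkl`, so that `model.py` can reload the graph with `dgl.load_graphs`.

```python
import os
import pickle

import dgl
import torch

out_directory = './data_processed'
os.makedirs(out_directory, exist_ok=True)

# Placeholder graph standing in for the real train_g built by the script.
train_g = dgl.heterograph({
    ('user', 'watched', 'movie'): (torch.tensor([0]), torch.tensor([0])),
    ('movie', 'watched-by', 'user'): (torch.tensor([0]), torch.tensor([0])),
})

# Graph -> DGL binary format; plain-Python metadata -> pickle.
dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)
dataset = {
    'user-type': 'user',
    'item-type': 'movie',
    'user-to-item-type': 'watched',
    'item-to-user-type': 'watched-by',
    'timestamp-edge-column': 'timestamp',
}
with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
    pickle.dump(dataset, f)
```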
examples/pytorch/pinsage/process_nowplaying_rs.py

@@ -5,6 +5,7 @@ file a heterogeneous graph with categorical and numeric features.
 import os
 import argparse
+import dgl
 import pandas as pd
 import scipy.sparse as ssp
 import pickle
@@ -14,10 +15,11 @@ from builder import PandasGraphBuilder
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('directory', type=str)
-    parser.add_argument('output_path', type=str)
+    parser.add_argument('out_directory', type=str)
     args = parser.parse_args()
     directory = args.directory
-    output_path = args.output_path
+    out_directory = args.out_directory
+    os.makedirs(out_directory, exist_ok=True)

     data = pd.read_csv(os.path.join(directory, 'context_content_features.csv'))
     track_feature_cols = list(data.columns[1:13])
@@ -59,8 +61,9 @@ if __name__ == '__main__':
     val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'track', 'listened')

+    dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)

     dataset = {
-        'train-graph': train_g,
         'val-matrix': val_matrix,
         'test-matrix': test_matrix,
         'item-texts': {},
@@ -71,5 +74,5 @@ if __name__ == '__main__':
         'item-to-user-type': 'listened-by',
         'timestamp-edge-column': 'created_at'}

-    with open(output_path, 'wb') as f:
+    with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
         pickle.dump(dataset, f)