Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
bef99307
Unverified
Commit
bef99307
authored
May 28, 2022
by
Mufei Li
Committed by
GitHub
May 28, 2022
Browse files
[Bugfix] Fix PinSAGE Benchmark (#4058)
* Update * Update * Update dgl.data.rst * CI
parent
7a065a9c
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
40 additions
and
39 deletions
+40
-39
benchmarks/benchmarks/utils.py
benchmarks/benchmarks/utils.py
+8
-22
docs/source/api/python/dgl.data.rst
docs/source/api/python/dgl.data.rst
+1
-1
examples/pytorch/pinsage/README.md
examples/pytorch/pinsage/README.md
+6
-6
examples/pytorch/pinsage/model.py
examples/pytorch/pinsage/model.py
+6
-1
examples/pytorch/pinsage/model_sparse.py
examples/pytorch/pinsage/model_sparse.py
+6
-1
examples/pytorch/pinsage/process_movielens1m.py
examples/pytorch/pinsage/process_movielens1m.py
+6
-4
examples/pytorch/pinsage/process_nowplaying_rs.py
examples/pytorch/pinsage/process_nowplaying_rs.py
+7
-4
No files found.
benchmarks/benchmarks/utils.py
View file @
bef99307
...
@@ -253,24 +253,16 @@ class PinsageDataset:
...
@@ -253,24 +253,16 @@ class PinsageDataset:
def
load_nowplaying_rs
():
def
load_nowplaying_rs
():
import
torchtext.legacy
as
torchtext
import
torchtext.legacy
as
torchtext
# follow examples/pytorch/pinsage/README to create
nowplaying_rs.pkl
# follow examples/pytorch/pinsage/README to create
train_g.bin
name
=
'
nowplaying_rs.pkl
'
name
=
'
train_g.bin
'
dataset_dir
=
os
.
path
.
join
(
os
.
getcwd
(),
'dataset'
)
dataset_dir
=
os
.
path
.
join
(
os
.
getcwd
(),
'dataset'
)
os
.
symlink
(
'/tmp/dataset/'
,
dataset_dir
)
os
.
symlink
(
'/tmp/dataset/'
,
dataset_dir
)
dataset_path
=
os
.
path
.
join
(
dataset_dir
,
"nowplaying_rs"
,
name
)
dataset_path
=
os
.
path
.
join
(
dataset_dir
,
"nowplaying_rs"
,
name
)
# Load dataset
g_list
,
_
=
dgl
.
load_graphs
(
dataset_path
)
with
open
(
dataset_path
,
'rb'
)
as
f
:
g
=
g_list
[
0
]
dataset
=
pickle
.
load
(
f
)
user_ntype
=
'user'
item_ntype
=
'track'
g
=
dataset
[
'train-graph'
]
val_matrix
=
dataset
[
'val-matrix'
].
tocsr
()
test_matrix
=
dataset
[
'test-matrix'
].
tocsr
()
item_texts
=
dataset
[
'item-texts'
]
user_ntype
=
dataset
[
'user-type'
]
item_ntype
=
dataset
[
'item-type'
]
user_to_item_etype
=
dataset
[
'user-to-item-type'
]
timestamp
=
dataset
[
'timestamp-edge-column'
]
# Assign user and movie IDs and use them as features (to learn an individual trainable
# Assign user and movie IDs and use them as features (to learn an individual trainable
# embedding for each entity)
# embedding for each entity)
...
@@ -282,17 +274,11 @@ def load_nowplaying_rs():
...
@@ -282,17 +274,11 @@ def load_nowplaying_rs():
# Prepare torchtext dataset and vocabulary
# Prepare torchtext dataset and vocabulary
fields
=
{}
fields
=
{}
examples
=
[]
examples
=
[]
for
key
,
texts
in
item_texts
.
items
():
fields
[
key
]
=
torchtext
.
data
.
Field
(
include_lengths
=
True
,
lower
=
True
,
batch_first
=
True
)
for
i
in
range
(
g
.
number_of_nodes
(
item_ntype
)):
for
i
in
range
(
g
.
number_of_nodes
(
item_ntype
)):
example
=
torchtext
.
data
.
Example
.
fromlist
(
example
=
torchtext
.
data
.
Example
.
fromlist
(
[
item_texts
[
key
][
i
]
for
key
in
item_texts
.
keys
()],
[],
[])
[(
key
,
fields
[
key
])
for
key
in
item_texts
.
keys
()])
examples
.
append
(
example
)
examples
.
append
(
example
)
textset
=
torchtext
.
data
.
Dataset
(
examples
,
fields
)
textset
=
torchtext
.
data
.
Dataset
(
examples
,
fields
)
for
key
,
field
in
fields
.
items
():
field
.
build_vocab
(
getattr
(
textset
,
key
))
return
PinsageDataset
(
g
,
user_ntype
,
item_ntype
,
textset
)
return
PinsageDataset
(
g
,
user_ntype
,
item_ntype
,
textset
)
...
...
docs/source/api/python/dgl.data.rst
View file @
bef99307
...
@@ -51,7 +51,6 @@ Datasets for node classification/regression tasks
...
@@ -51,7 +51,6 @@ Datasets for node classification/regression tasks
BACommunityDataset
BACommunityDataset
TreeCycleDataset
TreeCycleDataset
TreeGridDataset
TreeGridDataset
BA2MotifDataset
Edge Prediction Datasets
Edge Prediction Datasets
---------------------------------------
---------------------------------------
...
@@ -88,6 +87,7 @@ Datasets for graph classification/regression tasks
...
@@ -88,6 +87,7 @@ Datasets for graph classification/regression tasks
LegacyTUDataset
LegacyTUDataset
GINDataset
GINDataset
FakeNewsDataset
FakeNewsDataset
BA2MotifDataset
Dataset adapters
Dataset adapters
-------------------
-------------------
...
...
examples/pytorch/pinsage/README.md
View file @
bef99307
...
@@ -12,15 +12,15 @@
...
@@ -12,15 +12,15 @@
1.
Download and extract the MovieLens-1M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip
1.
Download and extract the MovieLens-1M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip
into the current directory.
into the current directory.
2. Run `python process_movielens1m.py ./ml-1m ./data.pkl`.
2. Run `python process_movielens1m.py ./ml-1m ./data_processed`.
Replace `ml-1m` with the directory you put the `.dat` files, and replace `data.pkl` to
Replace `ml-1m` with the directory you put the `.dat` files, and replace `data_processed` with
any path you wish to put the output pickle file.
any path you wish to put the output files.
### Nowplaying-rs
### Nowplaying-rs
1.
Download and extract the Nowplaying-rs dataset from https://zenodo.org/record/3248543/files/nowplayingrs.zip?download=1
1.
Download and extract the Nowplaying-rs dataset from https://zenodo.org/record/3248543/files/nowplayingrs.zip?download=1
into the current directory.
into the current directory.
2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data.pkl`
2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data_processed`
## Run model
## Run model
...
@@ -31,7 +31,7 @@ interacted. The distance between two items are measured by Euclidean distance o
...
@@ -31,7 +31,7 @@ interacted. The distance between two items are measured by Euclidean distance o
item embeddings, which are learned as outputs of PinSAGE.
item embeddings, which are learned as outputs of PinSAGE.
```
```
python model.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
python model.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
```
```
The implementation here also assigns a learnable vector to each item. If your hidden
The implementation here also assigns a learnable vector to each item. If your hidden
...
@@ -40,7 +40,7 @@ for sparse embedding update (written with `torch.optim.SparseAdam`) instead:
...
@@ -40,7 +40,7 @@ for sparse embedding update (written with `torch.optim.SparseAdam`) instead:
```
```
python model_sparse.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
python model_sparse.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
```
```
Note that since the embedding update is done on CPU, it will be significantly slower than doing
Note that since the embedding update is done on CPU, it will be significantly slower than doing
...
...
examples/pytorch/pinsage/model.py
View file @
bef99307
...
@@ -6,6 +6,7 @@ import torch.nn as nn
...
@@ -6,6 +6,7 @@ import torch.nn as nn
from
torch.utils.data
import
DataLoader
from
torch.utils.data
import
DataLoader
import
torchtext
import
torchtext
import
dgl
import
dgl
import
os
import
tqdm
import
tqdm
import
layers
import
layers
...
@@ -137,6 +138,10 @@ if __name__ == '__main__':
...
@@ -137,6 +138,10 @@ if __name__ == '__main__':
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# Load dataset
# Load dataset
with
open
(
args
.
dataset_path
,
'rb'
)
as
f
:
data_info_path
=
os
.
path
.
join
(
args
.
dataset_path
,
'data.pkl'
)
with
open
(
data_info_path
,
'rb'
)
as
f
:
dataset
=
pickle
.
load
(
f
)
dataset
=
pickle
.
load
(
f
)
train_g_path
=
os
.
path
.
join
(
args
.
dataset_path
,
'train_g.bin'
)
g_list
,
_
=
dgl
.
load_graphs
(
train_g_path
)
dataset
[
'train-graph'
]
=
g_list
[
0
]
train
(
dataset
,
args
)
train
(
dataset
,
args
)
examples/pytorch/pinsage/model_sparse.py
View file @
bef99307
...
@@ -6,6 +6,7 @@ import torch.nn as nn
...
@@ -6,6 +6,7 @@ import torch.nn as nn
from
torch.utils.data
import
DataLoader
from
torch.utils.data
import
DataLoader
import
torchtext
import
torchtext
import
dgl
import
dgl
import
os
import
tqdm
import
tqdm
import
layers
import
layers
...
@@ -142,6 +143,10 @@ if __name__ == '__main__':
...
@@ -142,6 +143,10 @@ if __name__ == '__main__':
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# Load dataset
# Load dataset
with
open
(
args
.
dataset_path
,
'rb'
)
as
f
:
data_info_path
=
os
.
path
.
join
(
args
.
dataset_path
,
'data.pkl'
)
with
open
(
data_info_path
,
'rb'
)
as
f
:
dataset
=
pickle
.
load
(
f
)
dataset
=
pickle
.
load
(
f
)
train_g_path
=
os
.
path
.
join
(
args
.
dataset_path
,
'train_g.bin'
)
g_list
,
_
=
dgl
.
load_graphs
(
train_g_path
)
dataset
[
'train-graph'
]
=
g_list
[
0
]
train
(
dataset
,
args
)
train
(
dataset
,
args
)
examples/pytorch/pinsage/process_movielens1m.py
View file @
bef99307
...
@@ -28,10 +28,11 @@ from data_utils import *
...
@@ -28,10 +28,11 @@ from data_utils import *
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'directory'
,
type
=
str
)
parser
.
add_argument
(
'directory'
,
type
=
str
)
parser
.
add_argument
(
'out
put_path
'
,
type
=
str
)
parser
.
add_argument
(
'out
_directory
'
,
type
=
str
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
directory
=
args
.
directory
directory
=
args
.
directory
output_path
=
args
.
output_path
out_directory
=
args
.
out_directory
os
.
makedirs
(
out_directory
,
exist_ok
=
True
)
## Build heterogeneous graph
## Build heterogeneous graph
...
@@ -139,8 +140,9 @@ if __name__ == '__main__':
...
@@ -139,8 +140,9 @@ if __name__ == '__main__':
## Dump the graph and the datasets
## Dump the graph and the datasets
dgl
.
save_graphs
(
os
.
path
.
join
(
out_directory
,
'train_g.bin'
),
train_g
)
dataset
=
{
dataset
=
{
'train-graph'
:
train_g
,
'val-matrix'
:
val_matrix
,
'val-matrix'
:
val_matrix
,
'test-matrix'
:
test_matrix
,
'test-matrix'
:
test_matrix
,
'item-texts'
:
movie_textual_dataset
,
'item-texts'
:
movie_textual_dataset
,
...
@@ -151,5 +153,5 @@ if __name__ == '__main__':
...
@@ -151,5 +153,5 @@ if __name__ == '__main__':
'item-to-user-type'
:
'watched-by'
,
'item-to-user-type'
:
'watched-by'
,
'timestamp-edge-column'
:
'timestamp'
}
'timestamp-edge-column'
:
'timestamp'
}
with
open
(
o
utput_path
,
'wb'
)
as
f
:
with
open
(
o
s
.
path
.
join
(
out_directory
,
'data.pkl'
)
,
'wb'
)
as
f
:
pickle
.
dump
(
dataset
,
f
)
pickle
.
dump
(
dataset
,
f
)
examples/pytorch/pinsage/process_nowplaying_rs.py
View file @
bef99307
...
@@ -5,6 +5,7 @@ file a heterogeneous graph with categorical and numeric features.
...
@@ -5,6 +5,7 @@ file a heterogeneous graph with categorical and numeric features.
import
os
import
os
import
argparse
import
argparse
import
dgl
import
pandas
as
pd
import
pandas
as
pd
import
scipy.sparse
as
ssp
import
scipy.sparse
as
ssp
import
pickle
import
pickle
...
@@ -14,10 +15,11 @@ from builder import PandasGraphBuilder
...
@@ -14,10 +15,11 @@ from builder import PandasGraphBuilder
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'directory'
,
type
=
str
)
parser
.
add_argument
(
'directory'
,
type
=
str
)
parser
.
add_argument
(
'out
put_path
'
,
type
=
str
)
parser
.
add_argument
(
'out
_directory
'
,
type
=
str
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
directory
=
args
.
directory
directory
=
args
.
directory
output_path
=
args
.
output_path
out_directory
=
args
.
out_directory
os
.
makedirs
(
out_directory
,
exist_ok
=
True
)
data
=
pd
.
read_csv
(
os
.
path
.
join
(
directory
,
'context_content_features.csv'
))
data
=
pd
.
read_csv
(
os
.
path
.
join
(
directory
,
'context_content_features.csv'
))
track_feature_cols
=
list
(
data
.
columns
[
1
:
13
])
track_feature_cols
=
list
(
data
.
columns
[
1
:
13
])
...
@@ -59,8 +61,9 @@ if __name__ == '__main__':
...
@@ -59,8 +61,9 @@ if __name__ == '__main__':
val_matrix
,
test_matrix
=
build_val_test_matrix
(
val_matrix
,
test_matrix
=
build_val_test_matrix
(
g
,
val_indices
,
test_indices
,
'user'
,
'track'
,
'listened'
)
g
,
val_indices
,
test_indices
,
'user'
,
'track'
,
'listened'
)
dgl
.
save_graphs
(
os
.
path
.
join
(
out_directory
,
'train_g.bin'
),
train_g
)
dataset
=
{
dataset
=
{
'train-graph'
:
train_g
,
'val-matrix'
:
val_matrix
,
'val-matrix'
:
val_matrix
,
'test-matrix'
:
test_matrix
,
'test-matrix'
:
test_matrix
,
'item-texts'
:
{},
'item-texts'
:
{},
...
@@ -71,5 +74,5 @@ if __name__ == '__main__':
...
@@ -71,5 +74,5 @@ if __name__ == '__main__':
'item-to-user-type'
:
'listened-by'
,
'item-to-user-type'
:
'listened-by'
,
'timestamp-edge-column'
:
'created_at'
}
'timestamp-edge-column'
:
'created_at'
}
with
open
(
o
utput_path
,
'wb'
)
as
f
:
with
open
(
o
s
.
path
.
join
(
out_directory
,
'data.pkl'
)
,
'wb'
)
as
f
:
pickle
.
dump
(
dataset
,
f
)
pickle
.
dump
(
dataset
,
f
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment