OpenDAS / dgl · Commit 6367318f (unverified)

[Model] use dask to parallelize preprocessing (#2040)

Authored Aug 18, 2020 by Quan (Andy) Gan; committed via GitHub on Aug 18, 2020. Parent commit: 47c96dd1.

4 changed files, with 200 additions and 190 deletions:
- examples/pytorch/pinsage/README.md (+6 −0)
- examples/pytorch/pinsage/data_utils.py (+14 −12)
- examples/pytorch/pinsage/process_movielens1m.py (+127 −126)
- examples/pytorch/pinsage/process_nowplaying_rs.py (+53 −52)
examples/pytorch/pinsage/README.md (view file @ 6367318f)
The visible portion of the README now reads:

```markdown
# PinSAGE example

## Requirements

- dask
- pandas
- torchtext

## Prepare datasets

### MovieLens 1M
```
...
examples/pytorch/pinsage/data_utils.py (view file @ 6367318f)
The per-item train/validation/test split, previously a sequential pandas loop over items, now partitions the frame with dask and runs each group's split on the multiprocessing scheduler:

```diff
@@ -2,26 +2,28 @@ import torch
 import dgl
 import numpy as np
 import scipy.sparse as ssp
 import tqdm
+import dask.dataframe as dd
 
 # This is the train-test split method most of the recommender system papers running on MovieLens
 # take. It essentially follows the intuition of "training on the past and predicting the future".
 # One can also change the threshold to make the validation and test sets take larger proportions.
 def train_test_split_by_time(df, timestamp, item):
     df = df.copy()
     df['train_mask'] = np.ones((len(df),), dtype=np.bool)
     df['val_mask'] = np.zeros((len(df),), dtype=np.bool)
     df['test_mask'] = np.zeros((len(df),), dtype=np.bool)
-    df = df.sort_values([item, timestamp])
-    for track_id in df[item].unique():
-        idx = (df[item] == track_id).to_numpy().nonzero()[0]
-        idx = df.index[idx]
-        if len(idx) > 1:
-            df.loc[idx[-1], 'train_mask'] = False
-            df.loc[idx[-1], 'test_mask'] = True
-        if len(idx) > 2:
-            df.loc[idx[-2], 'train_mask'] = False
-            df.loc[idx[-2], 'val_mask'] = True
-    df = df.sort_index()
+    df = dd.from_pandas(df, npartitions=10)
+    def train_test_split(df):
+        df = df.sort_values([timestamp])
+        if df.shape[0] > 1:
+            df.iloc[-1, -3] = False
+            df.iloc[-1, -1] = True
+        if df.shape[0] > 2:
+            df.iloc[-2, -3] = False
+            df.iloc[-2, -2] = True
+        return df
+    df = df.groupby(item).apply(train_test_split).compute(scheduler='processes').sort_index()
+    print(df[df[item] == df[item].unique()[0]].sort_values(timestamp))
     return df['train_mask'].to_numpy().nonzero()[0], \
            df['val_mask'].to_numpy().nonzero()[0], \
            df['test_mask'].to_numpy().nonzero()[0]
```
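The core pattern here is worth isolating: convert the pandas frame into a partitioned dask frame, apply the per-item split to each group, and compute with the multiprocessing scheduler. Below is a minimal self-contained sketch of the same pattern; the toy data, column names, and the `mark_last_two` helper are illustrative, not from the commit:

```python
import numpy as np
import pandas as pd
import dask.dataframe as dd

def mark_last_two(group):
    # Last interaction of each item -> test; second-to-last -> validation.
    group = group.sort_values('timestamp')
    split_col = group.columns.get_loc('split')
    if len(group) > 1:
        group.iloc[-1, split_col] = 'test'
    if len(group) > 2:
        group.iloc[-2, split_col] = 'val'
    return group

if __name__ == '__main__':
    df = pd.DataFrame({
        'item': np.repeat(['a', 'b'], 5),
        'timestamp': np.tile(np.arange(5), 2),
        'split': 'train',
    })
    ddf = dd.from_pandas(df, npartitions=2)
    # Groups are independent of each other, so dask can fan them out
    # across worker processes; scheduler='processes' does exactly that.
    out = (ddf.groupby('item')
              .apply(mark_last_two, meta=df)
              .compute(scheduler='processes')
              .sort_index())
    print(out)
```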
examples/pytorch/pinsage/process_movielens1m.py (view file @ 6367318f)

The diff in this file is almost entirely reindentation: the module-level script body moves under an `if __name__ == '__main__':` guard, so every old line reappears one indent level deeper. The first hunk shows the change itself; the remaining hunks are shown in their new form only.
```diff
@@ -25,18 +25,19 @@ import torchtext
 from builder import PandasGraphBuilder
 from data_utils import *
 
-parser = argparse.ArgumentParser()
-parser.add_argument('directory', type=str)
-parser.add_argument('output_path', type=str)
-args = parser.parse_args()
-directory = args.directory
-output_path = args.output_path
-
-## Build heterogeneous graph
-
-# Load data
-users = []
-with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
-    for l in f:
-        id_, gender, age, occupation, zip_ = l.strip().split('::')
-        users.append({
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('directory', type=str)
+    parser.add_argument('output_path', type=str)
+    args = parser.parse_args()
+    directory = args.directory
+    output_path = args.output_path
+
+    ## Build heterogeneous graph
+
+    # Load data
+    users = []
+    with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:
+        for l in f:
+            id_, gender, age, occupation, zip_ = l.strip().split('::')
+            users.append({
```
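The new guard is not cosmetic: dask's `'processes'` scheduler is built on `multiprocessing`, whose worker processes re-import the entry script. Without the guard, each worker would re-run the whole module-level preprocessing on import. A minimal sketch of the pattern being adopted (the `preprocess` helper is illustrative):

```python
import pandas as pd
import dask.dataframe as dd

def preprocess():
    # Stand-in for the real preprocessing work.
    ddf = dd.from_pandas(pd.DataFrame({'x': range(100)}), npartitions=4)
    return ddf['x'].sum().compute(scheduler='processes')

if __name__ == '__main__':
    # Runs only in the parent process; workers that re-import this
    # module skip everything under the guard.
    print(preprocess())
```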
@@ -46,10 +47,10 @@ with open(os.path.join(directory, 'users.dat'), encoding='latin1') as f:

```python
                'occupation': occupation,
                'zip': zip_,
                })
    users = pd.DataFrame(users).astype('category')

    movies = []
    with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:
        for l in f:
            id_, title, genres = l.strip().split('::')
            genres_set = set(genres.split('|'))
```
@@ -63,10 +64,10 @@ with open(os.path.join(directory, 'movies.dat'), encoding='latin1') as f:

```python
            for g in genres_set:
                data[g] = True
            movies.append(data)
    movies = pd.DataFrame(movies).astype({'year': 'category'})

    ratings = []
    with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:
        for l in f:
            user_id, movie_id, rating, timestamp = [int(_) for _ in l.split('::')]
            ratings.append({
```
@@ -75,69 +76,69 @@ with open(os.path.join(directory, 'ratings.dat'), encoding='latin1') as f:

```python
    ratings = pd.DataFrame(ratings)

    # Filter the users and items that never appear in the rating table.
    distinct_users_in_ratings = ratings['user_id'].unique()
    distinct_movies_in_ratings = ratings['movie_id'].unique()
    users = users[users['user_id'].isin(distinct_users_in_ratings)]
    movies = movies[movies['movie_id'].isin(distinct_movies_in_ratings)]

    # Group the movie features into genres (a vector), year (a category) and title (a string).
    genre_columns = movies.columns.drop(['movie_id', 'title', 'year'])
    movies[genre_columns] = movies[genre_columns].fillna(False).astype('bool')
    movies_categorical = movies.drop('title', axis=1)

    # Build graph
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(movies_categorical, 'movie_id', 'movie')
    graph_builder.add_binary_relations(ratings, 'user_id', 'movie_id', 'watched')
    graph_builder.add_binary_relations(ratings, 'movie_id', 'user_id', 'watched-by')

    g = graph_builder.build()

    # Assign features.
    # Note that variable-sized features such as texts or images are handled elsewhere.
    g.nodes['user'].data['gender'] = torch.LongTensor(users['gender'].cat.codes.values)
    g.nodes['user'].data['age'] = torch.LongTensor(users['age'].cat.codes.values)
    g.nodes['user'].data['occupation'] = torch.LongTensor(users['occupation'].cat.codes.values)
    g.nodes['user'].data['zip'] = torch.LongTensor(users['zip'].cat.codes.values)
    g.nodes['movie'].data['year'] = torch.LongTensor(movies['year'].cat.codes.values)
    g.nodes['movie'].data['genre'] = torch.FloatTensor(movies[genre_columns].values)
    g.edges['watched'].data['rating'] = torch.LongTensor(ratings['rating'].values)
    g.edges['watched'].data['timestamp'] = torch.LongTensor(ratings['timestamp'].values)
    g.edges['watched-by'].data['rating'] = torch.LongTensor(ratings['rating'].values)
    g.edges['watched-by'].data['timestamp'] = torch.LongTensor(ratings['timestamp'].values)

    # Train-validation-test split.
    # This is a little bit tricky as we want to select the last interaction for test
    # and the second-to-last interaction for validation.
    train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'movie_id')

    # Build the graph with training interactions only.
    train_g = build_train_graph(g, train_indices, 'user', 'movie', 'watched', 'watched-by')

    # Build the user-item sparse matrix for validation and test set.
    val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'movie', 'watched')

    ## Build title set

    movie_textual_dataset = {'title': movies['title'].values}

    # The model should build its own vocabulary and process the texts. Here is one example
    # of using torchtext to pad and numericalize a batch of strings.
    #     field = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
    #     examples = [torchtext.data.Example.fromlist([t], [('title', title_field)]) for t in texts]
    #     titleset = torchtext.data.Dataset(examples, [('title', title_field)])
    #     field.build_vocab(titleset.title, vectors='fasttext.simple.300d')
    #     token_ids, lengths = field.process([examples[0].title, examples[1].title])

    ## Dump the graph and the datasets

    dataset = {
        'train-graph': train_g,
        'val-matrix': val_matrix,
        'test-matrix': test_matrix,
```
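As an aside, the `cat.codes` idiom above maps each pandas categorical onto a dense integer index, which is the form `torch.LongTensor` features (and downstream embedding lookups) expect. A tiny illustration with made-up values:

```python
import pandas as pd
import torch

gender = pd.Series(['M', 'F', 'M']).astype('category')
# Categories are sorted, so 'F' -> 0 and 'M' -> 1.
print(torch.LongTensor(gender.cat.codes.values))  # tensor([1, 0, 1])
```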
@@ -149,5 +150,5 @@ dataset = {

```python
        'item-to-user-type': 'watched-by',
        'timestamp-edge-column': 'timestamp'}

    with open(output_path, 'wb') as f:
        pickle.dump(dataset, f)
```
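On the consumer side, a training script would reload the bundle with a plain `pickle.load`; a sketch, where the file name is illustrative and the keys are the ones dumped above:

```python
import pickle

with open('movielens.pkl', 'rb') as f:  # illustrative path
    dataset = pickle.load(f)

train_g = dataset['train-graph']      # graph with training interactions only
val_matrix = dataset['val-matrix']    # user-item matrix for validation
test_matrix = dataset['test-matrix']  # user-item matrix for test
```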
examples/pytorch/pinsage/process_nowplaying_rs.py (view file @ 6367318f)

Same treatment as process_movielens1m.py: the script body moves under an `if __name__ == '__main__':` guard, so the hunks below are shown in their new, reindented form only.
@@ -11,34 +11,35 @@ import pickle

```python
from data_utils import *
from builder import PandasGraphBuilder

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('directory', type=str)
    parser.add_argument('output_path', type=str)
    args = parser.parse_args()
    directory = args.directory
    output_path = args.output_path

    data = pd.read_csv(os.path.join(directory, 'context_content_features.csv'))
    track_feature_cols = list(data.columns[1:13])
    data = data[['user_id', 'track_id', 'created_at'] + track_feature_cols].dropna()

    users = data[['user_id']].drop_duplicates()
    tracks = data[['track_id'] + track_feature_cols].drop_duplicates()
    assert tracks['track_id'].value_counts().max() == 1
    tracks = tracks.astype({'mode': 'int64', 'key': 'int64', 'artist_id': 'category'})
    events = data[['user_id', 'track_id', 'created_at']]
    events['created_at'] = events['created_at'].values.astype('datetime64[s]').astype('int64')

    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(users, 'user_id', 'user')
    graph_builder.add_entities(tracks, 'track_id', 'track')
    graph_builder.add_binary_relations(events, 'user_id', 'track_id', 'listened')
    graph_builder.add_binary_relations(events, 'track_id', 'user_id', 'listened-by')

    g = graph_builder.build()

    float_cols = []
    for col in tracks.columns:
        if col == 'track_id':
            continue
        elif col == 'artist_id':
```
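The `created_at` conversion above is the usual numpy route from timestamp strings to integer epoch seconds, assuming the column holds parseable timestamps as in the nowplaying-RS CSV; for illustration:

```python
import pandas as pd

s = pd.Series(['2014-01-01 00:00:00', '2014-01-01 00:00:01'])
print(s.values.astype('datetime64[s]').astype('int64'))
# [1388534400 1388534401]
```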
@@ -47,17 +48,17 @@ for col in tracks.columns:

```python
            float_cols.append(col)
        else:
            g.nodes['track'].data[col] = torch.LongTensor(tracks[col].values)
    g.nodes['track'].data['song_features'] = torch.FloatTensor(linear_normalize(tracks[float_cols].values))
    g.edges['listened'].data['created_at'] = torch.LongTensor(events['created_at'].values)
    g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)

    n_edges = g.number_of_edges('listened')
    train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'track_id')
    train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
    val_matrix, test_matrix = build_val_test_matrix(
        g, val_indices, test_indices, 'user', 'track', 'listened')

    dataset = {
        'train-graph': train_g,
        'val-matrix': val_matrix,
        'test-matrix': test_matrix,
```
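`linear_normalize` is pulled in from `data_utils` via the star import; judging from its use on the float feature columns, it appears to min-max scale each column into [0, 1]. A sketch of that assumed behavior (the function below is my reconstruction, not the commit's code):

```python
import numpy as np

def linear_normalize_sketch(values):
    # Assumed behavior of data_utils.linear_normalize:
    # column-wise min-max scaling into [0, 1].
    vmin = values.min(axis=0, keepdims=True)
    vmax = values.max(axis=0, keepdims=True)
    return (values - vmin) / (vmax - vmin)

feats = np.array([[0.0, 10.0],
                  [5.0, 20.0],
                  [10.0, 30.0]])
print(linear_normalize_sketch(feats))
# [[0.  0. ]
#  [0.5 0.5]
#  [1.  1. ]]
```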
@@ -69,5 +70,5 @@ dataset = {

```python
        'item-to-user-type': 'listened-by',
        'timestamp-edge-column': 'created_at'}

    with open(output_path, 'wb') as f:
        pickle.dump(dataset, f)
```