Unverified commit bef99307 authored by Mufei Li, committed by GitHub

[Bugfix] Fix PinSAGE Benchmark (#4058)

* Update

* Update

* Update dgl.data.rst

* CI
parent 7a065a9c
@@ -253,24 +253,16 @@ class PinsageDataset:
 def load_nowplaying_rs():
     import torchtext.legacy as torchtext
-    # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
-    name = 'nowplaying_rs.pkl'
+    # follow examples/pytorch/pinsage/README to create train_g.bin
+    name = 'train_g.bin'
     dataset_dir = os.path.join(os.getcwd(), 'dataset')
     os.symlink('/tmp/dataset/', dataset_dir)
     dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
-    # Load dataset
-    with open(dataset_path, 'rb') as f:
-        dataset = pickle.load(f)
-    g = dataset['train-graph']
-    val_matrix = dataset['val-matrix'].tocsr()
-    test_matrix = dataset['test-matrix'].tocsr()
-    item_texts = dataset['item-texts']
-    user_ntype = dataset['user-type']
-    item_ntype = dataset['item-type']
-    user_to_item_etype = dataset['user-to-item-type']
-    timestamp = dataset['timestamp-edge-column']
+    g_list, _ = dgl.load_graphs(dataset_path)
+    g = g_list[0]
+    user_ntype = 'user'
+    item_ntype = 'track'

     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
@@ -282,17 +274,11 @@ def load_nowplaying_rs():
     # Prepare torchtext dataset and vocabulary
     fields = {}
     examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(
-            include_lengths=True, lower=True, batch_first=True)
     for i in range(g.number_of_nodes(item_ntype)):
         example = torchtext.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
+            [], [])
         examples.append(example)
     textset = torchtext.data.Dataset(examples, fields)
-    for key, field in fields.items():
-        field.build_vocab(getattr(textset, key))

     return PinsageDataset(g, user_ntype, item_ntype, textset)
...
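For context on the first hunk: `dgl.load_graphs` returns a pair of a graph list and a label dictionary, which is why the fixed loader unpacks `g_list, _` and takes `g_list[0]`. The second hunk works because nowplaying-rs ships no item texts, so every torchtext example degenerates to `Example.fromlist([], [])`. A minimal sketch of the new load path, assuming a `train_g.bin` produced by the updated processing script (the relative path below is illustrative):

```
import os
import dgl

# Illustrative path; the benchmark symlinks /tmp/dataset/ into the CWD.
dataset_path = os.path.join('dataset', 'nowplaying_rs', 'train_g.bin')

# load_graphs returns (list_of_graphs, label_dict); this file holds one graph.
g_list, _ = dgl.load_graphs(dataset_path)
g = g_list[0]

# Node type names are now hard-coded rather than read from a pickle.
user_ntype, item_ntype = 'user', 'track'
print(g.number_of_nodes(item_ntype))
```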
@@ -51,7 +51,6 @@ Datasets for node classification/regression tasks
    BACommunityDataset
    TreeCycleDataset
    TreeGridDataset
-   BA2MotifDataset

 Edge Prediction Datasets
 ---------------------------------------
@@ -88,6 +87,7 @@ Datasets for graph classification/regression tasks
    LegacyTUDataset
    GINDataset
    FakeNewsDataset
+   BA2MotifDataset

 Dataset adapters
 -------------------
...
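The `dgl.data.rst` change relocates `BA2MotifDataset` from the node-classification list to the graph-classification list, which matches how the dataset is consumed: indexing yields a whole graph with a graph-level label. A hedged sketch of that usage (interface assumed to follow DGL's graph-classification dataset convention):

```
from dgl.data import BA2MotifDataset

dataset = BA2MotifDataset()
g, label = dataset[0]  # one synthetic BA graph and its motif class
print(g.number_of_nodes(), label)
```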
@@ -12,15 +12,15 @@
 1. Download and extract the MovieLens-1M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip
    into the current directory.
-2. Run `python process_movielens1m.py ./ml-1m ./data.pkl`.
-   Replace `ml-1m` with the directory you put the `.dat` files, and replace `data.pkl` to
-   any path you wish to put the output pickle file.
+2. Run `python process_movielens1m.py ./ml-1m ./data_processed`.
+   Replace `ml-1m` with the directory containing the `.dat` files, and replace `data_processed` with
+   any directory where you wish to put the output files.

 ### Nowplaying-rs

 1. Download and extract the Nowplaying-rs dataset from https://zenodo.org/record/3248543/files/nowplayingrs.zip?download=1
    into the current directory.
-2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data.pkl`
+2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data_processed`

 ## Run model
@@ -31,7 +31,7 @@ interacted. The distance between two items are measured by Euclidean distance o
 item embeddings, which are learned as outputs of PinSAGE.

 ```
-python model.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
+python model.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
 ```

 The implementation here also assigns a learnable vector to each item. If your hidden
@@ -40,7 +40,7 @@ for sparse embedding update (written with `torch.optim.SparseAdam`) instead:
 ```
-python model_sparse.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
+python model_sparse.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
 ```

 Note that since the embedding update is done on CPU, it will be significantly slower than doing
...
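With these README changes, each processing script now writes a directory instead of a single pickle, and the model scripts take that directory as their positional argument. A quick sanity check of the output layout, assuming the `data_processed` name from the README:

```
import os
import pickle

out_dir = 'data_processed'  # directory passed to the processing script

# The graph lives in DGL's binary format; everything else stays pickled.
assert os.path.exists(os.path.join(out_dir, 'train_g.bin'))
assert os.path.exists(os.path.join(out_dir, 'data.pkl'))

with open(os.path.join(out_dir, 'data.pkl'), 'rb') as f:
    dataset = pickle.load(f)
print(sorted(dataset.keys()))  # 'item-texts', 'test-matrix', 'val-matrix', ...
```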
@@ -6,6 +6,7 @@ import torch.nn as nn
 from torch.utils.data import DataLoader
 import torchtext
 import dgl
+import os
 import tqdm

 import layers
@@ -137,6 +138,10 @@ if __name__ == '__main__':
     args = parser.parse_args()

     # Load dataset
-    with open(args.dataset_path, 'rb') as f:
+    data_info_path = os.path.join(args.dataset_path, 'data.pkl')
+    with open(data_info_path, 'rb') as f:
         dataset = pickle.load(f)
+    train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
+    g_list, _ = dgl.load_graphs(train_g_path)
+    dataset['train-graph'] = g_list[0]

     train(dataset, args)
@@ -6,6 +6,7 @@ import torch.nn as nn
 from torch.utils.data import DataLoader
 import torchtext
 import dgl
+import os
 import tqdm

 import layers
@@ -142,6 +143,10 @@ if __name__ == '__main__':
     args = parser.parse_args()

     # Load dataset
-    with open(args.dataset_path, 'rb') as f:
+    data_info_path = os.path.join(args.dataset_path, 'data.pkl')
+    with open(data_info_path, 'rb') as f:
         dataset = pickle.load(f)
+    train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
+    g_list, _ = dgl.load_graphs(train_g_path)
+    dataset['train-graph'] = g_list[0]

     train(dataset, args)
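`model.py` and `model_sparse.py` receive the identical change: the dataset is reassembled from `data.pkl` plus `train_g.bin`. A sketch of that load step factored into a helper (the function name is mine, not part of the patch):

```
import os
import pickle
import dgl

def load_dataset(dataset_path):
    # Metadata (matrices, texts, type names) still travels via pickle...
    with open(os.path.join(dataset_path, 'data.pkl'), 'rb') as f:
        dataset = pickle.load(f)
    # ...while the graph comes from DGL's binary format and is re-attached
    # under the key the training code already expects.
    g_list, _ = dgl.load_graphs(os.path.join(dataset_path, 'train_g.bin'))
    dataset['train-graph'] = g_list[0]
    return dataset
```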
@@ -28,10 +28,11 @@ from data_utils import *
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('directory', type=str)
-    parser.add_argument('output_path', type=str)
+    parser.add_argument('out_directory', type=str)
     args = parser.parse_args()
     directory = args.directory
-    output_path = args.output_path
+    out_directory = args.out_directory
+    os.makedirs(out_directory, exist_ok=True)

     ## Build heterogeneous graph
@@ -139,8 +140,9 @@ if __name__ == '__main__':
     ## Dump the graph and the datasets

+    dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)
     dataset = {
-        'train-graph': train_g,
         'val-matrix': val_matrix,
         'test-matrix': test_matrix,
         'item-texts': movie_textual_dataset,
@@ -151,5 +153,5 @@ if __name__ == '__main__':
         'item-to-user-type': 'watched-by',
         'timestamp-edge-column': 'timestamp'}

-    with open(output_path, 'wb') as f:
+    with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
         pickle.dump(dataset, f)
@@ -5,6 +5,7 @@ file a heterogeneous graph with categorical and numeric features.
 import os
 import argparse
+import dgl
 import pandas as pd
 import scipy.sparse as ssp
 import pickle
@@ -14,10 +15,11 @@ from builder import PandasGraphBuilder
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('directory', type=str)
-    parser.add_argument('output_path', type=str)
+    parser.add_argument('out_directory', type=str)
     args = parser.parse_args()
     directory = args.directory
-    output_path = args.output_path
+    out_directory = args.out_directory
+    os.makedirs(out_directory, exist_ok=True)

     data = pd.read_csv(os.path.join(directory, 'context_content_features.csv'))
     track_feature_cols = list(data.columns[1:13])
@@ -59,8 +61,9 @@ if __name__ == '__main__':
     val_matrix, test_matrix = build_val_test_matrix(
         g, val_indices, test_indices, 'user', 'track', 'listened')

+    dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)
     dataset = {
-        'train-graph': train_g,
         'val-matrix': val_matrix,
         'test-matrix': test_matrix,
         'item-texts': {},
@@ -71,5 +74,5 @@ if __name__ == '__main__':
         'item-to-user-type': 'listened-by',
         'timestamp-edge-column': 'created_at'}

-    with open(output_path, 'wb') as f:
+    with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
         pickle.dump(dataset, f)
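A plausible motivation for this split, given the bugfix title: a pickled `DGLGraph` is tied to the DGL version that wrote it, while `save_graphs`/`load_graphs` go through DGL's stable serialization format. Assuming the metadata keys shown in the hunk above survive in `data.pkl`, the two artifacts can be cross-checked:

```
import os
import pickle
import dgl

out_directory = 'data_processed'  # as produced by process_nowplaying_rs.py

g_list, _ = dgl.load_graphs(os.path.join(out_directory, 'train_g.bin'))
train_g = g_list[0]
with open(os.path.join(out_directory, 'data.pkl'), 'rb') as f:
    meta = pickle.load(f)

# Names recorded in the metadata should match the saved graph's schema.
assert meta['item-to-user-type'] in train_g.etypes  # 'listened-by'
assert meta['timestamp-edge-column'] == 'created_at'
```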