"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "e4a4a29a7b5d451523e71af779f010841d0c8ca8"
Unverified Commit bef99307 authored by Mufei Li's avatar Mufei Li Committed by GitHub
Browse files

[Bugfix] Fix PinSAGE Benchmark (#4058)

* Update

* Update

* Update dgl.data.rst

* CI
parent 7a065a9c
......@@ -91,7 +91,7 @@ def get_graph(name, format = None):
g_list, _ = dgl.load_graphs(bin_path)
g = g_list[0]
else:
# the original node IDs of friendster are not consecutive, so we compact it
# the original node IDs of friendster are not consecutive, so we compact it
g = dgl.compact_graphs(get_friendster()).formats(format)
dgl.save_graphs(bin_path, [g])
elif name == "reddit":
......@@ -253,24 +253,16 @@ class PinsageDataset:
def load_nowplaying_rs():
import torchtext.legacy as torchtext
# follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
name = 'nowplaying_rs.pkl'
# follow examples/pytorch/pinsage/README to create train_g.bin
name = 'train_g.bin'
dataset_dir = os.path.join(os.getcwd(), 'dataset')
os.symlink('/tmp/dataset/', dataset_dir)
dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
# Load dataset
with open(dataset_path, 'rb') as f:
dataset = pickle.load(f)
g = dataset['train-graph']
val_matrix = dataset['val-matrix'].tocsr()
test_matrix = dataset['test-matrix'].tocsr()
item_texts = dataset['item-texts']
user_ntype = dataset['user-type']
item_ntype = dataset['item-type']
user_to_item_etype = dataset['user-to-item-type']
timestamp = dataset['timestamp-edge-column']
g_list, _ = dgl.load_graphs(dataset_path)
g = g_list[0]
user_ntype = 'user'
item_ntype = 'track'
# Assign user and movie IDs and use them as features (to learn an individual trainable
# embedding for each entity)
......@@ -282,17 +274,11 @@ def load_nowplaying_rs():
# Prepare torchtext dataset and vocabulary
fields = {}
examples = []
for key, texts in item_texts.items():
fields[key] = torchtext.data.Field(
include_lengths=True, lower=True, batch_first=True)
for i in range(g.number_of_nodes(item_ntype)):
example = torchtext.data.Example.fromlist(
[item_texts[key][i] for key in item_texts.keys()],
[(key, fields[key]) for key in item_texts.keys()])
[], [])
examples.append(example)
textset = torchtext.data.Dataset(examples, fields)
for key, field in fields.items():
field.build_vocab(getattr(textset, key))
return PinsageDataset(g, user_ntype, item_ntype, textset)
......
......@@ -51,7 +51,6 @@ Datasets for node classification/regression tasks
BACommunityDataset
TreeCycleDataset
TreeGridDataset
BA2MotifDataset
Edge Prediction Datasets
---------------------------------------
......@@ -88,6 +87,7 @@ Datasets for graph classification/regression tasks
LegacyTUDataset
GINDataset
FakeNewsDataset
BA2MotifDataset
Dataset adapters
-------------------
......
......@@ -12,15 +12,15 @@
1. Download and extract the MovieLens-1M dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip
into the current directory.
2. Run `python process_movielens1m.py ./ml-1m ./data.pkl`.
Replace `ml-1m` with the directory you put the `.dat` files, and replace `data.pkl` to
any path you wish to put the output pickle file.
2. Run `python process_movielens1m.py ./ml-1m ./data_processed`.
Replace `ml-1m` with the directory you put the `.dat` files, and replace `data_processed` with
any path you wish to put the output files.
### Nowplaying-rs
1. Download and extract the Nowplaying-rs dataset from https://zenodo.org/record/3248543/files/nowplayingrs.zip?download=1
into the current directory.
2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data.pkl`
2. Run `python process_nowplaying_rs.py ./nowplaying_rs_dataset ./data_processed`
## Run model
......@@ -31,7 +31,7 @@ interacted. The distance between two items are measured by Euclidean distance o
item embeddings, which are learned as outputs of PinSAGE.
```
python model.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
python model.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 64
```
The implementation here also assigns a learnable vector to each item. If your hidden
......@@ -40,7 +40,7 @@ for sparse embedding update (written with `torch.optim.SparseAdam`) instead:
```
python model_sparse.py data.pkl --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
python model_sparse.py data_processed --num-epochs 300 --num-workers 2 --device cuda:0 --hidden-dims 1024
```
Note that since the embedding update is done on CPU, it will be significantly slower than doing
......
......@@ -6,6 +6,7 @@ import torch.nn as nn
from torch.utils.data import DataLoader
import torchtext
import dgl
import os
import tqdm
import layers
......@@ -137,6 +138,10 @@ if __name__ == '__main__':
args = parser.parse_args()
# Load dataset
with open(args.dataset_path, 'rb') as f:
data_info_path = os.path.join(args.dataset_path, 'data.pkl')
with open(data_info_path, 'rb') as f:
dataset = pickle.load(f)
train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
g_list, _ = dgl.load_graphs(train_g_path)
dataset['train-graph'] = g_list[0]
train(dataset, args)
......@@ -6,6 +6,7 @@ import torch.nn as nn
from torch.utils.data import DataLoader
import torchtext
import dgl
import os
import tqdm
import layers
......@@ -142,6 +143,10 @@ if __name__ == '__main__':
args = parser.parse_args()
# Load dataset
with open(args.dataset_path, 'rb') as f:
data_info_path = os.path.join(args.dataset_path, 'data.pkl')
with open(data_info_path, 'rb') as f:
dataset = pickle.load(f)
train_g_path = os.path.join(args.dataset_path, 'train_g.bin')
g_list, _ = dgl.load_graphs(train_g_path)
dataset['train-graph'] = g_list[0]
train(dataset, args)
......@@ -28,10 +28,11 @@ from data_utils import *
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('directory', type=str)
parser.add_argument('output_path', type=str)
parser.add_argument('out_directory', type=str)
args = parser.parse_args()
directory = args.directory
output_path = args.output_path
out_directory = args.out_directory
os.makedirs(out_directory, exist_ok=True)
## Build heterogeneous graph
......@@ -139,8 +140,9 @@ if __name__ == '__main__':
## Dump the graph and the datasets
dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)
dataset = {
'train-graph': train_g,
'val-matrix': val_matrix,
'test-matrix': test_matrix,
'item-texts': movie_textual_dataset,
......@@ -151,5 +153,5 @@ if __name__ == '__main__':
'item-to-user-type': 'watched-by',
'timestamp-edge-column': 'timestamp'}
with open(output_path, 'wb') as f:
with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
pickle.dump(dataset, f)
......@@ -5,6 +5,7 @@ file a heterogeneous graph with categorical and numeric features.
import os
import argparse
import dgl
import pandas as pd
import scipy.sparse as ssp
import pickle
......@@ -14,10 +15,11 @@ from builder import PandasGraphBuilder
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('directory', type=str)
parser.add_argument('output_path', type=str)
parser.add_argument('out_directory', type=str)
args = parser.parse_args()
directory = args.directory
output_path = args.output_path
out_directory = args.out_directory
os.makedirs(out_directory, exist_ok=True)
data = pd.read_csv(os.path.join(directory, 'context_content_features.csv'))
track_feature_cols = list(data.columns[1:13])
......@@ -59,8 +61,9 @@ if __name__ == '__main__':
val_matrix, test_matrix = build_val_test_matrix(
g, val_indices, test_indices, 'user', 'track', 'listened')
dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)
dataset = {
'train-graph': train_g,
'val-matrix': val_matrix,
'test-matrix': test_matrix,
'item-texts': {},
......@@ -71,5 +74,5 @@ if __name__ == '__main__':
'item-to-user-type': 'listened-by',
'timestamp-edge-column': 'created_at'}
with open(output_path, 'wb') as f:
with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
pickle.dump(dataset, f)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment