Unverified commit 9706eaa8, authored by Kay Liu and committed by GitHub

[Feature] add permission information and fix import problems (#3036)

* [Feature] add positive/negative statistics
* [Feature] add permission information and fix import problems
* fix backend incompatibility problem
* modify random split to remove sklearn usage
* modify file read to remove pandas usage
* add datasets into doc
* add random seed in data splitting
* add dataset unit test
* update usage permission information

Co-authored-by: zhjwy9343 <6593865@qq.com>
parent e57c6e35
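
For context before the diffs, a minimal usage sketch of the two dataset families this commit exposes; the exact return values are inferred from the docstrings and unit tests below, not stated by the commit itself.

    import dgl.data

    # The fraud datasets each hold a single heterogeneous graph; labels
    # and the train/val/test masks live in g.ndata after this commit.
    dataset = dgl.data.FraudYelpDataset()
    g = dataset[0]
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']

    # FakeNewsDataset is a graph-classification dataset ('politifact' or
    # 'gossipcop') with selectable node features ('bert', 'content',
    # 'profile', or 'spacy'); indexing is assumed to yield (graph, label).
    dataset = dgl.data.FakeNewsDataset('politifact', 'bert')
    graph, label = dataset[0]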
@@ -117,6 +117,19 @@ Symmetric Stochastic Block Model Mixture dataset
 .. autoclass:: SBMMixtureDataset
     :members: __getitem__, __len__, collate_fn
 
+.. _frauddata:
+
+Fraud dataset
+``````````````
+
+.. autoclass:: FraudDataset
+    :members: __getitem__, __len__
+
+.. autoclass:: FraudYelpDataset
+    :members: __getitem__, __len__
+
+.. autoclass:: FraudAmazonDataset
+    :members: __getitem__, __len__
+
 Edge Prediction Datasets
 ---------------------------------------
@@ -207,6 +220,13 @@ Graph isomorphism network dataset
 .. autoclass:: GINDataset
     :members: __getitem__, __len__
 
+.. _fakenewsdata:
+
+Fake news dataset
+```````````````````````````````````
+
+.. autoclass:: FakeNewsDataset
+    :members: __getitem__, __len__
+
 Utilities
 -----------------
......
@@ -27,6 +27,8 @@ from .dgl_dataset import DGLDataset, DGLBuiltinDataset
 from .citation_graph import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset
 from .knowledge_graph import FB15k237Dataset, FB15kDataset, WN18Dataset
 from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
+from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
+from .fakenews import FakeNewsDataset
 
 def register_data_args(parser):
......
-import torch
 import os
 import numpy as np
-import pandas as pd
 import scipy.sparse as sp
 
 from .dgl_dataset import DGLBuiltinDataset
 from .utils import save_graphs, load_graphs, _get_dgl_url
 from .utils import save_info, load_info
 from ..convert import graph
+from .. import backend as F
 
 
 class FakeNewsDataset(DGLBuiltinDataset):
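
Worth noting for reviewers: `F.tensor` is what makes these datasets backend-agnostic; it converts NumPy data into the tensor type of whichever backend DGL was loaded with. A minimal sketch using the public `dgl.backend` module (the relative import above only works inside the package):

    import numpy as np
    from dgl import backend as F

    # F.tensor converts array-like data into the active backend's tensor
    # type (PyTorch, TensorFlow, or MXNet), removing the hard torch
    # dependency that this commit strips out.
    mask = np.zeros(10, dtype=bool)
    mask_t = F.tensor(mask)  # a torch.Tensor under the default PyTorch backend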
@@ -30,6 +29,8 @@ class FakeNewsDataset(DGLBuiltinDataset):
     spacy: the 300-dimensional node feature composed of Twitter user
         historical tweets encoded by the spaCy word2vec encoder.
 
+    Note: this dataset is for academic use only, and commercial use is prohibited.
+
     Statistics:
 
     Politifact:
@@ -86,7 +87,7 @@ class FakeNewsDataset(DGLBuiltinDataset):
         Graph labels
     feature_name : str
         Name of the feature (bert, content, profile, or spacy)
-    feature : scipy.sparse.csr.csr_matrix
+    feature : Tensor
         Node features
     train_mask : Tensor
         Mask of training set
@@ -122,14 +123,13 @@ class FakeNewsDataset(DGLBuiltinDataset):
     def process(self):
         """process raw data to graph, labels and masks"""
-        self.labels = np.load(os.path.join(self.raw_path, 'graph_labels.npy'))
-        self.labels = torch.LongTensor(self.labels)
+        self.labels = F.tensor(np.load(os.path.join(self.raw_path, 'graph_labels.npy')))
         num_graphs = self.labels.shape[0]
 
         node_graph_id = np.load(os.path.join(self.raw_path, 'node_graph_id.npy'))
-        edges = pd.read_csv(os.path.join(self.raw_path, 'A.txt'), header=None)
-        src = edges[0].to_numpy()
-        dst = edges[1].to_numpy()
+        edges = np.genfromtxt(os.path.join(self.raw_path, 'A.txt'), delimiter=',', dtype=int)
+        src = edges[:, 0]
+        dst = edges[:, 1]
         g = graph((src, dst))
 
         node_idx_list = []
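
The pandas-to-NumPy swap in the hunk above assumes A.txt is a comma-delimited edge list; a small equivalence sketch (file name as in the diff, contents assumed):

    import numpy as np

    # A.txt is assumed to hold one 'src,dst' pair per line, e.g.
    #   0,1
    #   0,2
    edges = np.genfromtxt('A.txt', delimiter=',', dtype=int)
    src, dst = edges[:, 0], edges[:, 1]  # the arrays pd.read_csv gave via columns 0 and 1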
@@ -142,18 +142,18 @@ class FakeNewsDataset(DGLBuiltinDataset):
         train_idx = np.load(os.path.join(self.raw_path, 'train_idx.npy'))
         val_idx = np.load(os.path.join(self.raw_path, 'val_idx.npy'))
         test_idx = np.load(os.path.join(self.raw_path, 'test_idx.npy'))
-        train_mask = torch.zeros(num_graphs, dtype=torch.bool)
-        val_mask = torch.zeros(num_graphs, dtype=torch.bool)
-        test_mask = torch.zeros(num_graphs, dtype=torch.bool)
+        train_mask = np.zeros(num_graphs, dtype=np.bool)
+        val_mask = np.zeros(num_graphs, dtype=np.bool)
+        test_mask = np.zeros(num_graphs, dtype=np.bool)
         train_mask[train_idx] = True
         val_mask[val_idx] = True
         test_mask[test_idx] = True
-        self.train_mask = train_mask
-        self.val_mask = val_mask
-        self.test_mask = test_mask
+        self.train_mask = F.tensor(train_mask)
+        self.val_mask = F.tensor(val_mask)
+        self.test_mask = F.tensor(test_mask)
 
         feature_file = 'new_' + self.feature_name + '_feature.npz'
-        self.feature = sp.load_npz(os.path.join(self.raw_path, feature_file))
+        self.feature = F.tensor(sp.load_npz(os.path.join(self.raw_path, feature_file)).todense())
 
     def save(self):
         """save the graph list and the labels"""
......
"""Fraud Dataset """Fraud Dataset
""" """
import torch
import os import os
from scipy import io from scipy import io
from sklearn.model_selection import train_test_split import numpy as np
from .utils import save_graphs, load_graphs, _get_dgl_url from .utils import save_graphs, load_graphs, _get_dgl_url
from ..convert import heterograph from ..convert import heterograph
from ..utils import graphdata2tensors from ..utils import graphdata2tensors
from .dgl_dataset import DGLBuiltinDataset from .dgl_dataset import DGLBuiltinDataset
from .. import backend as F
class FraudDataset(DGLBuiltinDataset): class FraudDataset(DGLBuiltinDataset):
@@ -39,7 +39,7 @@ class FraudDataset(DGLBuiltinDataset):
         Default: ~/.dgl/
     random_seed : int
         Specifying the random seed in splitting the dataset.
-        Default: 2
+        Default: 717
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -86,7 +86,7 @@ class FraudDataset(DGLBuiltinDataset):
         'amazon': 'review'
     }
 
-    def __init__(self, name, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
+    def __init__(self, name, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
         assert name in ['yelp', 'amazon'], "only supports 'yelp', or 'amazon'"
         url = _get_dgl_url(self.file_urls[name])
         self.seed = random_seed
@@ -101,9 +101,8 @@ class FraudDataset(DGLBuiltinDataset):
         file_path = os.path.join(self.raw_path, self.file_names[self.name])
         data = io.loadmat(file_path)
-        node_features = torch.from_numpy(data['features'].todense())
-        node_labels = torch.from_numpy(data['label'])
-        node_labels = node_labels.transpose(0, 1)
+        node_features = data['features'].todense()
+        node_labels = data['label']
 
         graph_data = {}
         for relation in self.relations[self.name]:
@@ -111,11 +110,11 @@ class FraudDataset(DGLBuiltinDataset):
             graph_data[(self.node_name[self.name], relation, self.node_name[self.name])] = (u, v)
         g = heterograph(graph_data)
 
-        g.ndata['feature'] = node_features
-        g.ndata['label'] = node_labels
+        g.ndata['feature'] = F.tensor(node_features)
+        g.ndata['label'] = F.tensor(node_labels.T)
 
         self.graph = g
-        self._random_split(g.ndata['feature'], g.ndata['label'], self.seed, self.train_size, self.val_size)
+        self._random_split(g.ndata['feature'], self.seed, self.train_size, self.val_size)
 
     def __getitem__(self, idx):
         r""" Get graph object
r""" Get graph object r""" Get graph object
...@@ -170,42 +169,32 @@ class FraudDataset(DGLBuiltinDataset): ...@@ -170,42 +169,32 @@ class FraudDataset(DGLBuiltinDataset):
graph_path = os.path.join(self.save_path, self.name + '_dgl_graph.bin') graph_path = os.path.join(self.save_path, self.name + '_dgl_graph.bin')
return os.path.exists(graph_path) return os.path.exists(graph_path)
def _random_split(self, x, node_labels, seed=2, train_size=0.7, val_size=0.1): def _random_split(self, x, seed=717, train_size=0.7, val_size=0.1):
"""split the dataset into training set, validation set and testing set""" """split the dataset into training set, validation set and testing set"""
assert 0 <= train_size + val_size <= 1, \
"The sum of valid training set size and validation set size " \
"must between 0 and 1 (inclusive)."
N = x.shape[0] N = x.shape[0]
index = list(range(N)) index = list(range(N))
train_idx, test_idx, _, y = train_test_split(index,
node_labels,
stratify=node_labels,
train_size=train_size,
random_state=seed,
shuffle=True)
if self.name == 'amazon': if self.name == 'amazon':
# 0-3304 are unlabeled nodes # 0-3304 are unlabeled nodes
index = list(range(3305, N)) index = list(range(3305, N))
train_idx, test_idx, _, y = train_test_split(index,
node_labels[3305:], np.random.RandomState(seed).permutation(index)
stratify=node_labels[3305:], train_idx = index[:int(train_size * N)]
test_size=train_size, val_idx = index[int(N - val_size * N):]
random_state=seed, test_idx = index[int(train_size * N):int(N - val_size * N)]
shuffle=True) train_mask = np.zeros(N, dtype=np.bool)
val_mask = np.zeros(N, dtype=np.bool)
val_idx, test_idx, _, _ = train_test_split(test_idx, test_mask = np.zeros(N, dtype=np.bool)
y,
stratify=y,
train_size=val_size / (1 - train_size),
random_state=seed,
shuffle=True)
train_mask = torch.zeros(N, dtype=torch.bool)
val_mask = torch.zeros(N, dtype=torch.bool)
test_mask = torch.zeros(N, dtype=torch.bool)
train_mask[train_idx] = True train_mask[train_idx] = True
val_mask[val_idx] = True val_mask[val_idx] = True
test_mask[test_idx] = True test_mask[test_idx] = True
self.graph.ndata['train_mask'] = train_mask self.graph.ndata['train_mask'] = F.tensor(train_mask)
self.graph.ndata['val_mask'] = val_mask self.graph.ndata['val_mask'] = F.tensor(val_mask)
self.graph.ndata['test_mask'] = test_mask self.graph.ndata['test_mask'] = F.tensor(test_mask)
class FraudYelpDataset(FraudDataset): class FraudYelpDataset(FraudDataset):
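
One behavioral note on the hunk above: unlike sklearn's `train_test_split(..., stratify=...)`, the seeded-permutation split is not stratified by label; reproducibility now comes only from the fixed seed. A standalone sketch of the new logic (function name is illustrative, not part of the diff):

    import numpy as np

    def random_split(N, seed=717, train_size=0.7, val_size=0.1):
        # Shuffle all indices once with a fixed seed, then take contiguous
        # slices of the shuffled order as train/val/test.
        index = np.random.RandomState(seed).permutation(np.arange(N))
        train_idx = index[:int(train_size * N)]
        val_idx = index[int(N - val_size * N):]
        test_idx = index[int(train_size * N):int(N - val_size * N)]
        return train_idx, val_idx, test_idx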
@@ -242,9 +231,8 @@ class FraudYelpDataset(FraudDataset):
         already stores the input data.
         Default: ~/.dgl/
     random_seed : int
-        Specifying the random seed in splitting the
-        dataset.
-        Default: 2
+        Specifying the random seed in splitting the dataset.
+        Default: 717
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -262,7 +250,7 @@ class FraudYelpDataset(FraudDataset):
     >>> label = dataset.ndata['label']
     """
 
-    def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
+    def __init__(self, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
         super(FraudYelpDataset, self).__init__(name='yelp',
                                                raw_dir=raw_dir,
                                                random_seed=random_seed,
@@ -308,9 +296,8 @@ class FraudAmazonDataset(FraudDataset):
         already stores the input data.
         Default: ~/.dgl/
     random_seed : int
-        Specifying the random seed in splitting the
-        dataset.
-        Default: 2
+        Specifying the random seed in splitting the dataset.
+        Default: 717
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -328,7 +315,7 @@ class FraudAmazonDataset(FraudDataset):
     >>> label = dataset.ndata['label']
     """
 
-    def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
+    def __init__(self, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
         super(FraudAmazonDataset, self).__init__(name='amazon',
                                                  raw_dir=raw_dir,
                                                  random_seed=random_seed,
......
@@ -24,12 +24,33 @@ def test_gin():
         assert len(ds) == n_graphs, (len(ds), name)
 
+@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
+def test_fraud():
+    g = data.FraudDataset('amazon')[0]
+    assert g.num_nodes() == 11944
+
+    g = data.FraudAmazonDataset()[0]
+    assert g.num_nodes() == 11944
+
+    g = data.FraudYelpDataset()[0]
+    assert g.num_nodes() == 45954
+
+
+@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
+def test_fakenews():
+    ds = data.FakeNewsDataset('politifact', 'bert')
+    assert len(ds) == 314
+
+    ds = data.FakeNewsDataset('gossipcop', 'profile')
+    assert len(ds) == 5464
+
+
 @unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
 def test_tudataset_regression():
     ds = data.TUDataset('ZINC_test', force_reload=True)
     assert len(ds) == 5000
 
 @unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
 def test_data_hash():
     class HashTestDataset(data.DGLDataset):
@@ -44,7 +65,11 @@ def test_data_hash():
     assert a.hash == b.hash
     assert a.hash != c.hash
 
+
 if __name__ == '__main__':
     test_minigc()
     test_gin()
     test_data_hash()
+    test_tudataset_regression()
+    test_fraud()
+    test_fakenews()