"docs/vscode:/vscode.git/clone" did not exist on "01a80807de9727fe9ccb1b35d1ea447647738111"
Unverified Commit 9706eaa8 authored by Kay Liu, committed by GitHub

[Feature] add permission information and fix import problems (#3036)



* [Feature] add positive negative statistics

* [Feature] add permission information and fix import problem

* fix backend incompatibility problem

* modify random split to remove sklearn usage

* modify file read to remove pandas usage

* add datasets into doc

* add random seed in data splitting

* add dataset unit test

* usage permission information update
Co-authored-by: zhjwy9343 <6593865@qq.com>
parent e57c6e35
@@ -117,6 +117,19 @@ Symmetric Stochastic Block Model Mixture dataset
.. autoclass:: SBMMixtureDataset
:members: __getitem__, __len__, collate_fn
.. _frauddata:
Fraud dataset
``````````````
.. autoclass:: FraudDataset
:members: __getitem__, __len__
.. autoclass:: FraudYelpDataset
:members: __getitem__, __len__
.. autoclass:: FraudAmazonDataset
:members: __getitem__, __len__
Edge Prediction Datasets
---------------------------------------
@@ -207,6 +220,13 @@ Graph isomorphism network dataset
.. autoclass:: GINDataset
:members: __getitem__, __len__
.. _fakenewsdata:
Fake news dataset
```````````````````````````````````
.. autoclass:: FakeNewsDataset
:members: __getitem__, __len__
Utilities
-----------------
......
@@ -27,6 +27,8 @@ from .dgl_dataset import DGLDataset, DGLBuiltinDataset
from .citation_graph import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset
from .knowledge_graph import FB15k237Dataset, FB15kDataset, WN18Dataset
from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
def register_data_args(parser):
......
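For context, here is a minimal usage sketch (not part of the commit) of the two dataset families exported by the new imports above. It assumes the raw data is downloaded automatically into the default raw_dir on first use, and it mirrors the docstring examples and unit tests further down in this diff.

# Sketch only: load the newly exported datasets through dgl.data.
from dgl.data import FraudYelpDataset, FakeNewsDataset

fraud = FraudYelpDataset()            # heterogeneous Yelp review graph
g = fraud[0]                          # __getitem__ returns the single graph
feat = g.ndata['feature']             # dense node features
label = g.ndata['label']              # node labels
train_mask = g.ndata['train_mask']    # boolean masks written by _random_split

news = FakeNewsDataset('politifact', 'bert')   # academic use only, see the note below
print(len(news))                               # 314 graphs, per the unit test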
import torch
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
from .dgl_dataset import DGLBuiltinDataset
from .utils import save_graphs, load_graphs, _get_dgl_url
from .utils import save_info, load_info
from ..convert import graph
from .. import backend as F
class FakeNewsDataset(DGLBuiltinDataset):
@@ -30,6 +29,8 @@ class FakeNewsDataset(DGLBuiltinDataset):
spacy: the 300-dimensional node feature composed of Twitter user
historical tweets encoded by the spaCy word2vec encoder.
Note: this dataset is for academic use only, and commercial use is prohibited.
Statistics:
Politifact:
@@ -86,7 +87,7 @@ class FakeNewsDataset(DGLBuiltinDataset):
Graph labels
feature_name : str
Name of the feature (bert, content, profile, or spacy)
feature : scipy.sparse.csr.csr_matrix
feature : Tensor
Node features
train_mask : Tensor
Mask of training set
@@ -122,14 +123,13 @@ class FakeNewsDataset(DGLBuiltinDataset):
def process(self):
"""process raw data to graph, labels and masks"""
self.labels = np.load(os.path.join(self.raw_path, 'graph_labels.npy'))
self.labels = torch.LongTensor(self.labels)
self.labels = F.tensor(np.load(os.path.join(self.raw_path, 'graph_labels.npy')))
num_graphs = self.labels.shape[0]
node_graph_id = np.load(os.path.join(self.raw_path, 'node_graph_id.npy'))
edges = pd.read_csv(os.path.join(self.raw_path, 'A.txt'), header=None)
src = edges[0].to_numpy()
dst = edges[1].to_numpy()
edges = np.genfromtxt(os.path.join(self.raw_path, 'A.txt'), delimiter=',', dtype=int)
src = edges[:, 0]
dst = edges[:, 1]
g = graph((src, dst))
node_idx_list = []
@@ -142,18 +142,18 @@ class FakeNewsDataset(DGLBuiltinDataset):
train_idx = np.load(os.path.join(self.raw_path, 'train_idx.npy'))
val_idx = np.load(os.path.join(self.raw_path, 'val_idx.npy'))
test_idx = np.load(os.path.join(self.raw_path, 'test_idx.npy'))
train_mask = torch.zeros(num_graphs, dtype=torch.bool)
val_mask = torch.zeros(num_graphs, dtype=torch.bool)
test_mask = torch.zeros(num_graphs, dtype=torch.bool)
train_mask = np.zeros(num_graphs, dtype=np.bool)
val_mask = np.zeros(num_graphs, dtype=np.bool)
test_mask = np.zeros(num_graphs, dtype=np.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True
self.train_mask = train_mask
self.val_mask = val_mask
self.test_mask = test_mask
self.train_mask = F.tensor(train_mask)
self.val_mask = F.tensor(val_mask)
self.test_mask = F.tensor(test_mask)
feature_file = 'new_' + self.feature_name + '_feature.npz'
self.feature = sp.load_npz(os.path.join(self.raw_path, feature_file))
self.feature = F.tensor(sp.load_npz(os.path.join(self.raw_path, feature_file)).todense())
def save(self):
"""save the graph list and the labels"""
......
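The process() hunks above remove the pandas dependency by reading the comma-separated edge list A.txt with NumPy, and convert arrays with the backend-agnostic F.tensor instead of torch. A small, self-contained sketch of the edge-list swap, using a hypothetical edges.csv in place of the dataset's A.txt:

# Sketch only: np.genfromtxt reproduces the removed pandas logic.
import numpy as np

with open('edges.csv', 'w') as f:       # tiny stand-in for A.txt ("src,dst" per line)
    f.write('0,1\n1,2\n2,0\n')

edges = np.genfromtxt('edges.csv', delimiter=',', dtype=int)
src, dst = edges[:, 0], edges[:, 1]
print(src, dst)                          # [0 1 2] [1 2 0]

# Equivalent to the removed pandas version:
#   edges = pd.read_csv('edges.csv', header=None)
#   src, dst = edges[0].to_numpy(), edges[1].to_numpy()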
"""Fraud Dataset
"""
import torch
import os
from scipy import io
from sklearn.model_selection import train_test_split
import numpy as np
from .utils import save_graphs, load_graphs, _get_dgl_url
from ..convert import heterograph
from ..utils import graphdata2tensors
from .dgl_dataset import DGLBuiltinDataset
from .. import backend as F
class FraudDataset(DGLBuiltinDataset):
@@ -39,7 +39,7 @@ class FraudDataset(DGLBuiltinDataset):
Default: ~/.dgl/
random_seed : int
Specifying the random seed in splitting the dataset.
Default: 2
Default: 717
train_size : float
training set size of the dataset.
Default: 0.7
@@ -86,7 +87,7 @@ class FraudDataset(DGLBuiltinDataset):
'amazon': 'review'
}
def __init__(self, name, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
def __init__(self, name, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
assert name in ['yelp', 'amazon'], "only supports 'yelp', or 'amazon'"
url = _get_dgl_url(self.file_urls[name])
self.seed = random_seed
@@ -101,9 +101,8 @@ class FraudDataset(DGLBuiltinDataset):
file_path = os.path.join(self.raw_path, self.file_names[self.name])
data = io.loadmat(file_path)
node_features = torch.from_numpy(data['features'].todense())
node_labels = torch.from_numpy(data['label'])
node_labels = node_labels.transpose(0, 1)
node_features = data['features'].todense()
node_labels = data['label']
graph_data = {}
for relation in self.relations[self.name]:
@@ -111,11 +110,11 @@ class FraudDataset(DGLBuiltinDataset):
graph_data[(self.node_name[self.name], relation, self.node_name[self.name])] = (u, v)
g = heterograph(graph_data)
g.ndata['feature'] = node_features
g.ndata['label'] = node_labels
g.ndata['feature'] = F.tensor(node_features)
g.ndata['label'] = F.tensor(node_labels.T)
self.graph = g
self._random_split(g.ndata['feature'], g.ndata['label'], self.seed, self.train_size, self.val_size)
self._random_split(g.ndata['feature'], self.seed, self.train_size, self.val_size)
def __getitem__(self, idx):
r""" Get graph object
@@ -170,42 +169,32 @@ class FraudDataset(DGLBuiltinDataset):
graph_path = os.path.join(self.save_path, self.name + '_dgl_graph.bin')
return os.path.exists(graph_path)
def _random_split(self, x, node_labels, seed=2, train_size=0.7, val_size=0.1):
def _random_split(self, x, seed=717, train_size=0.7, val_size=0.1):
"""split the dataset into training set, validation set and testing set"""
assert 0 <= train_size + val_size <= 1, \
"The sum of training set size and validation set size " \
"must be between 0 and 1 (inclusive)."
N = x.shape[0]
index = list(range(N))
train_idx, test_idx, _, y = train_test_split(index,
node_labels,
stratify=node_labels,
train_size=train_size,
random_state=seed,
shuffle=True)
if self.name == 'amazon':
# 0-3304 are unlabeled nodes
index = list(range(3305, N))
train_idx, test_idx, _, y = train_test_split(index,
node_labels[3305:],
stratify=node_labels[3305:],
test_size=train_size,
random_state=seed,
shuffle=True)
val_idx, test_idx, _, _ = train_test_split(test_idx,
y,
stratify=y,
train_size=val_size / (1 - train_size),
random_state=seed,
shuffle=True)
train_mask = torch.zeros(N, dtype=torch.bool)
val_mask = torch.zeros(N, dtype=torch.bool)
test_mask = torch.zeros(N, dtype=torch.bool)
index = np.random.RandomState(seed).permutation(index)
train_idx = index[:int(train_size * N)]
val_idx = index[int(N - val_size * N):]
test_idx = index[int(train_size * N):int(N - val_size * N)]
train_mask = np.zeros(N, dtype=np.bool)
val_mask = np.zeros(N, dtype=np.bool)
test_mask = np.zeros(N, dtype=np.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True
self.graph.ndata['train_mask'] = train_mask
self.graph.ndata['val_mask'] = val_mask
self.graph.ndata['test_mask'] = test_mask
self.graph.ndata['train_mask'] = F.tensor(train_mask)
self.graph.ndata['val_mask'] = F.tensor(val_mask)
self.graph.ndata['test_mask'] = F.tensor(test_mask)
class FraudYelpDataset(FraudDataset):
@@ -242,9 +231,8 @@ class FraudYelpDataset(FraudDataset):
already stores the input data.
Default: ~/.dgl/
random_seed : int
Specifying the random seed in splitting the
dataset.
Default: 2
Specifying the random seed in splitting the dataset.
Default: 717
train_size : float
training set size of the dataset.
Default: 0.7
@@ -262,7 +250,7 @@ class FraudYelpDataset(FraudDataset):
>>> label = dataset.ndata['label']
"""
def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
def __init__(self, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
super(FraudYelpDataset, self).__init__(name='yelp',
raw_dir=raw_dir,
random_seed=random_seed,
@@ -308,9 +296,8 @@ class FraudAmazonDataset(FraudDataset):
already stores the input data.
Default: ~/.dgl/
random_seed : int
Specifying the random seed in splitting the
dataset.
Default: 2
Specifying the random seed in splitting the dataset.
Default: 717
train_size : float
training set size of the dataset.
Default: 0.7
@@ -328,7 +315,7 @@ class FraudAmazonDataset(FraudDataset):
>>> label = dataset.ndata['label']
"""
def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
def __init__(self, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
super(FraudAmazonDataset, self).__init__(name='amazon',
raw_dir=raw_dir,
random_seed=random_seed,
......
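The rewritten _random_split above replaces sklearn.model_selection.train_test_split with a seeded NumPy permutation; note that the stratified sampling of the old version is not preserved. A simplified, self-contained sketch of the idea (it keeps the permutation's return value, and omits the Amazon-specific skipping of the unlabeled nodes 0-3304):

# Sketch only: seeded train/val/test split without sklearn.
import numpy as np

def random_split(num_nodes, seed=717, train_size=0.7, val_size=0.1):
    index = np.random.RandomState(seed).permutation(num_nodes)   # keep the result
    n_train = int(train_size * num_nodes)
    n_val = int(val_size * num_nodes)
    train_idx = index[:n_train]
    val_idx = index[n_train:n_train + n_val]
    test_idx = index[n_train + n_val:]

    masks = {}
    for split, idx in (('train', train_idx), ('val', val_idx), ('test', test_idx)):
        mask = np.zeros(num_nodes, dtype=bool)
        mask[idx] = True
        masks[split] = mask
    return masks

masks = random_split(1000)
print(masks['train'].sum(), masks['val'].sum(), masks['test'].sum())   # 700 100 200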
@@ -24,12 +24,33 @@ def test_gin():
assert len(ds) == n_graphs, (len(ds), name)
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_fraud():
g = data.FraudDataset('amazon')[0]
assert g.num_nodes() == 11944
g = data.FraudAmazonDataset()[0]
assert g.num_nodes() == 11944
g = data.FraudYelpDataset()[0]
assert g.num_nodes() == 45954
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_fakenews():
ds = data.FakeNewsDataset('politifact', 'bert')
assert len(ds) == 314
ds = data.FakeNewsDataset('gossipcop', 'profile')
assert len(ds) == 5464
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_tudataset_regression():
ds = data.TUDataset('ZINC_test', force_reload=True)
assert len(ds) == 5000
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_data_hash():
class HashTestDataset(data.DGLDataset):
@@ -44,7 +65,11 @@ def test_data_hash():
assert a.hash == b.hash
assert a.hash != c.hash
if __name__ == '__main__':
test_minigc()
test_gin()
test_data_hash()
test_tudataset_regression()
test_fraud()
test_fakenews()