Unverified commit 55e7796a, authored by zhjwy9343, committed by GitHub

Revert "[Feature] add permission information and fix import problems (#2998)" (#3029)

This reverts commit cba5b188.

Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent 7d069d62
docs/source/api/python/dgl.data.rst

@@ -117,19 +117,6 @@ Symmetric Stochastic Block Model Mixture dataset
 .. autoclass:: SBMMixtureDataset
     :members: __getitem__, __len__, collate_fn
 
-.. _frauddata:
-
-Fraud dataset
-``````````````
-
-.. autoclass:: FraudDataset
-    :members: __getitem__, __len__
-
-.. autoclass:: FraudYelpDataset
-    :members: __getitem__, __len__
-
-.. autoclass:: FraudAmazonDataset
-    :members: __getitem__, __len__
-
 Edge Prediction Datasets
 ---------------------------------------
@@ -220,13 +207,6 @@ Graph isomorphism network dataset
 .. autoclass:: GINDataset
     :members: __getitem__, __len__
 
-.. _fakenewsdata:
-
-Fake news dataset
-```````````````````````````````````
-
-.. autoclass:: FakeNewsDataset
-    :members: __getitem__, __len__
-
 Utilities
 -----------------
python/dgl/data/__init__.py

@@ -27,8 +27,6 @@ from .dgl_dataset import DGLDataset, DGLBuiltinDataset
 from .citation_graph import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset
 from .knowledge_graph import FB15k237Dataset, FB15kDataset, WN18Dataset
 from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
-from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
-from .fakenews import FakeNewsDataset
 
 def register_data_args(parser):
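With these two re-exports gone, the dataset classes are reachable only through their submodules. A hedged sketch of the post-revert import paths (assuming the fraud.py and fakenews.py modules themselves are untouched, which the rest of this diff suggests):

```python
# After this revert, `from dgl.data import FraudYelpDataset` raises
# ImportError, because dgl/data/__init__.py no longer re-exports the class.
# The submodules still exist, so the direct paths should keep working:
from dgl.data.fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from dgl.data.fakenews import FakeNewsDataset
```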
python/dgl/data/fakenews.py

+import torch
 import os
 import numpy as np
+import pandas as pd
 import scipy.sparse as sp
 
 from .dgl_dataset import DGLBuiltinDataset
 from .utils import save_graphs, load_graphs, _get_dgl_url
 from .utils import save_info, load_info
 from ..convert import graph
-from .. import backend as F
 
 
 class FakeNewsDataset(DGLBuiltinDataset):
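The substance of the revert in the Python files is swapping DGL's backend-agnostic tensor API (`dgl.backend`, imported as `F` in the removed lines) back to direct PyTorch calls. A minimal sketch contrasting the two styles, assuming the PyTorch backend is active:

```python
import numpy as np
import torch
from dgl import backend as F    # the backend-agnostic API the removed lines used

arr = np.array([1, 2, 3])
t_any = F.tensor(arr)            # tensor type depends on the configured DGL backend
t_torch = torch.from_numpy(arr)  # restored style: explicitly a torch.Tensor
```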
@@ -29,9 +30,6 @@ class FakeNewsDataset(DGLBuiltinDataset):
         spacy: the 300-dimensional node feature composed of Twitter user
             historical tweets encoded by the spaCy word2vec encoder.
 
-    Note: this dataset is for academic use only, commercial use requires
-    the approval from Twitter.
-
     Statistics:
 
     Politifact:
@@ -88,7 +86,7 @@ class FakeNewsDataset(DGLBuiltinDataset):
         Graph labels
     feature_name : str
         Name of the feature (bert, content, profile, or spacy)
-    feature : Tensor
+    feature : scipy.sparse.csr.csr_matrix
         Node features
     train_mask : Tensor
         Mask of training set
@@ -124,13 +122,14 @@ class FakeNewsDataset(DGLBuiltinDataset):
     def process(self):
         """process raw data to graph, labels and masks"""
-        self.labels = F.tensor(np.load(os.path.join(self.raw_path, 'graph_labels.npy')))
+        self.labels = np.load(os.path.join(self.raw_path, 'graph_labels.npy'))
+        self.labels = torch.LongTensor(self.labels)
         num_graphs = self.labels.shape[0]
         node_graph_id = np.load(os.path.join(self.raw_path, 'node_graph_id.npy'))
 
-        edges = np.genfromtxt(os.path.join(self.raw_path, 'A.txt'), delimiter=',', dtype=int)
-        src = edges[:, 0]
-        dst = edges[:, 1]
+        edges = pd.read_csv(os.path.join(self.raw_path, 'A.txt'), header=None)
+        src = edges[0].to_numpy()
+        dst = edges[1].to_numpy()
         g = graph((src, dst))
 
         node_idx_list = []
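The restored loader parses A.txt with pandas instead of np.genfromtxt. A self-contained sketch of that path, using an in-memory stand-in for A.txt (the StringIO input is illustrative only):

```python
import dgl
import pandas as pd
from io import StringIO

# Stand-in for the downloaded A.txt: one "src,dst" pair per line.
edge_txt = StringIO("0,1\n1,2\n2,0\n")
edges = pd.read_csv(edge_txt, header=None)  # two unnamed columns: 0=src, 1=dst
src = edges[0].to_numpy()
dst = edges[1].to_numpy()
g = dgl.graph((src, dst))                   # same call as in process() above
print(g.num_nodes(), g.num_edges())         # 3 3
```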
@@ -143,18 +142,18 @@ class FakeNewsDataset(DGLBuiltinDataset):
         train_idx = np.load(os.path.join(self.raw_path, 'train_idx.npy'))
         val_idx = np.load(os.path.join(self.raw_path, 'val_idx.npy'))
         test_idx = np.load(os.path.join(self.raw_path, 'test_idx.npy'))
-        train_mask = np.zeros(num_graphs, dtype=np.bool)
-        val_mask = np.zeros(num_graphs, dtype=np.bool)
-        test_mask = np.zeros(num_graphs, dtype=np.bool)
+        train_mask = torch.zeros(num_graphs, dtype=torch.bool)
+        val_mask = torch.zeros(num_graphs, dtype=torch.bool)
+        test_mask = torch.zeros(num_graphs, dtype=torch.bool)
         train_mask[train_idx] = True
         val_mask[val_idx] = True
         test_mask[test_idx] = True
-        self.train_mask = F.tensor(train_mask)
-        self.val_mask = F.tensor(val_mask)
-        self.test_mask = F.tensor(test_mask)
+        self.train_mask = train_mask
+        self.val_mask = val_mask
+        self.test_mask = test_mask
 
         feature_file = 'new_' + self.feature_name + '_feature.npz'
-        self.feature = F.tensor(sp.load_npz(os.path.join(self.raw_path, feature_file)).todense())
+        self.feature = sp.load_npz(os.path.join(self.raw_path, feature_file))
 
     def save(self):
         """save the graph list and the labels"""
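Both mask blocks follow the same idiom: allocate a zeroed boolean tensor, then scatter True at the split indices. A tiny sketch of that pattern (the numbers are made up):

```python
import numpy as np
import torch

num_graphs = 5
train_idx = np.array([0, 1, 2])             # hypothetical split indices

train_mask = torch.zeros(num_graphs, dtype=torch.bool)
train_mask[train_idx] = True                # scatter True at the split indices
print(train_mask)  # tensor([ True,  True,  True, False, False])
```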
"""Fraud Dataset """Fraud Dataset
""" """
import torch
import os import os
from scipy import io from scipy import io
import numpy as np from sklearn.model_selection import train_test_split
from .utils import save_graphs, load_graphs, _get_dgl_url from .utils import save_graphs, load_graphs, _get_dgl_url
from ..convert import heterograph from ..convert import heterograph
from ..utils import graphdata2tensors from ..utils import graphdata2tensors
from .dgl_dataset import DGLBuiltinDataset from .dgl_dataset import DGLBuiltinDataset
from .. import backend as F
class FraudDataset(DGLBuiltinDataset): class FraudDataset(DGLBuiltinDataset):
@@ -37,6 +37,9 @@ class FraudDataset(DGLBuiltinDataset):
         downloaded data or the directory that
         already stores the input data.
         Default: ~/.dgl/
+    random_seed : int
+        Specifying the random seed in splitting the dataset.
+        Default: 2
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -51,6 +54,8 @@ class FraudDataset(DGLBuiltinDataset):
         Number of label classes
     graph : dgl.heterograph.DGLHeteroGraph
         Graph structure, etc.
+    seed : int
+        Random seed in splitting the dataset.
     train_size : float
         Training set size of the dataset.
     val_size : float
@@ -81,9 +86,10 @@ class FraudDataset(DGLBuiltinDataset):
         'amazon': 'review'
     }
 
-    def __init__(self, name, raw_dir=None, train_size=0.7, val_size=0.1):
+    def __init__(self, name, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
         assert name in ['yelp', 'amazon'], "only supports 'yelp', or 'amazon'"
         url = _get_dgl_url(self.file_urls[name])
+        self.seed = random_seed
         self.train_size = train_size
         self.val_size = val_size
         super(FraudDataset, self).__init__(name=name,
@@ -95,8 +101,9 @@ class FraudDataset(DGLBuiltinDataset):
         file_path = os.path.join(self.raw_path, self.file_names[self.name])
         data = io.loadmat(file_path)
 
-        node_features = data['features'].todense()
-        node_labels = data['label']
+        node_features = torch.from_numpy(data['features'].todense())
+        node_labels = torch.from_numpy(data['label'])
+        node_labels = node_labels.transpose(0, 1)
 
         graph_data = {}
         for relation in self.relations[self.name]:
@@ -104,11 +111,11 @@ class FraudDataset(DGLBuiltinDataset):
             graph_data[(self.node_name[self.name], relation, self.node_name[self.name])] = (u, v)
         g = heterograph(graph_data)
 
-        g.ndata['feature'] = F.tensor(node_features)
-        g.ndata['label'] = F.tensor(node_labels.T)
+        g.ndata['feature'] = node_features
+        g.ndata['label'] = node_labels
         self.graph = g
 
-        self._random_split(g.ndata['feature'], g.ndata['label'], self.train_size, self.val_size)
+        self._random_split(g.ndata['feature'], g.ndata['label'], self.seed, self.train_size, self.val_size)
 
     def __getitem__(self, idx):
         r""" Get graph object
@@ -163,32 +170,42 @@ class FraudDataset(DGLBuiltinDataset):
         graph_path = os.path.join(self.save_path, self.name + '_dgl_graph.bin')
         return os.path.exists(graph_path)
 
-    def _random_split(self, x, train_size=0.7, val_size=0.1):
+    def _random_split(self, x, node_labels, seed=2, train_size=0.7, val_size=0.1):
         """split the dataset into training set, validation set and testing set"""
-        assert 0 <= train_size + val_size <= 1, \
-            "The sum of valid training set size and validation set size " \
-            "must between 0 and 1 (inclusive)."
-
         N = x.shape[0]
         index = list(range(N))
+        train_idx, test_idx, _, y = train_test_split(index,
+                                                     node_labels,
+                                                     stratify=node_labels,
+                                                     train_size=train_size,
+                                                     random_state=seed,
+                                                     shuffle=True)
         if self.name == 'amazon':
             # 0-3304 are unlabeled nodes
             index = list(range(3305, N))
-        np.random.permutation(index)
-        train_idx = index[:int(train_size * N)]
-        val_idx = index[int(N - val_size * N):]
-        test_idx = index[int(train_size * N):int(N - val_size * N)]
-        train_mask = np.zeros(N, dtype=np.bool)
-        val_mask = np.zeros(N, dtype=np.bool)
-        test_mask = np.zeros(N, dtype=np.bool)
+            train_idx, test_idx, _, y = train_test_split(index,
+                                                         node_labels[3305:],
+                                                         stratify=node_labels[3305:],
+                                                         test_size=train_size,
+                                                         random_state=seed,
+                                                         shuffle=True)
+        val_idx, test_idx, _, _ = train_test_split(test_idx,
+                                                   y,
+                                                   stratify=y,
+                                                   train_size=val_size / (1 - train_size),
+                                                   random_state=seed,
+                                                   shuffle=True)
+        train_mask = torch.zeros(N, dtype=torch.bool)
+        val_mask = torch.zeros(N, dtype=torch.bool)
+        test_mask = torch.zeros(N, dtype=torch.bool)
         train_mask[train_idx] = True
         val_mask[val_idx] = True
         test_mask[test_idx] = True
-        self.graph.ndata['train_mask'] = F.tensor(train_mask)
-        self.graph.ndata['val_mask'] = F.tensor(val_mask)
-        self.graph.ndata['test_mask'] = F.tensor(test_mask)
+        self.graph.ndata['train_mask'] = train_mask
+        self.graph.ndata['val_mask'] = val_mask
+        self.graph.ndata['test_mask'] = test_mask
 
 
 class FraudYelpDataset(FraudDataset):
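The restored `_random_split` is a two-stage stratified split: carve out the training set first, then divide the remainder so that val_size stays a fraction of the whole. A standalone sketch of the arithmetic with synthetic labels (sizes and labels are made up):

```python
import numpy as np
from sklearn.model_selection import train_test_split

N = 1000
labels = np.random.randint(0, 2, N)          # synthetic binary node labels
index = list(range(N))
train_size, val_size, seed = 0.7, 0.1, 2

# Stage 1: carve out the training set, stratified on the labels.
train_idx, rest_idx, _, rest_y = train_test_split(
    index, labels, stratify=labels,
    train_size=train_size, random_state=seed, shuffle=True)

# Stage 2: split the remainder so val_size stays a fraction of the whole:
# 0.1 / (1 - 0.7) of the remaining 30% of nodes is 10% of all nodes.
val_idx, test_idx, _, _ = train_test_split(
    rest_idx, rest_y, stratify=rest_y,
    train_size=val_size / (1 - train_size),
    random_state=seed, shuffle=True)

print(len(train_idx), len(val_idx), len(test_idx))  # roughly 700 / 100 / 200
```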
@@ -224,6 +241,10 @@ class FraudYelpDataset(FraudDataset):
         downloaded data or the directory that
         already stores the input data.
         Default: ~/.dgl/
+    random_seed : int
+        Specifying the random seed in splitting the
+        dataset.
+        Default: 2
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -241,9 +262,10 @@ class FraudYelpDataset(FraudDataset):
     >>> label = dataset.ndata['label']
     """
 
-    def __init__(self, raw_dir=None, train_size=0.7, val_size=0.1):
+    def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
         super(FraudYelpDataset, self).__init__(name='yelp',
                                                raw_dir=raw_dir,
+                                               random_seed=random_seed,
                                                train_size=train_size,
                                                val_size=val_size)
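A hedged usage sketch of the restored class (the raw data is downloaded on first use; note that after this revert the class is no longer re-exported from `dgl.data`, so the submodule path is used):

```python
from dgl.data.fraud import FraudYelpDataset

dataset = FraudYelpDataset(random_seed=2, train_size=0.7, val_size=0.1)
graph = dataset[0]                       # the single heterograph
feat = graph.ndata['feature']
label = graph.ndata['label']
train_mask = graph.ndata['train_mask']   # boolean masks set by _random_split
```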
@@ -285,6 +307,10 @@ class FraudAmazonDataset(FraudDataset):
         downloaded data or the directory that
         already stores the input data.
         Default: ~/.dgl/
+    random_seed : int
+        Specifying the random seed in splitting the
+        dataset.
+        Default: 2
     train_size : float
         training set size of the dataset.
         Default: 0.7
@@ -302,8 +328,9 @@ class FraudAmazonDataset(FraudDataset):
     >>> label = dataset.ndata['label']
     """
 
-    def __init__(self, raw_dir=None, train_size=0.7, val_size=0.1):
+    def __init__(self, raw_dir=None, random_seed=2, train_size=0.7, val_size=0.1):
         super(FraudAmazonDataset, self).__init__(name='amazon',
                                                  raw_dir=raw_dir,
+                                                 random_seed=random_seed,
                                                  train_size=train_size,
                                                  val_size=val_size)