"tests/scripts/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "b8ed4d9ba348c9104cab7b75910b930cb48fc5ce"
Unverified Commit 22485e58 authored by Xiangkun Hu's avatar Xiangkun Hu Committed by GitHub
Browse files

[Dataset] BitcoinOTCDataset (#1910)

* PPIDataset

* Revert "PPIDataset"

This reverts commit 264bd0c960cfa698a7bb946dad132bf52c2d0c8a.

* bitcoinotc dataset

* Update bitcoinotc.py
parent d22f96fe
...@@ -13,7 +13,7 @@ from .tu import TUDataset, LegacyTUDataset ...@@ -13,7 +13,7 @@ from .tu import TUDataset, LegacyTUDataset
from .gnn_benckmark import AmazonCoBuy, CoraFull, Coauthor from .gnn_benckmark import AmazonCoBuy, CoraFull, Coauthor
from .karate import KarateClub, KarateClubDataset from .karate import KarateClub, KarateClubDataset
from .gindt import GINDataset from .gindt import GINDataset
from .bitcoinotc import BitcoinOTC from .bitcoinotc import BitcoinOTC, BitcoinOTCDataset
from .gdelt import GDELT from .gdelt import GDELT
from .icews18 import ICEWS18 from .icews18 import ICEWS18
from .qm7b import QM7b, QM7bDataset from .qm7b import QM7b, QM7bDataset
......
from scipy import io """ BitcoinOTC dataset for fraud detection """
import numpy as np import numpy as np
import os import os
import datetime import datetime
import gzip
import shutil
from .utils import get_download_dir, download, extract_archive from .dgl_dataset import DGLBuiltinDataset
from ..utils import retry_method_with_fix from .utils import download, makedirs, save_graphs, load_graphs, check_sha1
from .. import convert from ..convert import graph as dgl_graph
from .. import backend as F
class BitcoinOTC(object): class BitcoinOTCDataset(DGLBuiltinDataset):
""" r"""BitcoinOTC dataset for fraud detection
This is who-trusts-whom network of people who trade using Bitcoin
on a platform called Bitcoin OTC. This is who-trusts-whom network of people who trade using Bitcoin on
Since Bitcoin users are anonymous, there is a need to maintain a a platform called Bitcoin OTC. Since Bitcoin users are anonymous,
record of users' reputation to prevent transactions with fraudulent there is a need to maintain a record of users' reputation to prevent
and risky users. Members of Bitcoin OTC rate other members in a transactions with fraudulent and risky users.
scale of -10 (total distrust) to +10 (total trust) in steps of 1. Official website: https://snap.stanford.edu/data/soc-sign-bitcoin-otc.html
Reference: Bitcoin OTC dataset statistics:
- `Bitcoin OTC trust weighted signed network <http://snap.stanford.edu/data/soc-sign-bitcoin-otc.html>`_ Nodes: 5,881
- `EvolveGCN: Evolving Graph Edges: 35,592
Convolutional Networks for Dynamic Graphs Range of edge weight: -10 to +10
<https://arxiv.org/abs/1902.10191>`_ Percentage of positive edges: 89%
Parameters
----------
raw_dir : str
Directory where the raw input data is downloaded to, or where it already resides.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset.
Default: False
verbose: bool
Whether to print out progress information.
Default: False.
Attributes
----------
graphs : list
A list of DGLGraph objects
is_temporal : bool
Indicate whether the graphs are temporal graphs
Raises
------
UserWarning
If the raw data is changed in the remote server by the author.
Examples
--------
>>> dataset = BitcoinOTCDataset()
>>> len(dataset)
136
>>> for g in dataset:
...     # get edge feature
...     edge_weights = g.edata['h']
...     # your code here
>>>
""" """
_url = 'https://snap.stanford.edu/data/soc-sign-bitcoinotc.csv.gz' _url = 'https://snap.stanford.edu/data/soc-sign-bitcoinotc.csv.gz'
_sha1_str = 'c14281f9e252de0bd0b5f1c6e2bae03123938641'
def __init__(self, raw_dir=None, force_reload=False, verbose=False):
    """Initialise the dataset by delegating to the base-class constructor.

    Parameters
    ----------
    raw_dir : str, optional
        Forwarded unchanged as ``raw_dir``.
    force_reload : bool, optional
        Forwarded unchanged as ``force_reload``.
    verbose : bool, optional
        Forwarded unchanged as ``verbose``.
    """
    # The dataset name and download URL are fixed for this dataset.
    base_kwargs = dict(
        name='bitcoinotc',
        url=self._url,
        raw_dir=raw_dir,
        force_reload=force_reload,
        verbose=verbose,
    )
    super(BitcoinOTCDataset, self).__init__(**base_kwargs)
def __init__(self): def download(self):
self.dir = get_download_dir() gz_file_path = os.path.join(self.raw_dir, self.name + '.csv.gz')
self.zip_path = os.path.join( download(self.url, path=gz_file_path)
self.dir, 'bitcoin', "soc-sign-bitcoinotc.csv.gz") if not check_sha1(gz_file_path, self._sha1_str):
self.path = os.path.join( raise UserWarning('File {} is downloaded but the content hash does not match.'
self.dir, 'bitcoin', "soc-sign-bitcoinotc.csv") 'The repo may be outdated or download may be incomplete. '
self.graphs = [] 'Otherwise you can create an issue for it.'.format(self.name + '.csv.gz'))
self._load(self.path) self._extract_gz(gz_file_path, self.raw_path)
def _download_and_extract(self): def process(self):
download(self._url, path=self.zip_path) filename = os.path.join(self.save_path, self.name + '.csv')
extract_archive(self.zip_path, os.path.join(self.dir, 'bitcoin'))
@retry_method_with_fix(_download_and_extract)
def _load(self, filename):
data = np.loadtxt(filename, delimiter=',').astype(np.int64) data = np.loadtxt(filename, delimiter=',').astype(np.int64)
data[:, 0:2] = data[:, 0:2] - data[:, 0:2].min() data[:, 0:2] = data[:, 0:2] - data[:, 0:2].min()
num_nodes = data[:, 0:2].max() - data[:, 0:2].min() + 1
delta = datetime.timedelta(days=14).total_seconds() delta = datetime.timedelta(days=14).total_seconds()
# The source code is not released, but the paper indicates there're # The source code is not released, but the paper indicates there're
# totally 137 samples. The cutoff below has exactly 137 samples. # totally 137 samples. The cutoff below has exactly 137 samples.
time_index = np.around( time_index = np.around(
(data[:, 3] - data[:, 3].min())/delta).astype(np.int64) (data[:, 3] - data[:, 3].min()) / delta).astype(np.int64)
self._graphs = []
for i in range(time_index.max()): for i in range(time_index.max()):
row_mask = time_index <= i row_mask = time_index <= i
edges = data[row_mask][:, 0:2] edges = data[row_mask][:, 0:2]
rate = data[row_mask][:, 2] rate = data[row_mask][:, 2]
g = convert.graph((edges[:, 0], edges[:, 1])) g = dgl_graph((edges[:, 0], edges[:, 1]))
g.edata['h'] = rate.reshape(-1, 1) g.edata['h'] = F.tensor(rate.reshape(-1, 1), dtype=F.data_type_dict['int64'])
self.graphs.append(g) self._graphs.append(g)
def __getitem__(self, idx): def has_cache(self):
return self.graphs[idx] graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
return os.path.exists(graph_path)
def save(self):
    """Write the graph list to ``dgl_graph.bin`` inside ``self.save_path``."""
    save_graphs(os.path.join(self.save_path, 'dgl_graph.bin'), self.graphs)
def load(self):
    """Restore ``self._graphs`` from ``dgl_graph.bin`` inside ``self.save_path``."""
    cached = load_graphs(os.path.join(self.save_path, 'dgl_graph.bin'))
    # Only the first element of the loaded result is kept as the graph list.
    self._graphs = cached[0]
@property
def graphs(self):
    """Read-only access to the graph list held in ``self._graphs``."""
    return self._graphs
def __len__(self): def __len__(self):
r""" Number of graphs in the dataset """
return len(self.graphs) return len(self.graphs)
def __getitem__(self, item):
r""" Get graph by index
Parameters
----------
item : int
Item index
Returns
-------
dgl.DGLGraph
The graph contains the graph structure and edge weights
- edata['h'] : edge weights
"""
return self.graphs[item]
@property @property
def is_temporal(self): def is_temporal(self):
r""" Are the graphs temporal graphs
Returns
-------
bool
"""
return True return True
def _extract_gz(self, file, target_dir, overwrite=False):
if os.path.exists(target_dir) and not overwrite:
return
print('Extracting file to {}'.format(target_dir))
fname = os.path.basename(file)
makedirs(target_dir)
out_file_path = os.path.join(target_dir, fname[:-3])
with gzip.open(file, 'rb') as f_in:
with open(out_file_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
BitcoinOTC = BitcoinOTCDataset
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment