dataset.py

import os
import pandas
import numpy
import torch
import dgl


def process_raw_data(raw_dir, processed_dir):
    r"""

    Description
    -----------
    Preprocess Elliptic dataset like the EvolveGCN official instruction:
    github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    The main purpose is to convert original idx to contiguous idx start at 0.
    """
    oid_nid_path = os.path.join(processed_dir, 'oid_nid.npy')
    id_label_path = os.path.join(processed_dir, 'id_label.npy')
    id_time_features_path = os.path.join(processed_dir, 'id_time_features.npy')
    src_dst_time_path = os.path.join(processed_dir, 'src_dst_time.npy')
    if os.path.exists(oid_nid_path) and os.path.exists(id_label_path) and \
            os.path.exists(id_time_features_path) and os.path.exists(src_dst_time_path):
        print("The preprocessed data already exists, skip the preprocess stage!")
        return
    print("starting process raw data in {}".format(raw_dir))
    id_label = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_classes.csv'))
    src_dst = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_edgelist.csv'))
    # elliptic_txs_features.csv has no header, and it has the same order idx with elliptic_txs_classes.csv
    id_time_features = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_features.csv'), header=None)

    # get oldId_newId
    oid_nid = id_label.loc[:, ['txId']]
    oid_nid = oid_nid.rename(columns={'txId': 'originalId'})
    oid_nid.insert(1, 'newId', range(len(oid_nid)))

    # map classes unknown,1,2 to -1,1,0 and construct id_label. type 1 means illicit.
    id_label = pandas.concat(
        [oid_nid['newId'], id_label['class'].map({'unknown': -1.0, '1': 1.0, '2': 0.0})], axis=1)

    # replace originalId to newId.
    # Attention: the timestamp in features start at 1.
    id_time_features[0] = oid_nid['newId']

    # construct originalId2newId dict
    oid_nid_dict = oid_nid.set_index(['originalId'])['newId'].to_dict()
    # construct newId2timestamp dict
    nid_time_dict = id_time_features.set_index([0])[1].to_dict()

    # Map id in edgelist to newId, and add a timestamp to each edge.
    # Attention: From the EvolveGCN official instruction, the timestamp with edgelist start at 0, rather than 1.
    # see: github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    # Here we dose not follow the official instruction, which means timestamp with edgelist also start at 1.
    # In EvolveGCN example, the edge timestamp will not be used.
    #
    # Note: in the dataset, src and dst node has the same timestamp, so it's easy to set edge's timestamp.
    new_src = src_dst['txId1'].map(oid_nid_dict).rename('newSrc')
    new_dst = src_dst['txId2'].map(oid_nid_dict).rename('newDst')
    edge_time = new_src.map(nid_time_dict).rename('timestamp')
    src_dst_time = pandas.concat([new_src, new_dst, edge_time], axis=1)

    # save oid_nid, id_label, id_time_features, src_dst_time to disk. we can convert them to numpy.
    # oid_nid: type int.  id_label: type int.  id_time_features: type float.  src_dst_time: type int.
    oid_nid = oid_nid.to_numpy(dtype=int)
    id_label = id_label.to_numpy(dtype=int)
    id_time_features = id_time_features.to_numpy(dtype=float)
    src_dst_time = src_dst_time.to_numpy(dtype=int)

    numpy.save(oid_nid_path, oid_nid)
    numpy.save(id_label_path, id_label)
    numpy.save(id_time_features_path, id_time_features)
    numpy.save(src_dst_time_path, src_dst_time)
    print("Process Elliptic raw data done, data has saved into {}".format(processed_dir))


class EllipticDataset:
    def __init__(self, raw_dir, processed_dir, self_loop=True, reverse_edge=True):
        self.raw_dir = raw_dir
        self.processd_dir = processed_dir
        self.self_loop = self_loop
        self.reverse_edge = reverse_edge

    def process(self):
        process_raw_data(self.raw_dir, self.processd_dir)
        id_time_features = torch.Tensor(numpy.load(os.path.join(self.processd_dir, 'id_time_features.npy')))
        id_label = torch.IntTensor(numpy.load(os.path.join(self.processd_dir, 'id_label.npy')))
        src_dst_time = torch.IntTensor(numpy.load(os.path.join(self.processd_dir, 'src_dst_time.npy')))

        src = src_dst_time[:, 0]
        dst = src_dst_time[:, 1]
        # id_label[:, 0] is used to add self loop
        if self.self_loop:
            if self.reverse_edge:
                g = dgl.graph(data=(torch.cat((src, dst, id_label[:, 0])), torch.cat((dst, src, id_label[:, 0]))),
                              num_nodes=id_label.shape[0])
                g.edata['timestamp'] = torch.cat((src_dst_time[:, 2], src_dst_time[:, 2], id_time_features[:, 1].int()))
            else:
                g = dgl.graph(data=(torch.cat((src, id_label[:, 0])), torch.cat((dst, id_label[:, 0]))),
                              num_nodes=id_label.shape[0])
                g.edata['timestamp'] = torch.cat((src_dst_time[:, 2], id_time_features[:, 1].int()))
        else:
            if self.reverse_edge:
                g = dgl.graph(data=(torch.cat((src, dst)), torch.cat((dst, src))),
                              num_nodes=id_label.shape[0])
                g.edata['timestamp'] = torch.cat((src_dst_time[:, 2], src_dst_time[:, 2]))
            else:
                g = dgl.graph(data=(src, dst),
                              num_nodes=id_label.shape[0])
                g.edata['timestamp'] = src_dst_time[:, 2]

        time_features = id_time_features[:, 1:]
        label = id_label[:, 1]
        g.ndata['label'] = label
        g.ndata['feat'] = time_features

        # used to construct time-based sub-graph.
        node_mask_by_time = []
        start_time = int(torch.min(id_time_features[:, 1]))
        end_time = int(torch.max(id_time_features[:, 1]))
        for i in range(start_time, end_time + 1):
            node_mask = id_time_features[:, 1] == i
            node_mask_by_time.append(node_mask)

        return g, node_mask_by_time

    @property
    def num_classes(self):
        r"""Number of classes for each node."""
        return 2