dataset.py 5.95 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import pandas
import numpy
import torch
import dgl


def process_raw_data(raw_dir, processed_dir):
    r"""

    Description
    -----------
    Preprocess Elliptic dataset like the EvolveGCN official instruction:
    github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    The main purpose is to convert the original ids to contiguous ids starting at 0.

    Four arrays are written to ``processed_dir``:

    - ``oid_nid.npy``: int, columns (originalId, newId).
    - ``id_label.npy``: int, columns (newId, label); classes
      unknown/1/2 are stored as -1/1/0, where 1 means illicit.
    - ``id_time_features.npy``: float, columns (newId, timestamp, features...),
      one row per node, ordered by newId.
    - ``src_dst_time.npy``: int, columns (newSrc, newDst, timestamp).

    If all four files already exist, nothing is recomputed.

    Parameters
    ----------
    raw_dir : str
        Directory holding ``elliptic_txs_classes.csv``,
        ``elliptic_txs_edgelist.csv`` and ``elliptic_txs_features.csv``.
    processed_dir : str
        Directory the ``.npy`` files are written to. It is created when missing.

    Raises
    ------
    ValueError
        If the features file contains a txId that is absent from the classes file.
    """
    oid_nid_path = os.path.join(processed_dir, 'oid_nid.npy')
    id_label_path = os.path.join(processed_dir, 'id_label.npy')
    id_time_features_path = os.path.join(processed_dir, 'id_time_features.npy')
    src_dst_time_path = os.path.join(processed_dir, 'src_dst_time.npy')
    output_paths = (oid_nid_path, id_label_path, id_time_features_path, src_dst_time_path)
    if all(os.path.exists(path) for path in output_paths):
        print("The preprocessed data already exists, skip the preprocess stage!")
        return
    print("starting process raw data in {}".format(raw_dir))
    id_label = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_classes.csv'))
    src_dst = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_edgelist.csv'))
    # elliptic_txs_features.csv has no header; column 0 is the txId and column 1 the timestamp.
    id_time_features = pandas.read_csv(os.path.join(raw_dir, 'elliptic_txs_features.csv'), header=None)

    # get oldId_newId: the new id of a transaction is its row position in the classes file.
    oid_nid = id_label.loc[:, ['txId']]
    oid_nid = oid_nid.rename(columns={'txId': 'originalId'})
    oid_nid.insert(1, 'newId', range(len(oid_nid)))

    # map classes unknown,1,2 to -1,1,0 and construct id_label. type 1 means illicit.
    id_label = pandas.concat(
        [oid_nid['newId'], id_label['class'].map({'unknown': -1.0, '1': 1.0, '2': 0.0})], axis=1)

    # construct originalId2newId dict
    oid_nid_dict = oid_nid.set_index(['originalId'])['newId'].to_dict()

    # replace originalId with newId.
    # The lookup goes through txId instead of relying on the features file having
    # the same row order as the classes file, so a reordered file cannot silently
    # attach features to the wrong node.
    # Attention: the timestamp in features starts at 1.
    feature_new_ids = id_time_features[0].map(oid_nid_dict)
    if feature_new_ids.isna().any():
        raise ValueError(
            "elliptic_txs_features.csv contains txId values missing from elliptic_txs_classes.csv")
    id_time_features[0] = feature_new_ids.astype(int)
    # Keep row i describing node i, which is what positional consumers expect.
    id_time_features = id_time_features.sort_values(by=0).reset_index(drop=True)

    # construct newId2timestamp dict
    nid_time_dict = id_time_features.set_index([0])[1].to_dict()

    # Map id in edgelist to newId, and add a timestamp to each edge.
    # Attention: From the EvolveGCN official instruction, the timestamp with edgelist start at 0, rather than 1.
    # see: github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    # Here we do not follow the official instruction, which means timestamp with edgelist also start at 1.
    # In EvolveGCN example, the edge timestamp will not be used.
    #
    # Note: in the dataset, src and dst node has the same timestamp, so it's easy to set edge's timestamp.
    new_src = src_dst['txId1'].map(oid_nid_dict).rename('newSrc')
    new_dst = src_dst['txId2'].map(oid_nid_dict).rename('newDst')
    edge_time = new_src.map(nid_time_dict).rename('timestamp')
    src_dst_time = pandas.concat([new_src, new_dst, edge_time], axis=1)

    # save oid_nid, id_label, id_time_features, src_dst_time to disk. we can convert them to numpy.
    # oid_nid: type int.  id_label: type int.  id_time_features: type float.  src_dst_time: type int.
    oid_nid = oid_nid.to_numpy(dtype=int)
    id_label = id_label.to_numpy(dtype=int)
    id_time_features = id_time_features.to_numpy(dtype=float)
    src_dst_time = src_dst_time.to_numpy(dtype=int)

    # numpy.save does not create missing directories; an empty processed_dir means
    # the current working directory, which needs no creation.
    if processed_dir:
        os.makedirs(processed_dir, exist_ok=True)

    numpy.save(oid_nid_path, oid_nid)
    numpy.save(id_label_path, id_label)
    numpy.save(id_time_features_path, id_time_features)
    numpy.save(src_dst_time_path, src_dst_time)
    print("Process Elliptic raw data done, data has saved into {}".format(processed_dir))


class EllipticDataset:
    r"""

    Description
    -----------
    Builds a DGL graph from the preprocessed Elliptic ``.npy`` files.

    Parameters
    ----------
    raw_dir : str
        Directory passed to ``process_raw_data`` as the raw data location.
    processed_dir : str
        Directory the preprocessed ``.npy`` files are read from.
    self_loop : bool
        When true, one edge from every node to itself is appended.
    reverse_edge : bool
        When true, every edge is appended a second time with src and dst swapped.
    """

    def __init__(self, raw_dir, processed_dir, self_loop=True, reverse_edge=True):
        self.raw_dir = raw_dir
        # The attribute spelling is kept as-is because it is part of the public surface.
        self.processd_dir = processed_dir
        self.self_loop = self_loop
        self.reverse_edge = reverse_edge

    def process(self):
        r"""

        Description
        -----------
        Run the preprocessing step, load its output and assemble the graph.

        Returns
        -------
        tuple
            ``(g, node_mask_by_time)``. ``g`` carries ``ndata['label']``,
            ``ndata['feat']`` (timestamp followed by the features) and
            ``edata['timestamp']``. ``node_mask_by_time`` holds one boolean
            node mask per integer timestamp between the smallest and the
            largest timestamp, inclusive.
        """
        process_raw_data(self.raw_dir, self.processd_dir)

        def _load(file_name):
            return numpy.load(os.path.join(self.processd_dir, file_name))

        features_table = torch.Tensor(_load('id_time_features.npy'))
        label_table = torch.IntTensor(_load('id_label.npy'))
        edge_table = torch.IntTensor(_load('src_dst_time.npy'))

        heads, tails, stamps = edge_table[:, 0], edge_table[:, 1], edge_table[:, 2]
        node_times = features_table[:, 1]

        # Edge segments are gathered in order: original edges, reversed edges, self loops.
        src_parts, dst_parts, time_parts = [heads], [tails], [stamps]
        if self.reverse_edge:
            src_parts.append(tails)
            dst_parts.append(heads)
            time_parts.append(stamps)
        if self.self_loop:
            # The first label column enumerates the node ids, so it serves as both
            # endpoints of the self loops; a loop takes its node's timestamp.
            node_ids = label_table[:, 0]
            src_parts.append(node_ids)
            dst_parts.append(node_ids)
            time_parts.append(node_times.int())

        g = dgl.graph(data=(torch.cat(src_parts), torch.cat(dst_parts)),
                      num_nodes=label_table.shape[0])
        g.edata['timestamp'] = torch.cat(time_parts)

        g.ndata['label'] = label_table[:, 1]
        g.ndata['feat'] = features_table[:, 1:]

        # used to construct time-based sub-graph.
        first_step = int(torch.min(node_times))
        last_step = int(torch.max(node_times))
        node_mask_by_time = [node_times == step for step in range(first_step, last_step + 1)]

        return g, node_mask_by_time

    @property
    def num_classes(self):
        r"""Number of label classes a node can take."""
        return 2