"vscode:/vscode.git/clone" did not exist on "0449f4d6b3f0fa888a40d51c06ebd1a85b3a59a4"
dataset.py 6.6 KB
Newer Older
1
import os
2

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
3
4
import dgl

5
import numpy
6
import pandas
7
import torch
8

9
10
11
12
13
14
15
16
17
18

def process_raw_data(raw_dir, processed_dir):
    r"""Convert the raw Elliptic CSV files into contiguous-id ``.npy`` arrays.

    Description
    -----------
    Preprocess Elliptic dataset like the EvolveGCN official instruction:
    github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    The main purpose is to convert original idx to contiguous idx start at 0.

    Parameters
    ----------
    raw_dir : str
        Directory holding ``elliptic_txs_classes.csv``,
        ``elliptic_txs_edgelist.csv`` and ``elliptic_txs_features.csv``.
    processed_dir : str
        Directory the output arrays are written to. It is created when it
        does not exist yet.

    Notes
    -----
    Four files are written into ``processed_dir``:

    * ``oid_nid.npy``          -- int rows ``(originalId, newId)``
    * ``id_label.npy``         -- int rows ``(newId, label)`` with label
      -1 (unknown), 1 (illicit, raw class "1") or 0 (raw class "2")
    * ``id_time_features.npy`` -- float rows ``(newId, timestamp, features...)``
    * ``src_dst_time.npy``     -- int rows ``(newSrc, newDst, timestamp)``

    If all four files already exist, nothing is recomputed.
    """
    oid_nid_path = os.path.join(processed_dir, "oid_nid.npy")
    id_label_path = os.path.join(processed_dir, "id_label.npy")
    id_time_features_path = os.path.join(processed_dir, "id_time_features.npy")
    src_dst_time_path = os.path.join(processed_dir, "src_dst_time.npy")

    output_paths = (
        oid_nid_path,
        id_label_path,
        id_time_features_path,
        src_dst_time_path,
    )
    if all(os.path.exists(path) for path in output_paths):
        print(
            "The preprocessed data already exists, skip the preprocess stage!"
        )
        return
    print("starting process raw data in {}".format(raw_dir))

    id_label = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_classes.csv")
    )
    src_dst = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_edgelist.csv")
    )
    # elliptic_txs_features.csv has no header, and its rows are in the same
    # order as the rows of elliptic_txs_classes.csv.
    id_time_features = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_features.csv"), header=None
    )

    # Build the originalId -> newId table; newId is simply the row position.
    oid_nid = id_label.loc[:, ["txId"]]
    oid_nid = oid_nid.rename(columns={"txId": "originalId"})
    oid_nid.insert(1, "newId", range(len(oid_nid)))

    # Map classes unknown,1,2 to -1,1,0 and construct id_label.
    # Class 1 means illicit.
    id_label = pandas.concat(
        [
            oid_nid["newId"],
            id_label["class"].map({"unknown": -1.0, "1": 1.0, "2": 0.0}),
        ],
        axis=1,
    )

    # Replace originalId with newId in the feature table (relies on the
    # shared row order noted above).
    # Attention: the timestamp in features starts at 1.
    id_time_features[0] = oid_nid["newId"]

    # originalId -> newId lookup
    oid_nid_dict = oid_nid.set_index(["originalId"])["newId"].to_dict()
    # newId -> timestamp lookup (column 1 of the feature table)
    nid_time_dict = id_time_features.set_index([0])[1].to_dict()

    # Map ids in the edge list to newId, and add a timestamp to each edge.
    # Attention: in the EvolveGCN official instruction the edge timestamps
    # start at 0 rather than 1, see:
    # github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    # Here we do not follow the official instruction, so edge timestamps also
    # start at 1. In the EvolveGCN example the edge timestamp is not used.
    #
    # Note: in the dataset, src and dst nodes share the same timestamp, so the
    # edge simply takes the timestamp of its source node.
    new_src = src_dst["txId1"].map(oid_nid_dict).rename("newSrc")
    new_dst = src_dst["txId2"].map(oid_nid_dict).rename("newDst")
    edge_time = new_src.map(nid_time_dict).rename("timestamp")
    src_dst_time = pandas.concat([new_src, new_dst, edge_time], axis=1)

    # Convert everything to numpy before saving.
    # oid_nid: int.  id_label: int.  id_time_features: float.
    # src_dst_time: int.
    oid_nid = oid_nid.to_numpy(dtype=int)
    id_label = id_label.to_numpy(dtype=int)
    id_time_features = id_time_features.to_numpy(dtype=float)
    src_dst_time = src_dst_time.to_numpy(dtype=int)

    # numpy.save does not create missing directories. An empty path means
    # "current directory" and must not be passed to os.makedirs.
    if processed_dir:
        os.makedirs(processed_dir, exist_ok=True)

    numpy.save(oid_nid_path, oid_nid)
    numpy.save(id_label_path, id_label)
    numpy.save(id_time_features_path, id_time_features)
    numpy.save(src_dst_time_path, src_dst_time)
    print(
        "Process Elliptic raw data done, data has saved into {}".format(
            processed_dir
        )
    )
96
97
98


class EllipticDataset:
    r"""Loader that turns the preprocessed Elliptic arrays into a DGL graph.

    Parameters
    ----------
    raw_dir : str
        Directory with the raw Elliptic CSV files.
    processed_dir : str
        Directory where the preprocessed ``.npy`` files are stored.
    self_loop : bool, optional
        If True, one self-loop edge per node is appended. Default: True.
    reverse_edge : bool, optional
        If True, the reverse of every edge is appended. Default: True.
    """

    def __init__(
        self, raw_dir, processed_dir, self_loop=True, reverse_edge=True
    ):
        self.raw_dir = raw_dir
        # NOTE: the attribute name is misspelled ("processd"); it is kept
        # as-is so code reading this attribute keeps working.
        self.processd_dir = processed_dir
        self.self_loop = self_loop
        self.reverse_edge = reverse_edge

    def process(self):
        r"""Build the graph and the per-timestamp node masks.

        Returns
        -------
        g : DGLGraph
            Graph with ``ndata["label"]``, ``ndata["feat"]`` and
            ``edata["timestamp"]`` set. Edges are ordered as: original edges,
            then reverse edges (if enabled), then self loops (if enabled).
        node_mask_by_time : list of torch.Tensor
            One boolean node mask per integer timestamp, from the smallest to
            the largest timestamp (inclusive), in increasing order.
        """
        process_raw_data(self.raw_dir, self.processd_dir)

        def load(file_name):
            return numpy.load(os.path.join(self.processd_dir, file_name))

        def join(parts):
            # A single part is used directly, without concatenation.
            return parts[0] if len(parts) == 1 else torch.cat(parts)

        id_time_features = torch.Tensor(load("id_time_features.npy"))
        id_label = torch.IntTensor(load("id_label.npy"))
        src_dst_time = torch.IntTensor(load("src_dst_time.npy"))

        src = src_dst_time[:, 0]
        dst = src_dst_time[:, 1]
        edge_time = src_dst_time[:, 2]
        # Column 1 of the feature table is the node timestamp.
        node_time = id_time_features[:, 1]

        src_parts = [src]
        dst_parts = [dst]
        time_parts = [edge_time]
        if self.reverse_edge:
            # The reversed edge inherits the timestamp of the original edge.
            src_parts.append(dst)
            dst_parts.append(src)
            time_parts.append(edge_time)
        if self.self_loop:
            # id_label[:, 0] holds the node ids and is used to add self loops;
            # a self loop carries the timestamp of its node.
            node_ids = id_label[:, 0]
            src_parts.append(node_ids)
            dst_parts.append(node_ids)
            time_parts.append(node_time.int())

        g = dgl.graph(
            data=(join(src_parts), join(dst_parts)),
            num_nodes=id_label.shape[0],
        )
        g.edata["timestamp"] = join(time_parts)

        g.ndata["label"] = id_label[:, 1]
        # Everything after the node-id column: the timestamp followed by the
        # remaining feature columns.
        g.ndata["feat"] = id_time_features[:, 1:]

        # used to construct time-based sub-graph.
        start_time = int(torch.min(node_time))
        end_time = int(torch.max(node_time))
        node_mask_by_time = [
            node_time == t for t in range(start_time, end_time + 1)
        ]

        return g, node_mask_by_time

    @property
    def num_classes(self):
        r"""Number of classes for each node."""
        return 2