import os

import dgl
import numpy
import pandas
import torch


def process_raw_data(raw_dir, processed_dir):
    r"""

    Description
    -----------
    Preprocess Elliptic dataset like the EvolveGCN official instruction:
    github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    The main purpose is to convert original idx to contiguous idx start at 0.

    Four arrays are written into ``processed_dir``:

    * ``oid_nid.npy``: columns ``[originalId, newId]``.
    * ``id_label.npy``: columns ``[newId, label]`` where the raw classes
      ``unknown``, ``1``, ``2`` are mapped to ``-1``, ``1``, ``0``
      (``1`` means illicit).
    * ``id_time_features.npy``: columns ``[newId, timestamp, features...]``.
    * ``src_dst_time.npy``: columns ``[newSrc, newDst, timestamp]``.

    If all four files already exist the function returns without doing any
    work.

    Parameters
    ----------
    raw_dir : str
        Directory containing ``elliptic_txs_classes.csv``,
        ``elliptic_txs_edgelist.csv`` and ``elliptic_txs_features.csv``.
    processed_dir : str
        Directory the ``.npy`` files are written to. It is created when it
        does not exist yet.
    """
    oid_nid_path = os.path.join(processed_dir, "oid_nid.npy")
    id_label_path = os.path.join(processed_dir, "id_label.npy")
    id_time_features_path = os.path.join(processed_dir, "id_time_features.npy")
    src_dst_time_path = os.path.join(processed_dir, "src_dst_time.npy")
    output_paths = (
        oid_nid_path,
        id_label_path,
        id_time_features_path,
        src_dst_time_path,
    )
    if all(os.path.exists(path) for path in output_paths):
        print(
            "The preprocessed data already exists, skip the preprocess stage!"
        )
        return
    print("starting process raw data in {}".format(raw_dir))
    id_label = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_classes.csv")
    )
    src_dst = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_edgelist.csv")
    )
    # elliptic_txs_features.csv has no header, and its rows are in the same
    # order as elliptic_txs_classes.csv.
    id_time_features = pandas.read_csv(
        os.path.join(raw_dir, "elliptic_txs_features.csv"), header=None
    )

    # get oldId_newId
    oid_nid = id_label.loc[:, ["txId"]]
    oid_nid = oid_nid.rename(columns={"txId": "originalId"})
    oid_nid.insert(1, "newId", range(len(oid_nid)))

    # map classes unknown,1,2 to -1,1,0 and construct id_label.
    # type 1 means illicit.
    id_label = pandas.concat(
        [
            oid_nid["newId"],
            id_label["class"].map({"unknown": -1.0, "1": 1.0, "2": 0.0}),
        ],
        axis=1,
    )

    # replace originalId with newId (relies on the shared row order noted
    # above).
    # Attention: the timestamp in features start at 1.
    id_time_features[0] = oid_nid["newId"]

    # construct originalId2newId dict
    oid_nid_dict = oid_nid.set_index(["originalId"])["newId"].to_dict()
    # construct newId2timestamp dict
    nid_time_dict = id_time_features.set_index([0])[1].to_dict()

    # Map id in edgelist to newId, and add a timestamp to each edge.
    # Attention: From the EvolveGCN official instruction, the timestamp with
    # edgelist start at 0, rather than 1.
    # see: github.com/IBM/EvolveGCN/blob/master/elliptic_construction.md
    # Here we do not follow the official instruction, which means timestamp
    # with edgelist also start at 1.
    # In EvolveGCN example, the edge timestamp will not be used.
    #
    # Note: in the dataset, src and dst node has the same timestamp, so the
    # edge simply takes the timestamp of its source node.
    new_src = src_dst["txId1"].map(oid_nid_dict).rename("newSrc")
    new_dst = src_dst["txId2"].map(oid_nid_dict).rename("newDst")
    edge_time = new_src.map(nid_time_dict).rename("timestamp")
    src_dst_time = pandas.concat([new_src, new_dst, edge_time], axis=1)

    # save oid_nid, id_label, id_time_features, src_dst_time to disk.
    # oid_nid: type int. id_label: type int. id_time_features: type float.
    # src_dst_time: type int.
    oid_nid = oid_nid.to_numpy(dtype=int)
    id_label = id_label.to_numpy(dtype=int)
    id_time_features = id_time_features.to_numpy(dtype=float)
    src_dst_time = src_dst_time.to_numpy(dtype=int)

    # numpy.save does not create missing directories. An empty path means
    # "current directory" and must not be passed to os.makedirs.
    if processed_dir:
        os.makedirs(processed_dir, exist_ok=True)

    numpy.save(oid_nid_path, oid_nid)
    numpy.save(id_label_path, id_label)
    numpy.save(id_time_features_path, id_time_features)
    numpy.save(src_dst_time_path, src_dst_time)
    print(
        "Process Elliptic raw data done, data has saved into {}".format(
            processed_dir
        )
    )


class EllipticDataset:
    r"""Elliptic transaction graph built from the preprocessed ``.npy`` files.

    Parameters
    ----------
    raw_dir : str
        Directory holding the raw Elliptic CSV files.
    processed_dir : str
        Directory holding (or receiving) the preprocessed ``.npy`` files.
    self_loop : bool
        When True, one self-loop edge per node is appended to the graph.
    reverse_edge : bool
        When True, a reversed copy of every original edge is appended.
    """

    def __init__(
        self, raw_dir, processed_dir, self_loop=True, reverse_edge=True
    ):
        self.raw_dir = raw_dir
        self.processed_dir = processed_dir
        self.self_loop = self_loop
        self.reverse_edge = reverse_edge

    @property
    def processd_dir(self):
        r"""Misspelled alias of ``processed_dir``, kept for compatibility."""
        return self.processed_dir

    @processd_dir.setter
    def processd_dir(self, value):
        self.processed_dir = value

    def process(self):
        r"""Build the DGL graph and the per-timestamp node masks.

        Runs :func:`process_raw_data` first, so the ``.npy`` files are
        generated on demand.

        Returns
        -------
        g : dgl.DGLGraph
            Graph with ``ndata["label"]``, ``ndata["feat"]`` (the timestamp
            column followed by the remaining feature columns) and
            ``edata["timestamp"]``. Edges are ordered as: original edges,
            then reversed edges (if ``reverse_edge``), then self-loops
            (if ``self_loop``).
        node_mask_by_time : list of torch.Tensor
            One boolean node mask per timestamp, from the smallest to the
            largest timestamp found in the features (inclusive).
        """
        process_raw_data(self.raw_dir, self.processed_dir)
        id_time_features = torch.Tensor(
            numpy.load(os.path.join(self.processed_dir, "id_time_features.npy"))
        )
        id_label = torch.IntTensor(
            numpy.load(os.path.join(self.processed_dir, "id_label.npy"))
        )
        src_dst_time = torch.IntTensor(
            numpy.load(os.path.join(self.processed_dir, "src_dst_time.npy"))
        )

        src = src_dst_time[:, 0]
        dst = src_dst_time[:, 1]
        edge_time = src_dst_time[:, 2]
        node_ids = id_label[:, 0]
        node_time = id_time_features[:, 1]

        # Collect the edge pieces in a fixed order so the timestamps stay
        # aligned with the edges they describe.
        src_parts = [src]
        dst_parts = [dst]
        time_parts = [edge_time]
        if self.reverse_edge:
            src_parts.append(dst)
            dst_parts.append(src)
            time_parts.append(edge_time)
        if self.self_loop:
            # A self-loop carries the timestamp of its node.
            src_parts.append(node_ids)
            dst_parts.append(node_ids)
            time_parts.append(node_time.int())

        g = dgl.graph(
            data=(torch.cat(src_parts), torch.cat(dst_parts)),
            num_nodes=id_label.shape[0],
        )
        g.edata["timestamp"] = torch.cat(time_parts)

        g.ndata["label"] = id_label[:, 1]
        # Column 0 is the node id; everything after it is used as features.
        g.ndata["feat"] = id_time_features[:, 1:]

        # used to construct time-based sub-graph.
        start_time = int(torch.min(node_time))
        end_time = int(torch.max(node_time))
        node_mask_by_time = [
            node_time == timestamp
            for timestamp in range(start_time, end_time + 1)
        ]
        return g, node_mask_by_time

    @property
    def num_classes(self):
        r"""Number of classes for each node."""
        return 2