dataset.py 7.03 KB
Newer Older
1
""" Code adapted from https://github.com/kavehhassani/mvgrl """
2
3
import os
import re
4
5
from collections import Counter

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
6
7
import dgl

8
import networkx as nx
9
10
import numpy as np
import torch as th
11
from dgl.data import DGLDataset
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
12
from scipy.linalg import fractional_matrix_power, inv
13
14
15
16

""" Compute Personalized Page Ranking"""


17
18
19
def compute_ppr(graph: nx.Graph, alpha=0.2, self_loop=True):
    """Return the dense Personalized PageRank (PPR) diffusion matrix of ``graph``.

    Evaluates ``alpha * (I - (1 - alpha) * A~)^-1`` where
    ``A~ = D^(-1/2) (A [+ I]) D^(-1/2)`` is the symmetrically normalised
    adjacency matrix.

    Args:
        graph: NetworkX graph whose adjacency matrix is diffused. Rows and
            columns follow the iteration order of ``graph.nodes()``.
        alpha: Teleport probability.
        self_loop: If true, add the identity to the adjacency matrix before
            normalising (``A^ = A + I_n``).

    Returns:
        An ``(n, n)`` NumPy array, ``n`` being the number of nodes.
    """
    adj = nx.to_numpy_array(graph)
    n = adj.shape[0]
    if self_loop:
        adj = adj + np.eye(n)  # A^ = A + I_n

    # D is diagonal, so D^(-1/2) is just an elementwise power of the degree
    # vector; no general matrix function is needed.
    deg = adj.sum(axis=1)
    inv_sqrt_deg = np.zeros_like(deg)
    positive = deg > 0
    # Nodes with non-positive degree (only possible when self_loop=False or
    # with unusual weights) keep a zero factor instead of producing inf/NaN.
    inv_sqrt_deg[positive] = deg[positive] ** -0.5

    # A~ = D^(-1/2) x A^ x D^(-1/2), done by broadcasting rows and columns.
    norm_adj = inv_sqrt_deg[:, None] * adj * inv_sqrt_deg[None, :]

    # a(I_n-(1-a)A~)^-1
    return alpha * inv(np.eye(n) - (1 - alpha) * norm_adj)
27
28
29
30


def download(dataset, datadir):
    """Download a TU graph-kernel dataset archive and unpack it into ``datadir``.

    The archive ``<dataset>.zip`` is fetched into a temporary directory,
    extracted there, and the contents of its top-level ``<dataset>/`` folder
    are moved into ``datadir``. Only the standard library is used, so no
    external ``wget``/``unzip`` binaries are required and nothing is written
    to the current working directory.

    Args:
        dataset: Dataset name, e.g. ``"MUTAG"``.
        datadir: Target directory; it is created here and must not exist yet.

    Raises:
        FileExistsError: If ``datadir`` already exists.
        OSError: If the download or extraction fails (``urllib.error.URLError``
            is a subclass). ``datadir`` is removed again in that case so a
            later call does not mistake it for a populated directory.
    """
    import shutil
    import tempfile
    import urllib.request
    import zipfile

    os.makedirs(datadir)
    url = "https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/{0}.zip".format(
        dataset
    )

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            archive = os.path.join(tmpdir, os.path.basename(url))
            with urllib.request.urlopen(url) as response, open(
                archive, "wb"
            ) as out:
                shutil.copyfileobj(response, out)

            with zipfile.ZipFile(archive) as zf:
                zf.extractall(tmpdir)

            # The archive contains a single folder named after the dataset.
            extracted = os.path.join(tmpdir, dataset)
            for entry in os.listdir(extracted):
                shutil.move(
                    os.path.join(extracted, entry),
                    os.path.join(datadir, entry),
                )
    except BaseException:
        # Also covers Ctrl-C during a long download: never leave an empty or
        # half-filled directory behind.
        shutil.rmtree(datadir, ignore_errors=True)
        raise

40
41

def process(dataset):
    """Parse the raw TU-format text files of ``dataset`` into NetworkX graphs.

    Files are read from ``data/<dataset>/`` next to this module:
    ``<dataset>_graph_indicator.txt``, ``<dataset>_graph_labels.txt`` and
    ``<dataset>_A.txt`` are required; ``<dataset>_node_labels.txt`` and
    ``<dataset>_node_attributes.txt`` are optional.

    Every node ends up with a ``"feat"`` attribute: the node attributes when
    the attribute file exists, otherwise a one-hot degree encoding, prefixed
    with the one-hot node label when node labels are available.

    Args:
        dataset: Dataset name, e.g. ``"MUTAG"``.

    Returns:
        A pair ``(graphs, pprs)``: ``graphs`` is a list of ``nx.Graph`` with
        nodes relabelled to ``0..n-1`` and the graph class index stored in
        ``graph.graph["label"]``; ``pprs`` holds the matching PPR diffusion
        matrices returned by ``compute_ppr``.
    """
    src = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    prefix = os.path.join(src, dataset, dataset)

    # assign each node to the corresponding graph (both ids are 1-based)
    graph_node_dict = {}
    with open("{0}_graph_indicator.txt".format(prefix), "r") as f:
        for node_id, line in enumerate(f, start=1):
            graph_node_dict[node_id] = int(line.strip("\n"))

    node_labels = []
    num_unique_node_labels = 0
    node_labels_path = "{0}_node_labels.txt".format(prefix)
    if os.path.exists(node_labels_path):
        with open(node_labels_path, "r") as f:
            raw_node_labels = [int(line.strip("\n")) for line in f]
        if raw_node_labels:
            # Labels are normally 1-based and shifted down by one. Some
            # datasets start at 0 (or below); shifting those by one would
            # give index -1, which aliases the last one-hot slot, so shift by
            # the smallest label in that case.
            offset = min(1, min(raw_node_labels))
            node_labels = [label - offset for label in raw_node_labels]
            num_unique_node_labels = max(node_labels) + 1
    else:
        print("No node labels")

    node_attrs = []
    node_attrs_path = "{0}_node_attributes.txt".format(prefix)
    if os.path.exists(node_attrs_path):
        separator = re.compile(r"[,\s]+")
        with open(node_attrs_path, "r") as f:
            for line in f:
                node_attrs.append(
                    np.array(
                        [
                            float(attr)
                            for attr in separator.split(line.strip())
                            if attr
                        ],
                        # builtin float == float64; np.float no longer exists
                        dtype=float,
                    )
                )
    else:
        print("No node attributes")

    graph_labels = []
    unique_labels = set()
    with open("{0}_graph_labels.txt".format(prefix), "r") as f:
        for line in f:
            val = int(line.strip("\n"))
            unique_labels.add(val)
            graph_labels.append(val)
    # NOTE: class indices follow the set's iteration order, which is not
    # necessarily the sorted order of the raw labels.
    label_idx_dict = {val: idx for idx, val in enumerate(unique_labels)}
    graph_labels = np.array([label_idx_dict[l] for l in graph_labels])

    # graph id -> list of (u, v) edges, using the global 1-based node ids
    adj_list = {idx: [] for idx in range(1, len(graph_labels) + 1)}
    with open("{0}_A.txt".format(prefix), "r") as f:
        for line in f:
            u, v = map(int, line.strip("\n").split(","))
            adj_list[graph_node_dict[u]].append((u, v))

    graphs, pprs = [], []
    for idx in range(1, 1 + len(adj_list)):
        graph = nx.from_edgelist(adj_list[idx])
        graph.graph["label"] = graph_labels[idx - 1]

        for u in graph.nodes():
            if node_labels:
                node_label_one_hot = [0] * num_unique_node_labels
                node_label_one_hot[node_labels[u - 1]] = 1
                graph.nodes[u]["label"] = node_label_one_hot
            if node_attrs:
                graph.nodes[u]["feat"] = node_attrs[u - 1]
        if node_attrs:
            graph.graph["feat_dim"] = node_attrs[0].shape[0]

        # relabeling: global node ids -> 0..n-1 in iteration order
        mapping = {
            node: node_idx for node_idx, node in enumerate(graph.nodes())
        }

        graphs.append(nx.relabel_nodes(graph, mapping))
        # compute_ppr orders rows/columns by graph.nodes(), i.e. the same
        # order the relabelling above assigns, so both views line up.
        pprs.append(compute_ppr(graph, alpha=0.2))

    if "feat_dim" not in graphs[0].graph:
        # No attribute file: fall back to one-hot degree features.
        max_deg = max(
            max(dict(graph.degree).values(), default=0) for graph in graphs
        )
        for graph in graphs:
            for node, data in graph.nodes(data=True):
                degree_one_hot = np.zeros(max_deg + 1)
                degree_one_hot[graph.degree[node]] = 1.0
                if "label" in data:
                    degree_one_hot = np.concatenate(
                        (np.array(data["label"], dtype=float), degree_one_hot)
                    )
                graph.nodes[node]["feat"] = degree_one_hot

    return graphs, pprs

138

139
140
def load(dataset):
    """Load ``dataset`` as a ``TUDataset`` of (graph, diffusion graph, label).

    Preprocessed arrays are cached as ``adj.npy``, ``diff.npy``, ``feat.npy``
    and ``labels.npy`` under ``data/<dataset>/`` next to this module. When
    any of them is missing the raw data is downloaded (if the directory does
    not exist yet), processed with ``process`` and the cache is written.

    Args:
        dataset: Dataset name, e.g. ``"MUTAG"``.

    Returns:
        A ``TUDataset`` whose items are ``(graph, diff_graph, label)``.
        ``graph`` has self-loops added and node features in
        ``ndata["feat"]``; ``diff_graph`` carries the PPR weights in
        ``edata["edge_weight"]``.
    """
    basedir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(basedir, "data", dataset)
    cache = {
        name: os.path.join(datadir, f"{name}.npy")
        for name in ("adj", "diff", "feat", "labels")
    }

    if all(os.path.exists(path) for path in cache.values()):
        # NOTE: allow_pickle is needed for the per-graph object arrays. Only
        # load cache files written by this module; unpickling untrusted
        # files can execute arbitrary code.
        adj = np.load(cache["adj"], allow_pickle=True)
        diff = np.load(cache["diff"], allow_pickle=True)
        feat = np.load(cache["feat"], allow_pickle=True)
        labels = np.load(cache["labels"], allow_pickle=True)
    else:
        # Keyed on the cache files rather than the directory, so a run that
        # died between download and save is repaired on the next call.
        if not os.path.exists(datadir):
            download(dataset, datadir)
        graphs, diff = process(dataset)

        adj = [nx.to_numpy_array(graph) for graph in graphs]
        labels = [graph.graph["label"] for graph in graphs]
        feat = [
            np.array(list(nx.get_node_attributes(graph, "feat").values()))
            for graph in graphs
        ]

        # Graphs differ in size, so the per-graph matrices are ragged;
        # np.array(list) refuses that on recent NumPy without dtype=object.
        adj = _as_object_array(adj)
        diff = _as_object_array(diff)
        feat = _as_object_array(feat)
        labels = np.array(labels)

        np.save(cache["adj"], adj)
        np.save(cache["diff"], diff)
        np.save(cache["feat"], feat)
        np.save(cache["labels"], labels)

    graphs = []
    diff_graphs = []

    for a, diff_adj, x in zip(adj, diff, feat):
        # Fix the node count explicitly instead of letting DGL infer it from
        # the largest node id that happens to appear in an edge.
        num_nodes = a.shape[0]

        graph = dgl.graph(a.nonzero(), num_nodes=num_nodes)
        graph = graph.add_self_loop()
        graph.ndata["feat"] = th.tensor(x).float()

        diff_indexes = diff_adj.nonzero()
        diff_weight = th.tensor(diff_adj[diff_indexes]).float()

        diff_graph = dgl.graph(diff_indexes, num_nodes=num_nodes)
        diff_graph.edata["edge_weight"] = diff_weight

        graphs.append(graph)
        diff_graphs.append(diff_graph)

    labels = th.tensor(list(labels))

    dataset = TUDataset(graphs, diff_graphs, labels)
    return dataset


def _as_object_array(items):
    """Pack a list of arrays of arbitrary shapes into a 1-D object array."""
    packed = np.empty(len(items), dtype=object)
    # Element-wise assignment keeps each array as one entry even when all
    # shapes happen to match.
    for i, item in enumerate(items):
        packed[i] = item
    return packed

202

203
204
class TUDataset(DGLDataset):
    """In-memory dataset of pre-built graphs, diffusion graphs and labels.

    The three containers are indexed in parallel: item ``i`` is the tuple
    ``(graphs[i], diff_graphs[i], labels[i])``.
    """

    def __init__(self, graphs, diff_graphs, labels):
        """Register the dataset under the name ``"tu"`` and keep the inputs."""
        super().__init__(name="tu")
        self.graphs, self.diff_graphs, self.labels = (
            graphs,
            diff_graphs,
            labels,
        )

    def process(self):
        """Do nothing; the data handed to ``__init__`` is already built."""
        return None

    def __len__(self):
        """Return the number of graphs held by the dataset."""
        n_items = len(self.graphs)
        return n_items

    def __getitem__(self, idx):
        """Return ``(graph, diff_graph, label)`` at position ``idx``."""
        graph = self.graphs[idx]
        diff_graph = self.diff_graphs[idx]
        label = self.labels[idx]
        return graph, diff_graph, label