dataset.py 4.07 KB
Newer Older
1
""" Code adapted from https://github.com/kavehhassani/mvgrl """
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
2
import dgl
3
import networkx as nx
4
5
import numpy as np
import scipy.sparse as sp
6
7
import torch as th
from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
8
from dgl.nn import APPNPConv
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
9
10
from scipy.linalg import fractional_matrix_power, inv
from sklearn.preprocessing import MinMaxScaler
11

12

13
14
15
16
def preprocess_features(features):
    """Row-normalize a feature matrix so that every non-empty row sums to 1.

    Parameters
    ----------
    features : numpy.ndarray or scipy.sparse matrix
        2-D feature matrix; each row is scaled by the inverse of its sum.

    Returns
    -------
    numpy.ndarray
        The row-normalized matrix, when the normalized result is an ndarray
        (the case for a dense ndarray input).
    tuple
        Otherwise ``(dense, sparse_tuple)``: the densified normalized matrix
        and its ``(coords, values, shape)`` representation as produced by
        ``sparse_to_tuple``.
    """
    rowsum = np.array(features.sum(1)).flatten()
    # NumPy refuses to raise integer arrays to a negative power, so promote
    # non-float sums to float; float inputs keep their original dtype.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(float)
    # Rows that sum to zero yield inf here; that is expected and zeroed out
    # just below, so silence the warning only for this expression.
    with np.errstate(divide="ignore", over="ignore"):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.0
    # Left-multiplying by diag(1 / rowsum) scales each row independently.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    if isinstance(features, np.ndarray):
        return features
    return features.todense(), sparse_to_tuple(features)


def sparse_to_tuple(sparse_mx):
    """Turn a sparse matrix (or a list of them) into ``(coords, values, shape)``.

    ``coords`` is an array with one ``[row, col]`` pair per stored entry,
    ``values`` holds the matching stored data and ``shape`` is the matrix
    shape. When a list is given, its elements are replaced in place and the
    same list object is returned.
    """

    def _as_triplet(matrix):
        # COO format exposes the row/col/data arrays directly.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        index_pairs = np.vstack((coo.row, coo.col)).transpose()
        return index_pairs, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx


def compute_ppr(graph: nx.Graph, alpha=0.2, self_loop=True):
    """Compute the dense Personalized PageRank diffusion matrix of a graph.

    Evaluates ``alpha * (I_n - (1 - alpha) * A~)^-1`` where
    ``A~ = D^(-1/2) A^ D^(-1/2)``, ``A^`` is the adjacency matrix (plus the
    identity when ``self_loop`` is true) and ``D^`` is the diagonal matrix of
    the row sums of ``A^``.

    Parameters
    ----------
    graph : networkx graph
        Graph whose adjacency matrix is obtained with ``to_numpy_array``.
    alpha : float
        Scalar used in the closed form above (default 0.2).
    self_loop : bool
        When true, the identity is added to the adjacency matrix first.

    Returns
    -------
    numpy.ndarray
        Dense ``n x n`` diffusion matrix.
    """
    a = nx.convert_matrix.to_numpy_array(graph)
    n = a.shape[0]
    if self_loop:
        a = a + np.eye(n)  # A^ = A + I_n
    deg = np.sum(a, 1)  # diagonal of D^ (row sums of A^)
    # D^ is diagonal, so D^(-1/2) is just an elementwise power of the degree
    # vector; this avoids a general dense matrix power and two dense matmuls.
    # Zero-degree rows (only possible without self loops) get a scale of 0
    # instead of inf. Assumes non-negative row sums -- TODO confirm for
    # weighted graphs.
    with np.errstate(divide="ignore"):
        d_inv_sqrt = np.power(deg, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    # A~ = D^(-1/2) x A^ x D^(-1/2), applied as row and column scaling.
    at = d_inv_sqrt[:, None] * a * d_inv_sqrt[None, :]
    return alpha * inv(np.eye(n) - (1 - alpha) * at)  # a(I_n-(1-a)A~)^-1
56
57
58


def process_dataset(name, epsilon):
    """Load Cora or Citeseer and build its PPR diffusion graph.

    Parameters
    ----------
    name : str
        Either ``"cora"`` or ``"citeseer"``.
    epsilon : float
        Threshold below which diffusion entries are zeroed. Only applied in
        the ``"citeseer"`` branch.

    Returns
    -------
    tuple
        ``(graph, diff_graph, feat, label, train_idx, val_idx, test_idx,
        diff_weight)`` where ``graph`` is the dataset graph after
        ``add_self_loop()``, ``diff_graph`` is built from the non-zero
        entries of the diffusion matrix and ``diff_weight`` holds the
        diffusion values at those entries.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    if name == "cora":
        dataset = CoraGraphDataset()
    elif name == "citeseer":
        dataset = CiteseerGraphDataset()
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on ``dataset``.
        raise ValueError(
            f"Unsupported dataset name: {name!r}; expected 'cora' or 'citeseer'"
        )

    graph = dataset[0]
    feat = graph.ndata.pop("feat")
    label = graph.ndata.pop("label")

    train_mask = graph.ndata.pop("train_mask")
    val_mask = graph.ndata.pop("val_mask")
    test_mask = graph.ndata.pop("test_mask")

    # squeeze(1) drops only the trailing index dimension, so a split with a
    # single node still yields a 1-D index tensor rather than a 0-d scalar.
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze(1)
    val_idx = th.nonzero(val_mask, as_tuple=False).squeeze(1)
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze(1)

    nx_g = dgl.to_networkx(graph)

    print("computing ppr")
    diff_adj = compute_ppr(nx_g, 0.2)
    print("computing end")

    if name == "citeseer":
        print("additional processing")
        # Citeseer only: row-normalize features, sparsify the diffusion
        # matrix with the epsilon threshold, then min-max scale it.
        feat = th.tensor(preprocess_features(feat.numpy())).float()
        diff_adj[diff_adj < epsilon] = 0
        scaler = MinMaxScaler()
        scaler.fit(diff_adj)
        diff_adj = scaler.transform(diff_adj)

    # Every non-zero diffusion entry becomes an edge; its value is the weight.
    diff_edges = np.nonzero(diff_adj)
    diff_weight = diff_adj[diff_edges]
    diff_graph = dgl.graph(diff_edges)

    graph = graph.add_self_loop()

    return (
        graph,
        diff_graph,
        feat,
        label,
        train_idx,
        val_idx,
        test_idx,
        diff_weight,
    )

107
108
109
110
111
112

def process_dataset_appnp(epsilon):
    """Load Pubmed and build a diffusion graph via APPNP propagation.

    The diffusion matrix is obtained by propagating an identity matrix
    through ``APPNPConv(k=20, alpha=0.2)`` on the self-looped graph, zeroing
    entries below ``epsilon`` and min-max scaling the result.

    Parameters
    ----------
    epsilon : float
        Threshold below which diffusion entries are set to zero.

    Returns
    -------
    tuple
        ``(graph, diff_graph, feat, label, train_idx, val_idx, test_idx,
        diff_weight)`` where ``diff_graph`` is built from the non-zero
        entries of the diffusion matrix and ``diff_weight`` holds the
        diffusion values at those entries.
    """
    k = 20
    alpha = 0.2
    dataset = PubmedGraphDataset()
    graph = dataset[0]
    feat = graph.ndata.pop("feat")
    label = graph.ndata.pop("label")

    train_mask = graph.ndata.pop("train_mask")
    val_mask = graph.ndata.pop("val_mask")
    test_mask = graph.ndata.pop("test_mask")

    # squeeze(1) drops only the trailing index dimension, so a split with a
    # single node still yields a 1-D index tensor rather than a 0-d scalar.
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze(1)
    val_idx = th.nonzero(val_mask, as_tuple=False).squeeze(1)
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze(1)

    appnp = APPNPConv(k, alpha)
    # Named ``identity`` rather than ``id`` to avoid shadowing the builtin.
    identity = th.eye(graph.num_nodes()).float()
    # The result is only converted to NumPy, so gradient tracking is not
    # needed for this propagation.
    with th.no_grad():
        diff_adj = appnp(graph.add_self_loop(), identity).numpy()

    diff_adj[diff_adj < epsilon] = 0
    scaler = MinMaxScaler()
    scaler.fit(diff_adj)
    diff_adj = scaler.transform(diff_adj)
    # Every non-zero diffusion entry becomes an edge; its value is the weight.
    diff_edges = np.nonzero(diff_adj)
    diff_weight = diff_adj[diff_edges]
    diff_graph = dgl.graph(diff_edges)

    # NOTE(review): the result of add_self_loop() above is not assigned back
    # to ``graph``, unlike in process_dataset -- confirm this is intended.
    return (
        graph,
        diff_graph,
        feat,
        label,
        train_idx,
        val_idx,
        test_idx,
        diff_weight,
    )