utils.py 6.06 KB
Newer Older
1
2
3
4
5
6
7
"""
Utility functions for link prediction
Most code is adapted from authors' implementation of RGCN link prediction:
https://github.com/MichSchli/RelationPrediction

"""

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
8
import dgl
9
10
import numpy as np
import tensorflow as tf
11

12
13
14
15
16
17
#######################################################################
#
# Utility function for building training and testing graphs
#
#######################################################################

18

19
def get_adj_and_degrees(num_nodes, triplets):
20
    """Get adjacency list and degrees of the graph"""
21
    adj_list = [[] for _ in range(num_nodes)]
22
    for i, triplet in enumerate(triplets):
23
24
25
26
27
28
29
        adj_list[triplet[0]].append([i, triplet[2]])
        adj_list[triplet[2]].append([i, triplet[0]])

    degrees = np.array([len(a) for a in adj_list])
    adj_list = [np.array(a) for a in adj_list]
    return adj_list, degrees

30

31
32
33
34
35
36
37
38
def sample_edge_neighborhood(adj_list, degrees, n_triplets, sample_size):
    """Sample edges by neighborhool expansion.

    This guarantees that the sampled edges form a connected graph, which
    may help deeper GNNs that require information from more than one hop.
    """
    edges = np.zeros((sample_size), dtype=np.int32)

39
    # initialize
40
41
42
43
44
45
46
47
48
49
50
51
    sample_counts = np.array([d for d in degrees])
    picked = np.array([False for _ in range(n_triplets)])
    seen = np.array([False for _ in degrees])

    for i in range(0, sample_size):
        weights = sample_counts * seen

        if np.sum(weights) == 0:
            weights = np.ones_like(weights)
            weights[np.where(sample_counts == 0)] = 0

        probabilities = (weights) / np.sum(weights)
52
53
54
        chosen_vertex = np.random.choice(
            np.arange(degrees.shape[0]), p=probabilities
        )
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
        chosen_adj_list = adj_list[chosen_vertex]
        seen[chosen_vertex] = True

        chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
        chosen_edge = chosen_adj_list[chosen_edge]
        edge_number = chosen_edge[0]

        while picked[edge_number]:
            chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
            chosen_edge = chosen_adj_list[chosen_edge]
            edge_number = chosen_edge[0]

        edges[i] = edge_number
        other_vertex = chosen_edge[1]
        picked[edge_number] = True
        sample_counts[chosen_vertex] -= 1
        sample_counts[other_vertex] -= 1
        seen[other_vertex] = True

    return edges

76

77
78
79
80
81
def sample_edge_uniform(adj_list, degrees, n_triplets, sample_size):
    """Sample edges uniformly from all the edges."""
    all_edges = np.arange(n_triplets)
    return np.random.choice(all_edges, sample_size, replace=False)

82
83
84
85
86
87
88
89
90
91
92

def generate_sampled_graph_and_labels(
    triplets,
    sample_size,
    split_size,
    num_rels,
    adj_list,
    degrees,
    negative_rate,
    sampler="uniform",
):
93
94
95
96
97
98
    """Get training graph and signals
    First perform edge neighborhood sampling on graph, then perform negative
    sampling to generate negative samples
    """
    # perform edge neighbor sampling
    if sampler == "uniform":
99
100
101
        edges = sample_edge_uniform(
            adj_list, degrees, len(triplets), sample_size
        )
102
    elif sampler == "neighbor":
103
104
105
        edges = sample_edge_neighborhood(
            adj_list, degrees, len(triplets), sample_size
        )
106
107
108
109
110
111
112
113
114
115
116
    else:
        raise ValueError("Sampler type must be either 'uniform' or 'neighbor'.")

    # relabel nodes to have consecutive node ids
    edges = triplets[edges]
    src, rel, dst = edges.transpose()
    uniq_v, edges = np.unique((src, dst), return_inverse=True)
    src, dst = np.reshape(edges, (2, -1))
    relabeled_edges = np.stack((src, rel, dst)).transpose()

    # negative sampling
117
118
119
    samples, labels = negative_sampling(
        relabeled_edges, len(uniq_v), negative_rate
    )
120
121
122
123

    # further split graph, only half of the edges will be used as graph
    # structure, while the rest half is used as unseen positive samples
    split_size = int(sample_size * split_size)
124
125
126
    graph_split_ids = np.random.choice(
        np.arange(sample_size), size=split_size, replace=False
    )
127
128
129
130
131
132
133
    src = src[graph_split_ids]
    dst = dst[graph_split_ids]
    rel = rel[graph_split_ids]

    # build DGL graph
    print("# sampled nodes: {}".format(len(uniq_v)))
    print("# sampled edges: {}".format(len(src) * 2))
134
135
136
    g, rel, norm = build_graph_from_triplets(
        len(uniq_v), num_rels, (src, rel, dst)
    )
137
138
    return g, uniq_v, rel, norm, samples, labels

139

140
141
142
143
144
145
146
def comp_deg_norm(g):
    g = g.local_var()
    in_deg = g.in_degrees(range(g.number_of_nodes())).float().numpy()
    norm = 1.0 / in_deg
    norm[np.isinf(norm)] = 0
    return norm

147

148
def build_graph_from_triplets(num_nodes, num_rels, triplets):
149
150
151
152
    """Create a DGL graph. The graph is bidirectional because RGCN authors
    use reversed relations.
    This function also generates edge type and normalization factor
    (reciprocal of node incoming degree)
153
154
155
156
157
158
159
160
161
162
163
164
165
    """
    g = dgl.DGLGraph()
    g.add_nodes(num_nodes)
    src, rel, dst = triplets
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    rel = np.concatenate((rel, rel + num_rels))
    edges = sorted(zip(dst, src, rel))
    dst, src, rel = np.array(edges).transpose()
    g.add_edges(src, dst)
    norm = comp_deg_norm(g)
    print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
    return g, rel, norm

166

167
168
169
170
171
def build_test_graph(num_nodes, num_rels, edges):
    src, rel, dst = edges.transpose()
    print("Test graph:")
    return build_graph_from_triplets(num_nodes, num_rels, (src, rel, dst))

172

173
174
175
176
177
def negative_sampling(pos_samples, num_entity, negative_rate):
    size_of_batch = len(pos_samples)
    num_to_generate = size_of_batch * negative_rate
    neg_samples = np.tile(pos_samples, (negative_rate, 1))
    labels = np.zeros(size_of_batch * (negative_rate + 1), dtype=np.float32)
178
    labels[:size_of_batch] = 1
179
180
181
182
183
184
185
186
    values = np.random.randint(num_entity, size=num_to_generate)
    choices = np.random.uniform(size=num_to_generate)
    subj = choices > 0.5
    obj = choices <= 0.5
    neg_samples[subj, 0] = values[subj]
    neg_samples[obj, 2] = values[obj]

    return np.concatenate((pos_samples, neg_samples)), labels