Unverified Commit b98dc92c authored by Lingfan Yu, committed by GitHub

[Model] Relational GCN (#55)

* data preprocessing for rgcn

* edge subgraph

* WIP: RGCN

* use edge feature in spmv

* fix bugs

* match AIFB accuracy

* match mutag accuracy

* avoid materializing in featureless case

* remove untouched nodes and relabel nodes

* fix python list concatenate overhead

* sparsely store edge types

* refactor entity classify code for clean link prediction implementation

* further refactor code

* refactoring

* rgcn block decompose layers

* link predict dataset

* link predict model and eval code

* dropout, self-loop, regularization, etc, plus bug fixes

* update to new api

* dataset update

* bugs, WIP, need to impl early stopping and filtered metrics

* instruction to run, and minor

* group conv and early stop

* clean slow code

* some code comments

* use new api in model code

* change data preprocessing

* entity classify model

* WIP

* move dgl graph out of model

* hot fix for extract zip

* fix link predict model

* use latest dgl apis

* still have memory issue...

* bug fix and move inference to cpu

* move rgcn data processing to contrib

* th.allclose -> U.allclose

* minor change in readme

* fix memory issue in entity classify

* fix and testing code for link predict

* fix entity classify

* clean up

* fix comments

* revert erroneous git merge changes

* code clean up and more comments

* minor

* dependent package version
parent 8918cce0
# Relational-GCN
### Prerequisites
Two extra Python packages are needed for this example:
- rdflib
- pandas

The example code was tested with rdflib 4.2.2 and pandas 0.23.4.
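They can be installed with pip:
```
pip install rdflib pandas
```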
### Entity Classification
AIFB:
```
python3 entity_classify.py -d aifb --testing --gpu 0
```
MUTAG:
```
python3 entity_classify.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0
```
BGS:
```
python3 entity_classify.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --relabel
```
### Link Prediction
FB15k-237:
```
python3 link_predict.py -d FB15k-237 --gpu 0
```
"""
Modeling Relational Data with Graph Convolutional Networks
Paper: https://arxiv.org/abs/1703.06103
Code: https://github.com/tkipf/relational-gcn
Differences compared to tkipf/relational-gcn:
* l2norm applied to all weights
* remove nodes that won't be touched
"""
import argparse
import numpy as np
import time
import torch
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.contrib.data import load_data
import dgl.function as fn
from functools import partial
from layers import RGCNBasisLayer as RGCNLayer
from model import BaseRGCN
class EntityClassify(BaseRGCN):
def create_features(self):
features = torch.arange(self.num_nodes)
if self.use_cuda:
features = features.cuda()
return features
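    # featureless setting: each node's input "feature" is just its own id,
    # so the input layer's projection reduces to an embedding-table lookup
    # (see the is_input_layer branch in RGCNBasisLayer)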
def build_input_layer(self):
return RGCNLayer(self.num_nodes, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu, is_input_layer=True)
def build_hidden_layer(self, idx):
return RGCNLayer(self.h_dim, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu)
def build_output_layer(self):
        return RGCNLayer(self.h_dim, self.out_dim, self.num_rels, self.num_bases,
activation=partial(F.softmax, dim=1))
def main(args):
# load graph data
data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
num_nodes = data.num_nodes
num_rels = data.num_rels
num_classes = data.num_classes
labels = data.labels
train_idx = data.train_idx
test_idx = data.test_idx
# split dataset into train, validate, test
if args.validation:
val_idx = train_idx[:len(train_idx) // 5]
train_idx = train_idx[len(train_idx) // 5:]
else:
val_idx = train_idx
# edge type and normalization factor
edge_type = torch.from_numpy(data.edge_type)
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)
labels = torch.from_numpy(labels).view(-1)
# check cuda
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
if use_cuda:
torch.cuda.set_device(args.gpu)
edge_type = edge_type.cuda()
edge_norm = edge_norm.cuda()
labels = labels.cuda()
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
g.edata.update({'type': edge_type, 'norm': edge_norm})
# create model
model = EntityClassify(len(g),
args.n_hidden,
num_classes,
num_rels,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers - 2,
dropout=args.dropout,
use_cuda=use_cuda)
if use_cuda:
model.cuda()
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2norm)
# training loop
print("start training...")
forward_time = []
backward_time = []
model.train()
for epoch in range(args.n_epochs):
optimizer.zero_grad()
t0 = time.time()
logits = model.forward(g)
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
t1 = time.time()
loss.backward()
optimizer.step()
t2 = time.time()
forward_time.append(t1 - t0)
backward_time.append(t2 - t1)
print("Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
format(epoch, forward_time[-1], backward_time[-1]))
train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx]).item() / len(train_idx)
val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
val_acc = torch.sum(logits[val_idx].argmax(dim=1) == labels[val_idx]).item() / len(val_idx)
print("Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}".
format(train_acc, loss.item(), val_acc, val_loss.item()))
print()
model.eval()
logits = model.forward(g)
test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
test_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx]).item() / len(test_idx)
print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
print()
print("Mean forward time: {:4f}".format(np.mean(forward_time[len(forward_time) // 4:])))
print("Mean backward time: {:4f}".format(np.mean(backward_time[len(backward_time) // 4:])))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--dropout", type=float, default=0,
help="dropout probability")
parser.add_argument("--n-hidden", type=int, default=16,
help="number of hidden units")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-bases", type=int, default=-1,
help="number of filter weight matrices, default: -1 [use all]")
parser.add_argument("--n-layers", type=int, default=2,
help="number of propagation rounds")
parser.add_argument("-e", "--n-epochs", type=int, default=50,
help="number of training epochs")
parser.add_argument("-d", "--dataset", type=str, required=True,
help="dataset to use")
parser.add_argument("--l2norm", type=float, default=0,
help="l2 norm coef")
parser.add_argument("--relabel", default=False, action='store_true',
help="remove untouched nodes and relabel")
fp = parser.add_mutually_exclusive_group(required=False)
fp.add_argument('--validation', dest='validation', action='store_true')
fp.add_argument('--testing', dest='validation', action='store_false')
parser.set_defaults(validation=True)
args = parser.parse_args()
print(args)
    args.bfs_level = args.n_layers + 1  # prune unused nodes to save memory
main(args)
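For intuition, the basis decomposition used by RGCNBasisLayer combines a small set of shared bases into one weight matrix per relation. A minimal self-contained sketch with made-up sizes (not part of the example code):
```
import torch

num_rels, num_bases, in_feat, out_feat = 4, 2, 3, 3
V = torch.randn(num_bases, in_feat, out_feat)  # shared bases V_b
a = torch.randn(num_rels, num_bases)           # per-relation coefficients a_rb
# W_r = sum_b a_rb * V_b, computed for all relations at once,
# exactly as in RGCNBasisLayer.propagate below
W = torch.matmul(a, V.view(num_bases, -1)).view(num_rels, in_feat, out_feat)
assert W.shape == (num_rels, in_feat, out_feat)
```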
import torch
import torch.nn as nn
import dgl.function as fn
class RGCNLayer(nn.Module):
def __init__(self, in_feat, out_feat, bias=None, activation=None,
self_loop=False, dropout=0.0):
super(RGCNLayer, self).__init__()
        self.activation = activation
        self.self_loop = self_loop
        # store bias as a Parameter when requested, None otherwise; xavier
        # init needs >= 2 dims, so the 1-D bias is zero-initialized instead
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_feat))
            nn.init.zeros_(self.bias)
        else:
            self.bias = None
# weight for self loop
if self.self_loop:
self.loop_weight = nn.Parameter(torch.Tensor(in_feat, out_feat))
nn.init.xavier_uniform_(self.loop_weight,
gain=nn.init.calculate_gain('relu'))
if dropout:
self.dropout = nn.Dropout(dropout)
else:
self.dropout = None
# define how propagation is done in subclass
def propagate(self, g):
raise NotImplementedError
def forward(self, g):
if self.self_loop:
loop_message = torch.mm(g.ndata['h'], self.loop_weight)
if self.dropout is not None:
loop_message = self.dropout(loop_message)
self.propagate(g)
# apply bias and activation
node_repr = g.ndata['h']
        if self.bias is not None:
            node_repr = node_repr + self.bias
if self.self_loop:
node_repr = node_repr + loop_message
if self.activation:
node_repr = self.activation(node_repr)
g.ndata['h'] = node_repr
class RGCNBasisLayer(RGCNLayer):
def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
activation=None, is_input_layer=False):
super(RGCNBasisLayer, self).__init__(in_feat, out_feat, bias, activation)
self.in_feat = in_feat
self.out_feat = out_feat
self.num_rels = num_rels
self.num_bases = num_bases
self.is_input_layer = is_input_layer
if self.num_bases <= 0 or self.num_bases > self.num_rels:
self.num_bases = self.num_rels
# add basis weights
        self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
                                                self.out_feat))
        nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
        if self.num_bases < self.num_rels:
            # linear combination coefficients over the bases
            self.w_comp = nn.Parameter(torch.Tensor(self.num_rels,
                                                    self.num_bases))
            nn.init.xavier_uniform_(self.w_comp,
                                    gain=nn.init.calculate_gain('relu'))
def propagate(self, g):
if self.num_bases < self.num_rels:
# generate all weights from bases
weight = self.weight.view(self.num_bases,
self.in_feat * self.out_feat)
weight = torch.matmul(self.w_comp, weight).view(
self.num_rels, self.in_feat, self.out_feat)
else:
weight = self.weight
if self.is_input_layer:
def msg_func(edges):
# for input layer, matrix multiply can be converted to be
# an embedding lookup using source node id
embed = weight.view(-1, self.out_feat)
index = edges.data['type'] * self.in_feat + edges.src['id']
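                # weight was reshaped to (num_rels * in_feat, out_feat), so row
                # (type * in_feat + src_id) is exactly W_type[src_id], i.e. the
                # product of W_type with a one-hot feature vector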
return {'msg': embed[index] * edges.data['norm']}
else:
def msg_func(edges):
w = weight[edges.data['type']]
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze(1)
msg = msg * edges.data['norm']
return {'msg': msg}
g.update_all(msg_func, fn.sum(msg='msg', out='h'), None)
class RGCNBlockLayer(RGCNLayer):
def __init__(self, in_feat, out_feat, num_rels, num_bases, bias=None,
activation=None, self_loop=False, dropout=0.0):
super(RGCNBlockLayer, self).__init__(in_feat, out_feat, bias,
activation, self_loop=self_loop,
dropout=dropout)
self.num_rels = num_rels
self.num_bases = num_bases
        assert self.num_bases > 0
        # in_feat and out_feat must both be divisible by num_bases
        assert in_feat % self.num_bases == 0
        assert out_feat % self.num_bases == 0
        self.out_feat = out_feat
        self.submat_in = in_feat // self.num_bases
        self.submat_out = out_feat // self.num_bases
self.weight = nn.Parameter(torch.Tensor(
self.num_rels, self.num_bases * self.submat_in * self.submat_out))
nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
def msg_func(self, edges):
weight = self.weight[edges.data['type']].view(
-1, self.submat_in, self.submat_out)
node = edges.src['h'].view(-1, 1, self.submat_in)
msg = torch.bmm(node, weight).view(-1, self.out_feat)
return {'msg': msg}
def propagate(self, g):
g.update_all(self.msg_func, fn.sum(msg='msg', out='h'), self.apply_func)
def apply_func(self, nodes):
return {'h': nodes.data['h'] * nodes.data['norm']}
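To illustrate the block-diagonal decomposition in RGCNBlockLayer: each relation's weight is a block-diagonal matrix of num_bases smaller blocks, and each block transforms its own slice of the feature vector via a batched matrix multiply. A toy sketch with invented sizes, for a single edge (not part of the example code):
```
import torch

num_bases, submat_in, submat_out = 5, 4, 4  # so in_feat = out_feat = 20
blocks = torch.randn(num_bases, submat_in, submat_out)  # one edge's blocks
h_src = torch.randn(num_bases * submat_in)              # source node feature
# matrix-vector product with a block-diagonal weight matrix: each block
# acts on its own slice of the feature vector
msg = torch.bmm(h_src.view(num_bases, 1, submat_in), blocks).view(-1)
assert msg.shape == (num_bases * submat_out,)
```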
"""
Modeling Relational Data with Graph Convolutional Networks
Paper: https://arxiv.org/abs/1703.06103
Code: https://github.com/MichSchli/RelationPrediction
Difference compared to MichSchli/RelationPrediction
* report raw metrics instead of filtered metrics
"""
import argparse
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from dgl.contrib.data import load_data
from layers import RGCNBlockLayer as RGCNLayer
from model import BaseRGCN
import utils
class EmbeddingLayer(nn.Module):
def __init__(self, num_nodes, h_dim):
super(EmbeddingLayer, self).__init__()
self.embedding = torch.nn.Embedding(num_nodes, h_dim)
def forward(self, g):
node_id = g.ndata['id'].squeeze()
g.ndata['h'] = self.embedding(node_id)
class RGCN(BaseRGCN):
def build_input_layer(self):
return EmbeddingLayer(self.num_nodes, self.h_dim)
def build_hidden_layer(self, idx):
act = F.relu if idx < self.num_hidden_layers - 1 else None
return RGCNLayer(self.h_dim, self.h_dim, self.num_rels, self.num_bases,
activation=act, self_loop=True, dropout=self.dropout)
class LinkPredict(nn.Module):
def __init__(self, in_dim, h_dim, num_rels, num_bases=-1,
num_hidden_layers=1, dropout=0, use_cuda=False, reg_param=0):
super(LinkPredict, self).__init__()
self.rgcn = RGCN(in_dim, h_dim, h_dim, num_rels * 2, num_bases,
num_hidden_layers, dropout, use_cuda)
self.reg_param = reg_param
self.w_relation = nn.Parameter(torch.Tensor(num_rels, h_dim))
nn.init.xavier_uniform_(self.w_relation,
gain=nn.init.calculate_gain('relu'))
def calc_score(self, embedding, triplets):
# DistMult
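        # score(s, r, o) = sum_k e_s[k] * w_r[k] * e_o[k]: a bilinear product
        # with a diagonal relation matrix, stored row-wise in w_relation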
        s = embedding[triplets[:, 0]]
        r = self.w_relation[triplets[:, 1]]
        o = embedding[triplets[:, 2]]
score = torch.sum(s * r * o, dim=1)
return score
def forward(self, g):
return self.rgcn.forward(g)
def evaluate(self, g):
# get embedding and relation weight without grad
embedding = self.forward(g)
return embedding, self.w_relation
def regularization_loss(self, embedding):
return torch.mean(embedding.pow(2)) + torch.mean(self.w_relation.pow(2))
def get_loss(self, g, triplets, labels):
# triplets is a list of data samples (positive and negative)
# each row in the triplets is a 3-tuple of (source, relation, destination)
embedding = self.forward(g)
score = self.calc_score(embedding, triplets)
predict_loss = F.binary_cross_entropy_with_logits(score, labels)
reg_loss = self.regularization_loss(embedding)
return predict_loss + self.reg_param * reg_loss
def main(args):
# load graph data
data = load_data(args.dataset)
num_nodes = data.num_nodes
train_data = data.train
valid_data = data.valid
test_data = data.test
num_rels = data.num_rels
# check cuda
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
if use_cuda:
torch.cuda.set_device(args.gpu)
# create model
model = LinkPredict(num_nodes,
args.n_hidden,
num_rels,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers,
dropout=args.dropout,
use_cuda=use_cuda,
reg_param=args.regularization)
# validation and testing triplets
valid_data = torch.LongTensor(valid_data)
test_data = torch.LongTensor(test_data)
# build test graph
test_graph, test_rel, test_norm = utils.build_test_graph(
num_nodes, num_rels, train_data)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel).view(-1, 1)
test_norm = torch.from_numpy(test_norm).view(-1, 1)
test_graph.ndata.update({'id': test_node_id, 'norm': test_norm})
test_graph.edata['type'] = test_rel
if use_cuda:
model.cuda()
# build adj list and calculate degrees for sampling
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, train_data)
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
model_state_file = 'model_state.pth'
forward_time = []
backward_time = []
# training loop
print("start training...")
epoch = 0
best_mrr = 0
while True:
model.train()
epoch += 1
# perform edge neighborhood sampling to generate training graph and data
g, node_id, edge_type, node_norm, data, labels = \
utils.generate_sampled_graph_and_labels(
train_data, args.graph_batch_size, args.graph_split_size,
num_rels, adj_list, degrees, args.negative_sample)
print("Done edge sampling")
# set node/edge feature
node_id = torch.from_numpy(node_id).view(-1, 1)
edge_type = torch.from_numpy(edge_type).view(-1, 1)
node_norm = torch.from_numpy(node_norm).view(-1, 1)
data, labels = torch.from_numpy(data), torch.from_numpy(labels)
        if use_cuda:
            node_id, edge_type = node_id.cuda(), edge_type.cuda()
            node_norm = node_norm.cuda()
            data, labels = data.cuda(), labels.cuda()
g.ndata.update({'id': node_id, 'norm': node_norm})
g.edata['type'] = edge_type
t0 = time.time()
loss = model.get_loss(g, data, labels)
t1 = time.time()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients
optimizer.step()
t2 = time.time()
forward_time.append(t1 - t0)
backward_time.append(t2 - t1)
print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f} | Forward {:.4f}s | Backward {:.4f}s".
format(epoch, loss.item(), best_mrr, forward_time[-1], backward_time[-1]))
optimizer.zero_grad()
# validation
if epoch % args.evaluate_every == 0:
# perform validation on CPU because full graph is too large
if use_cuda:
model.cpu()
model.eval()
print("start eval")
mrr = utils.evaluate(test_graph, model, valid_data, num_nodes,
hits=[1, 3, 10], eval_bz=args.eval_batch_size)
            # save the best model; stop once validation MRR stops improving
            # after the minimum number of epochs
            if mrr < best_mrr:
                if epoch >= args.n_epochs:
                    break
            else:
                best_mrr = mrr
                torch.save({'state_dict': model.state_dict(), 'epoch': epoch},
                           model_state_file)
if use_cuda:
model.cuda()
print("training done")
print("Mean forward time: {:4f}s".format(np.mean(forward_time)))
print("Mean Backward time: {:4f}s".format(np.mean(backward_time)))
print("\nstart testing:")
# use best model checkpoint
checkpoint = torch.load(model_state_file)
if use_cuda:
model.cpu() # test on CPU
model.eval()
model.load_state_dict(checkpoint['state_dict'])
print("Using best epoch: {}".format(checkpoint['epoch']))
utils.evaluate(test_graph, model, test_data, num_nodes, hits=[1, 3, 10],
eval_bz=args.eval_batch_size)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='RGCN')
parser.add_argument("--dropout", type=float, default=0.2,
help="dropout probability")
parser.add_argument("--n-hidden", type=int, default=500,
help="number of hidden units")
parser.add_argument("--gpu", type=int, default=-1,
help="gpu")
parser.add_argument("--lr", type=float, default=1e-2,
help="learning rate")
parser.add_argument("--n-bases", type=int, default=100,
help="number of weight blocks for each relation")
parser.add_argument("--n-layers", type=int, default=2,
help="number of propagation rounds")
parser.add_argument("--n-epochs", type=int, default=6000,
help="number of minimum training epochs")
parser.add_argument("-d", "--dataset", type=str, required=True,
help="dataset to use")
parser.add_argument("--eval-batch-size", type=int, default=500,
help="batch size when evaluating")
parser.add_argument("--regularization", type=float, default=0.01,
help="regularization weight")
parser.add_argument("--grad-norm", type=float, default=1.0,
help="norm to clip gradient to")
parser.add_argument("--graph-batch-size", type=int, default=30000,
help="number of edges to sample in each iteration")
parser.add_argument("--graph-split-size", type=float, default=0.5,
help="portion of edges used as positive sample")
parser.add_argument("--negative-sample", type=int, default=10,
help="number of negative samples per positive sample")
parser.add_argument("--evaluate-every", type=int, default=500,
help="perform evalution every n epochs")
args = parser.parse_args()
print(args)
main(args)
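As a reminder of what utils.evaluate reports: raw MRR is the mean reciprocal rank of the true entity among all candidates, and Hits@k is the fraction of ranks at or below k. A toy computation (illustrative values only):
```
import torch

ranks = torch.tensor([1., 3., 10.])           # 1-indexed ranks of true triplets
mrr = torch.mean(1.0 / ranks)                 # (1 + 1/3 + 1/10) / 3 ~= 0.478
hits_at_3 = torch.mean((ranks <= 3).float())  # 2 of 3 ranks <= 3 -> 0.667
```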
import torch.nn as nn
class BaseRGCN(nn.Module):
def __init__(self, num_nodes, h_dim, out_dim, num_rels, num_bases=-1,
num_hidden_layers=1, dropout=0, use_cuda=False):
super(BaseRGCN, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = num_bases
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_cuda = use_cuda
# create rgcn layers
self.build_model()
# create initial features
self.features = self.create_features()
def build_model(self):
self.layers = nn.ModuleList()
# i2h
i2h = self.build_input_layer()
if i2h is not None:
self.layers.append(i2h)
# h2h
for idx in range(self.num_hidden_layers):
h2h = self.build_hidden_layer(idx)
self.layers.append(h2h)
# h2o
h2o = self.build_output_layer()
if h2o is not None:
self.layers.append(h2o)
# initialize feature for each node
def create_features(self):
return None
def build_input_layer(self):
return None
    def build_hidden_layer(self, idx):
        raise NotImplementedError
def build_output_layer(self):
return None
def forward(self, g):
if self.features is not None:
g.ndata['id'] = self.features
for layer in self.layers:
layer(g)
return g.ndata.pop('h')
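BaseRGCN is a template: subclasses supply layers through the build_* hooks, and each layer reads and writes node features on the graph in place. A minimal hypothetical subclass (ToyInputLayer, ToyHiddenLayer, and ToyRGCN are invented for illustration; they are not part of the example code):
```
import torch
import torch.nn as nn

class ToyInputLayer(nn.Module):
    """Copies the precomputed features stored under 'id' into 'h'."""
    def forward(self, g):
        g.ndata['h'] = g.ndata['id']

class ToyHiddenLayer(nn.Module):
    def __init__(self, dim):
        super(ToyHiddenLayer, self).__init__()
        self.linear = nn.Linear(dim, dim)
    def forward(self, g):
        g.ndata['h'] = torch.relu(self.linear(g.ndata['h']))

class ToyRGCN(BaseRGCN):
    def create_features(self):
        return torch.randn(self.num_nodes, self.h_dim)
    def build_input_layer(self):
        return ToyInputLayer()
    def build_hidden_layer(self, idx):
        return ToyHiddenLayer(self.h_dim)

# usage: model = ToyRGCN(num_nodes=10, h_dim=8, out_dim=8, num_rels=1)
```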
"""
Utility functions for link prediction
Most code is adapted from authors' implementation of RGCN link prediction:
https://github.com/MichSchli/RelationPrediction
"""
import numpy as np
import torch
import dgl
#######################################################################
#
# Utility function for building training and testing graphs
#
#######################################################################
def get_adj_and_degrees(num_nodes, triplets):
""" Get adjacency list and degrees of the graph
"""
adj_list = [[] for _ in range(num_nodes)]
    for i, triplet in enumerate(triplets):
adj_list[triplet[0]].append([i, triplet[2]])
adj_list[triplet[2]].append([i, triplet[0]])
degrees = np.array([len(a) for a in adj_list])
adj_list = [np.array(a) for a in adj_list]
return adj_list, degrees
def sample_edge_neighborhood(adj_list, degrees, n_triplets, sample_size):
    """ Edge neighborhood sampling to reduce training graph size

    Repeatedly pick a vertex, biased towards vertices adjacent to edges
    already sampled, then take one of its not-yet-picked edges, until
    sample_size edges are collected.
    """
    edges = np.zeros(sample_size, dtype=np.int32)
    # initialize: remaining edge budget per vertex, which edges are already
    # picked, and which vertices have been seen
    sample_counts = degrees.copy()
    picked = np.zeros(n_triplets, dtype=bool)
    seen = np.zeros(len(degrees), dtype=bool)
for i in range(0, sample_size):
        # prefer vertices already adjacent to the sample; fall back to a
        # uniform choice over vertices that still have edges left
        weights = sample_counts * seen
        if np.sum(weights) == 0:
            weights = np.ones_like(weights)
            weights[np.where(sample_counts == 0)] = 0
        probabilities = weights / np.sum(weights)
chosen_vertex = np.random.choice(np.arange(degrees.shape[0]),
p=probabilities)
chosen_adj_list = adj_list[chosen_vertex]
seen[chosen_vertex] = True
chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
chosen_edge = chosen_adj_list[chosen_edge]
edge_number = chosen_edge[0]
while picked[edge_number]:
chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
chosen_edge = chosen_adj_list[chosen_edge]
edge_number = chosen_edge[0]
edges[i] = edge_number
other_vertex = chosen_edge[1]
picked[edge_number] = True
sample_counts[chosen_vertex] -= 1
sample_counts[other_vertex] -= 1
seen[other_vertex] = True
return edges
def generate_sampled_graph_and_labels(triplets, sample_size, split_size,
num_rels, adj_list, degrees,
negative_rate):
"""Get training graph and signals
First perform edge neighborhood sampling on graph, then perform negative
sampling to generate negative samples
"""
# perform edge neighbor sampling
edges = sample_edge_neighborhood(adj_list, degrees, len(triplets),
sample_size)
# relabel nodes to have consecutive node ids
edges = triplets[edges]
src, rel, dst = edges.transpose()
uniq_v, edges = np.unique((src, dst), return_inverse=True)
src, dst = np.reshape(edges, (2, -1))
relabeled_edges = np.stack((src, rel, dst)).transpose()
# negative sampling
samples, labels = negative_sampling(relabeled_edges, len(uniq_v),
negative_rate)
# further split graph, only half of the edges will be used as graph
# structure, while the rest half is used as unseen positive samples
split_size = int(sample_size * split_size)
graph_split_ids = np.random.choice(np.arange(sample_size),
size=split_size, replace=False)
src = src[graph_split_ids]
dst = dst[graph_split_ids]
rel = rel[graph_split_ids]
# build DGL graph
print("# sampled nodes: {}".format(len(uniq_v)))
print("# sampled edges: {}".format(len(src) * 2))
g, rel, norm = build_graph_from_triplets(len(uniq_v), num_rels,
(src, rel, dst))
return g, uniq_v, rel, norm, samples, labels
def comp_deg_norm(g):
    """Normalization factor: reciprocal of in-degree, 0 for isolated nodes."""
    in_deg = g.in_degrees(range(g.number_of_nodes())).float().numpy()
    with np.errstate(divide='ignore'):
        norm = 1.0 / in_deg
    norm[np.isinf(norm)] = 0
    return norm
def build_graph_from_triplets(num_nodes, num_rels, triplets):
""" Create a DGL graph. The graph is bidirectional because RGCN authors
use reversed relations.
This function also generates edge type and normalization factor
(reciprocal of node incoming degree)
"""
g = dgl.DGLGraph()
g.add_nodes(num_nodes)
src, rel, dst = triplets
src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
rel = np.concatenate((rel, rel + num_rels))
edges = sorted(zip(dst, src, rel))
dst, src, rel = np.array(edges).transpose()
g.add_edges(src, dst)
norm = comp_deg_norm(g)
print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
return g, rel, norm
def build_test_graph(num_nodes, num_rels, edges):
src, rel, dst = edges.transpose()
print("Test graph:")
return build_graph_from_triplets(num_nodes, num_rels, (src, rel, dst))
def negative_sampling(pos_samples, num_entity, negative_rate):
size_of_batch = len(pos_samples)
num_to_generate = size_of_batch * negative_rate
neg_samples = np.tile(pos_samples, (negative_rate, 1))
labels = np.zeros(size_of_batch * (negative_rate + 1), dtype=np.float32)
labels[: size_of_batch] = 1
values = np.random.randint(num_entity, size=num_to_generate)
choices = np.random.uniform(size=num_to_generate)
subj = choices > 0.5
obj = choices <= 0.5
neg_samples[subj, 0] = values[subj]
neg_samples[obj, 2] = values[obj]
return np.concatenate((pos_samples, neg_samples)), labels
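# e.g. with negative_rate=2, a batch of B positive triplets yields 3B samples:
# the B originals (label 1) followed by 2B corrupted copies (label 0), where
# each copy has either its subject or its object replaced by a random entity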
#######################################################################
#
# Utility function for evaluations
#
#######################################################################
def sort_and_rank(score, target):
_, indices = torch.sort(score, dim=1, descending=True)
indices = torch.nonzero(indices == target.view(-1, 1))
indices = indices[:, 1].view(-1)
return indices
def perturb_and_get_rank(embedding, w, a, r, b, num_entity, batch_size=100):
""" Perturb one element in the triplets
"""
n_batch = (num_entity + batch_size - 1) // batch_size
ranks = []
for idx in range(n_batch):
print("batch {} / {}".format(idx, n_batch))
batch_start = idx * batch_size
batch_end = min(num_entity, (idx + 1) * batch_size)
batch_a = a[batch_start: batch_end]
batch_r = r[batch_start: batch_end]
emb_ar = embedding[batch_a] * w[batch_r]
emb_ar = emb_ar.transpose(0, 1).unsqueeze(2) # size: D x E x 1
emb_c = embedding.transpose(0, 1).unsqueeze(1) # size: D x 1 x V
# out-prod and reduce sum
out_prod = torch.bmm(emb_ar, emb_c) # size D x E x V
score = torch.sum(out_prod, dim=0) # size E x V
score = torch.sigmoid(score)
target = b[batch_start: batch_end]
ranks.append(sort_and_rank(score, target))
return torch.cat(ranks)
# TODO (lingfan): implement filtered metrics
# return MRR (raw), and Hits @ (1, 3, 10)
def evaluate(test_graph, model, test_triplets, num_entity, hits=[], eval_bz=100):
with torch.no_grad():
embedding, w = model.evaluate(test_graph)
s = test_triplets[:, 0]
r = test_triplets[:, 1]
o = test_triplets[:, 2]
# perturb subject
ranks_s = perturb_and_get_rank(embedding, w, o, r, s, num_entity, eval_bz)
# perturb object
ranks_o = perturb_and_get_rank(embedding, w, s, r, o, num_entity, eval_bz)
ranks = torch.cat([ranks_s, ranks_o])
ranks += 1 # change to 1-indexed
mrr = torch.mean(1.0 / ranks.float())
print("MRR (raw): {:.6f}".format(mrr.item()))
for hit in hits:
avg_count = torch.mean((ranks <= hit).float())
print("Hits (raw) @ {}: {:.6f}".format(hit, avg_count.item()))
return mrr.item()