Unverified commit f9fd3107 authored by Hao Xiong, committed by GitHub

[Example] Experimental results over ogbl-ddi (#1707)



* ogb-deepwalk

* update readme

* update readme

* update readme

* update readme

* ogbl-ddi

* readme
Co-authored-by: xiang song (charlie.song) <classicxsong@gmail.com>
parent 45e1333e
@@ -6,20 +6,29 @@ python3 load_dataset.py --name ogbl-collab
## Evaluation
For evaluation we follow the code provided by ogb [here](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/collab/mlp.py).
## Used config
ogbl-collab
```
python3 deepwalk.py --data_file ogbl-collab-net.txt --save_in_pt --output_emb_file embedding.pt --num_walks 50 --window_size 20 --walk_length 40 --lr 0.1 --negative 1 --neg_weight 1 --lap_norm 0.005 --mix --adam --gpus 0 1 2 3 --num_threads 4 --print_interval 2000 --print_loss --batch_size 32
cd ./ogb/examples/linkproppred/collab/
cp embedding_pt_file_path ./
python3 mlp.py --device 0 --runs 10 --use_node_embedding
```
ogbl-ddi
```
python3 deepwalk.py --data_file ogbl-ddi-net.txt --save_in_pt --output_emb_file ddi-embedding.pt --num_walks 50 --window_size 2 --walk_length 80 --lr 0.1 --negative 1 --neg_weight 1 --lap_norm 0.05 --only_gpu --adam --gpus 0 --num_threads 4 --print_interval 2000 --print_loss --batch_size 16 --use_context_weight
cd ./ogb/examples/linkproppred/ddi/
cp embedding_pt_file_path ./
python3 mlp.py --device 0 --runs 5
```
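Both runs write the embedding with `--save_in_pt`, so the resulting file can be sanity-checked before copying it into the ogb example directory. A minimal sketch (the exact shape depends on your graph and the embedding dimension you trained with, so the printed value is only an assumption):
```
import torch

# Load the DeepWalk embedding saved by deepwalk.py --save_in_pt.
emb = torch.load("ddi-embedding.pt")
print(emb.shape)  # expected: (num_nodes, embedding_dim)
```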
## Score
ogbl-collab
<br>Hits@10
<br>&emsp;Highest Train: 74.83 ± 4.79
<br>&emsp;Highest Valid: 40.03 ± 2.98
<br>&emsp;&emsp;Final Train: 74.51 ± 4.92
@@ -33,4 +42,22 @@ Hits@10
<br>&emsp;Highest Train: 99.86 ± 0.04
<br>&emsp;Highest Valid: 66.64 ± 0.32
<br>&emsp;&emsp;Final Train: 99.84 ± 0.06
<br>&emsp;&emsp;Final Test: 56.88 ± 0.37
<br>ogbl-ddi
<br>Hits@10
<br>&emsp;Highest Train: 35.05 ± 3.68
<br>&emsp;Highest Valid: 31.72 ± 3.52
<br>&emsp;&emsp;Final Train: 35.05 ± 3.68
<br>&emsp;&emsp;Final Test: 12.68 ± 3.19
<br>Hits@20
<br>&emsp;Highest Train: 44.85 ± 1.26
<br>&emsp;Highest Valid: 41.20 ± 1.41
<br>&emsp;&emsp;Final Train: 44.85 ± 1.26
<br>&emsp;&emsp;Final Test: 21.69 ± 3.14
<br>Hits@30
<br>&emsp;Highest Train: 52.28 ± 1.21
<br>&emsp;Highest Valid: 48.49 ± 1.09
<br>&emsp;&emsp;Final Train: 52.28 ± 1.21
<br>&emsp;&emsp;Final Test: 29.13 ± 3.46
@@ -58,6 +58,8 @@ class DeepwalkTrainer:
avg_sgd=self.args.avg_sgd,
fast_neg=self.args.fast_neg,
record_loss=self.args.print_loss,
norm=self.args.norm,
use_context_weight=self.args.use_context_weight,
)
torch.set_num_threads(self.args.num_threads)
@@ -153,7 +155,7 @@ class DeepwalkTrainer:
if i > 0 and i % self.args.print_interval == 0:
if self.args.print_loss:
print("Solver [%d] batch %d tt: %.2fs loss: %.4f" \
% (gpu_id, i, time.time()-start, -sum(self.emb_model.loss)/self.args.print_interval))
self.emb_model.loss = []
else:
print("Solver [%d] batch %d tt: %.2fs" % (gpu_id, i, time.time()-start))
@@ -209,7 +211,7 @@ class DeepwalkTrainer:
if i > 0 and i % self.args.print_interval == 0:
if self.args.print_loss:
print("Batch %d training time: %.2fs loss: %.4f" \
% (i, time.time()-start, -sum(self.emb_model.loss)/self.args.print_interval))
self.emb_model.loss = []
else:
print("Batch %d, training time: %.2fs" % (i, time.time()-start))
@@ -273,6 +275,10 @@ if __name__ == '__main__':
help="use sgd for embedding updation")
parser.add_argument('--avg_sgd', default=False, action="store_true",
help="average gradients of sgd for embedding updation")
parser.add_argument('--norm', default=False, action="store_true",
help="whether to do normalization over node embedding after training")
parser.add_argument('--use_context_weight', default=False, action="store_true",
help="whether to add weights over nodes in the context window")
parser.add_argument('--num_threads', default=2, type=int,
help="number of threads used for each CPU-core/GPU")
parser.add_argument('--gpus', type=int, default=[-1], nargs='+',
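The two new flags are plain boolean switches. A hypothetical minimal invocation combining them (argument values are illustrative only; the flags shown are the ones evidenced in this commit and the README above):
```
python3 deepwalk.py --data_file ogbl-ddi-net.txt --save_in_pt --output_emb_file ddi-embedding.pt --norm --use_context_weight --adam --gpus 0
```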
@@ -13,8 +13,20 @@ name = args.name
dataset = PygLinkPropPredDataset(name=name)
data = dataset[0]
try:
    # ogbl-collab provides edge weights; ogbl-ddi does not
    weighted = data.edge_weight is not None
except AttributeError:
    weighted = False
with open(name + "-net.txt", "w") as f:
    for i in range(data.edge_index.shape[1]):
        if weighted:
            f.write(str(data.edge_index[0][i].item()) + " "
                    + str(data.edge_index[1][i].item()) + " "
                    + str(data.edge_weight[i].item()) + "\n")
        else:
            # unweighted graphs get a constant weight of 1
            f.write(str(data.edge_index[0][i].item()) + " "
                    + str(data.edge_index[1][i].item()) + " "
                    + "1\n")
@@ -73,11 +73,7 @@ def init_emb2neg_index(walk_length, window_size, negative, batch_size):
return index_emb_negu, index_emb_negv
def init_grad_avg(walk_length, window_size, batch_size):
    ''' averaging gradients by specific weights '''
grad_avg = []
for b in range(batch_size):
@@ -92,6 +88,22 @@ def init_grad_avg(walk_length, window_size, batch_size):
# [num_pos * batch_size]
return torch.Tensor(grad_avg).unsqueeze(1)
def init_weight(walk_length, window_size, batch_size):
    ''' compute linearly decaying weights for the context nodes of each center node '''
weight = []
for b in range(batch_size):
for i in range(walk_length):
for j in range(i-window_size, i):
if j >= 0:
weight.append(1. - float(i - j - 1)/float(window_size))
for j in range(i + 1, i + 1 + window_size):
if j < walk_length:
weight.append(1. - float(j - i - 1)/float(window_size))
# [num_pos * batch_size]
return torch.Tensor(weight).unsqueeze(1)
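As a sanity check on `init_weight` (a toy configuration, not part of the commit): a context node directly adjacent to the center gets weight 1.0, and each additional step away subtracts `1/window_size`:
```
# Assumes init_weight from this module is in scope.
# walk_length=3, window_size=2, batch_size=1:
print(init_weight(3, 2, 1).squeeze(1))
# tensor([1.0000, 0.5000, 1.0000, 1.0000, 0.5000, 1.0000])
```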
def init_empty_grad(emb_dimension, walk_length, batch_size):
""" initialize gradient matrix """
grad_u = torch.zeros((batch_size * walk_length, emb_dimension))
@@ -131,6 +143,8 @@ class SkipGramModel(nn.Module):
avg_sgd,
fast_neg,
record_loss,
norm,
use_context_weight,
):
""" initialize embedding on CPU
@@ -171,6 +185,8 @@ class SkipGramModel(nn.Module):
self.avg_sgd = avg_sgd
self.fast_neg = fast_neg
self.record_loss = record_loss
self.norm = norm
self.use_context_weight = use_context_weight
# initialize the device as cpu
self.device = torch.device("cpu")
@@ -206,6 +222,12 @@ class SkipGramModel(nn.Module):
self.negative,
self.batch_size)
if self.use_context_weight:
self.context_weight = init_weight(
self.walk_length,
self.window_size,
self.batch_size)
# coefficients for averaging the gradients
if self.avg_sgd:
self.grad_avg = init_grad_avg(
@@ -247,6 +269,8 @@ class SkipGramModel(nn.Module):
self.grad_v = self.grad_v.to(self.device)
if self.avg_sgd:
self.grad_avg = self.grad_avg.to(self.device)
if self.use_context_weight:
self.context_weight = self.context_weight.to(self.device)
def all_to_device(self, gpu_id):
""" move all of the parameters to a single GPU """
@@ -340,6 +364,18 @@ class SkipGramModel(nn.Module):
else:
grad_u_pos = score * emb_pos_v
grad_v_pos = score * emb_pos_u
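# scale the positive-pair gradients by their distance-based context weights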
if self.use_context_weight:
if bs < self.batch_size:
context_weight = init_weight(
self.walk_length,
self.window_size,
bs).to(self.device)
else:
context_weight = self.context_weight
grad_u_pos *= context_weight
grad_v_pos *= context_weight
# [batch_size * walk_length, dim]
if bs < self.batch_size:
grad_u, grad_v = init_empty_grad(
@@ -453,6 +489,8 @@ class SkipGramModel(nn.Module):
file_name str : the file name
"""
embedding = self.u_embeddings.weight.cpu().data.numpy()
if self.norm:
embedding /= np.sqrt(np.sum(embedding * embedding, 1)).reshape(-1, 1)
np.save(file_name, embedding)
def save_embedding_pt(self, dataset, file_name):
@@ -462,6 +500,8 @@ class SkipGramModel(nn.Module):
assert max(dataset.node2id.keys()) == self.emb_size - 1, "The node id does not start from 0, saving embedding failed."
index = torch.LongTensor(list(map(lambda node: dataset.node2id[node], list(range(self.emb_size)))))
embedding = torch.index_select(embedding, 0, index)
if self.norm:
embedding /= torch.sqrt(torch.sum(embedding.mul(embedding), 1)).unsqueeze(1)
torch.save(embedding, file_name)
def save_embedding_txt(self, dataset, file_name):
@@ -473,6 +513,8 @@ class SkipGramModel(nn.Module):
file_name str : the file name
"""
embedding = self.u_embeddings.weight.cpu().data.numpy()
if self.norm:
embedding /= np.sqrt(np.sum(embedding * embedding, 1)).reshape(-1, 1)
with open(file_name, 'w') as f:
f.write('%d %d\n' % (self.emb_size, self.emb_dimension))
for wid in range(self.emb_size):
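For reference, the `--norm` branch added above performs row-wise L2 normalization of the embedding matrix. A self-contained sketch of the same operation on a hypothetical embedding (the array here is random, purely for illustration):
```
import numpy as np

emb = np.random.randn(5, 8)  # hypothetical (num_nodes, dim) embedding
# Divide each row by its Euclidean norm, as the --norm branch does.
emb /= np.sqrt(np.sum(emb * emb, 1)).reshape(-1, 1)
print(np.linalg.norm(emb, axis=1))  # all ~1.0
```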