"""Sample conference-paper-author metapath random walks with DGL.

The dataset directory is taken from the first command-line argument; the
sampled walks are written to ``output_path.txt`` inside that directory.
"""
import os
import random
import sys
import time

import dgl
import numpy as np
import tqdm

# Number of random walks started from every conference node.
num_walks_per_node = 1000
# Number of times the 4-edge metapath is repeated in a single walk.
walk_length = 100
# Dataset directory; evaluated at import time, so importing or running this
# module without a command-line argument raises IndexError.
path = sys.argv[1]


def _read_id_name_file(filename, name_from_fields):
    """Read an ``<id> <name tokens...>`` table stored under ``path``.

    Every non-blank line is split on whitespace. The first field is parsed as
    the integer id and ``name_from_fields`` maps the full field list to the
    name recorded for that row.

    Returns:
        ``(ids, names)``: two parallel lists in file order.
    """
    ids = []
    names = []
    # ``with`` closes the handle even when a malformed row raises.
    with open(os.path.join(path, filename), encoding="ISO-8859-1") as table:
        for line in table:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a stray newline at end of file)
                # instead of failing with IndexError on ``fields[0]``.
                continue
            ids.append(int(fields[0]))
            names.append(name_from_fields(fields))
    return ids, names


def _read_edge_file(filename, src_index, dst_index):
    """Read a tab-separated ``<src id>\\t<dst id>`` edge list under ``path``.

    Raw ids are translated to node indices through ``src_index`` and
    ``dst_index``; an id missing from either mapping raises ``KeyError``.

    Returns:
        ``(src, dst)``: two parallel lists of node indices in file order.
    """
    src = []
    dst = []
    with open(os.path.join(path, filename), "r") as edges:
        for line in edges:
            if not line.strip():
                # Skip blank lines rather than failing in ``int("")``.
                continue
            columns = line.split("\t")
            src.append(src_index[int(columns[0])])
            dst.append(dst_index[int(columns[1].strip("\n"))])
    return src, dst


def construct_graph():
    """Build the paper/author/conference heterograph from the files in ``path``.

    Reads ``id_author.txt``, ``id_conf.txt`` and ``paper.txt`` for the node
    tables, and ``paper_author.txt`` and ``paper_conf.txt`` for the edges.
    A node's index is the position of its row in the corresponding node table,
    so ``author_names[i]`` is the name of author node ``i`` (likewise for
    conferences and papers).

    Returns:
        ``(hg, author_names, conf_names, paper_names)`` where ``hg`` is a DGL
        heterograph with edge types ``pa``/``ap`` (paper <-> author) and
        ``pc``/``cp`` (paper <-> conf).
    """
    author_ids, author_names = _read_id_name_file(
        "id_author.txt", lambda fields: fields[1]
    )
    conf_ids, conf_names = _read_id_name_file(
        "id_conf.txt", lambda fields: fields[1]
    )
    # Paper titles span several tokens; they are concatenated without
    # separators and prefixed with "p".
    paper_ids, paper_names = _read_id_name_file(
        "paper.txt", lambda fields: "p" + "".join(fields[1:])
    )

    # Map raw dataset ids to contiguous node indices.
    author_ids_invmap = {x: i for i, x in enumerate(author_ids)}
    conf_ids_invmap = {x: i for i, x in enumerate(conf_ids)}
    paper_ids_invmap = {x: i for i, x in enumerate(paper_ids)}

    paper_author_src, paper_author_dst = _read_edge_file(
        "paper_author.txt", paper_ids_invmap, author_ids_invmap
    )
    paper_conf_src, paper_conf_dst = _read_edge_file(
        "paper_conf.txt", paper_ids_invmap, conf_ids_invmap
    )

    # Each relation is added in both directions so walks can traverse it
    # either way.
    hg = dgl.heterograph(
        {
            ("paper", "pa", "author"): (paper_author_src, paper_author_dst),
            ("author", "ap", "paper"): (paper_author_dst, paper_author_src),
            ("paper", "pc", "conf"): (paper_conf_src, paper_conf_dst),
            ("conf", "cp", "paper"): (paper_conf_dst, paper_conf_src),
        }
    )
    return hg, author_names, conf_names, paper_names


# "conference - paper - Author - paper - conference" metapath sampling
def generate_metapath():
    """Sample metapath random walks and write them to ``output_path.txt``.

    For every conference node, ``num_walks_per_node`` walks are started, each
    following the ``cp -> pa -> ap -> pc`` metapath repeated ``walk_length``
    times. One line is written per walk, containing the space-separated
    conference and author names visited; paper nodes are skipped.
    """
    # Build the graph before opening the output so that a failure while
    # reading the inputs does not truncate an existing output file.
    hg, author_names, conf_names, _paper_names = construct_graph()

    # ``with`` closes (and flushes) the output even if sampling raises.
    with open(os.path.join(path, "output_path.txt"), "w") as output_file:
        for conf_idx in tqdm.trange(hg.number_of_nodes("conf")):
            traces, _ = dgl.sampling.random_walk(
                hg,
                [conf_idx] * num_walks_per_node,
                metapath=["cp", "pa", "ap", "pc"] * walk_length,
            )
            for tr in traces:
                visited = []
                # Even positions alternate conf (i % 4 == 0) and author
                # (i % 4 == 2); odd positions are papers and are skipped.
                for i in range(0, len(tr), 2):
                    node = int(tr[i])
                    if node < 0:
                        # random_walk pads walks that stopped early with -1;
                        # using it as an index would silently emit the *last*
                        # name in the list, so stop at the first padded entry.
                        break
                    names = conf_names if i % 4 == 0 else author_names
                    visited.append(names[node])
                print(" ".join(visited), file=output_file)


# Script entry point: run the sampler only when executed directly.
if __name__ == "__main__":
    generate_metapath()