import random
import time

Metapath = "Conference-Paper-Author-Paper-Conference"
num_walks_per_node = 1000
walk_length = 100


# construct mapping from text, could be changed to DGL later
def construct_id_dict():
    id_to_paper = {}
    id_to_author = {}
    id_to_conf = {}
    f_3 = open(".../id_author.txt", encoding="ISO-8859-1")
    f_4 = open(".../id_conf.txt", encoding="ISO-8859-1")
    f_5 = open(".../paper.txt", encoding="ISO-8859-1")
    # author id -> author name
    for z in f_3:
        z = z.split('\t')
        identity = int(z[0])
        id_to_author[identity] = z[1].strip("\n")
    # conference id -> conference name
    for w in f_4:
        w = w.split('\t')
        identity = int(w[0])
        id_to_conf[identity] = w[1].strip("\n")
    # paper id -> paper name (prefixed with the type indicator 'p')
    for v in f_5:
        v = v.split(' ')
        identity = int(v[0])
        paper_name = ""
        # concatenate the remaining fields (from index 5 on) into the paper name
        for s in range(5, len(v)):
            paper_name += v[s]
        paper_name = 'p' + paper_name
        id_to_paper[identity] = paper_name.strip('\n')
    f_3.close()
    f_4.close()
    f_5.close()
    return id_to_paper, id_to_author, id_to_conf


# construct mapping from text, could be changed to DGL later
def construct_types_mappings():
    paper_to_author = {}
    author_to_paper = {}
    paper_to_conf = {}
    conf_to_paper = {}
    f_1 = open(".../paper_author.txt", "r")
    f_2 = open(".../paper_conf.txt", "r")
    # paper <-> author adjacency lists
    for x in f_1:
        x = x.split('\t')
        paper_id = int(x[0])
        author_id = int(x[1].strip('\n'))
        paper_to_author.setdefault(paper_id, []).append(author_id)
        author_to_paper.setdefault(author_id, []).append(paper_id)
    # paper <-> conference adjacency lists
    for y in f_2:
        y = y.split('\t')
        paper_id = int(y[0])
        conf_id = int(y[1].strip('\n'))
        paper_to_conf.setdefault(paper_id, []).append(conf_id)
        conf_to_paper.setdefault(conf_id, []).append(paper_id)
    f_1.close()
    f_2.close()
    return paper_to_author, author_to_paper, paper_to_conf, conf_to_paper


# "conference - paper - author - paper - conference" metapath sampling
def generate_metapath():
    output_path = open(".../output_path.txt", "w")
    id_to_paper, id_to_author, id_to_conf = construct_id_dict()
    paper_to_author, author_to_paper, paper_to_conf, conf_to_paper = construct_types_mappings()
    count = 0
    # loop over all conferences
    for conf_id in conf_to_paper.keys():
        start_time = time.time()
        print("sampling " + str(count))
        conf = id_to_conf[conf_id]
        conf0 = conf
        # for each conference, simulate num_walks_per_node walks
        for i in range(num_walks_per_node):
            # each written walk starts with the original conference name; note that
            # conf_id itself is not reset, so the walk continues from the conference
            # reached at the end of the previous walk
            outline = conf0
            # each walk has length walk_length
            for j in range(walk_length):
                # C - P
                paper_list_1 = conf_to_paper[conf_id]
                # keep only the papers that link to at least one author node
                connections_1 = False
                available_paper_1 = [p for p in paper_list_1 if p in paper_to_author]
                num_p_1 = len(available_paper_1)
                if num_p_1 != 0:
                    connections_1 = True
                    paper_1_index = random.randrange(num_p_1)
                    paper_id_1 = available_paper_1[paper_1_index]
                    paper_1 = id_to_paper[paper_id_1]
                    outline += " " + paper_1
                else:
                    break

                # C - P - A
                author_list = paper_to_author[paper_id_1]
                num_a = len(author_list)
                # no availability check needed: every author has at least one paper
                author_index = random.randrange(num_a)
                author_id = author_list[author_index]
                author = id_to_author[author_id]
                outline += " " + author

                # C - P - A - P
                paper_list_2 = author_to_paper[author_id]
                # keep only the papers that link to at least one conference node
                connections_2 = False
                available_paper_2 = [p for p in paper_list_2 if p in paper_to_conf]
                num_p_2 = len(available_paper_2)
                if num_p_2 != 0:
                    connections_2 = True
                    paper_2_index = random.randrange(num_p_2)
                    paper_id_2 = available_paper_2[paper_2_index]
                    paper_2 = id_to_paper[paper_id_2]
                    outline += " " + paper_2
                else:
                    break

                # C - P - A - P - C
                conf_list = paper_to_conf[paper_id_2]
                num_c = len(conf_list)
                conf_index = random.randrange(num_c)
                conf_id = conf_list[conf_index]
                conf = id_to_conf[conf_id]
                outline += " " + conf
            # only keep walks that never hit a dead end; otherwise stop sampling
            # from this conference
            if connections_1 and connections_2:
                output_path.write(outline + "\n")
            else:
                break
        # Note that the original mapping text has a type indicator in front of each node, e.g. "cVLDB",
        # so the sampled sequence looks like "cconference ppaper aauthor ppaper cconference"
        count += 1
        print("--- %s seconds ---" % (time.time() - start_time))
    output_path.close()


if __name__ == "__main__":
    generate_metapath()
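

# The helper below is a minimal sketch, not part of the original script: it only
# previews the first few generated walks so the type-prefix pattern described above
# ("cconference ppaper aauthor ppaper cconference") can be eyeballed. It assumes the
# same ".../output_path.txt" placeholder path used in generate_metapath() and that
# node names contain no internal whitespace.
def preview_output(path=".../output_path.txt", num_lines=3):
    with open(path, encoding="ISO-8859-1") as f:
        for i, line in enumerate(f):
            if i >= num_lines:
                break
            tokens = line.split()
            # first character of each token is its type indicator: 'c', 'p' or 'a'
            type_pattern = "".join(token[0] for token in tokens)
            print("walk %d: %d tokens, type pattern %s" % (i, len(tokens), type_pattern[:20]))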