import datetime
import dgl
import errno
import numpy as np
import os
import pickle
import random
import torch

from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
from scipy import sparse
from scipy import io as sio


def set_random_seed(seed=0):
    """Set random seed.

    Parameters
    ----------
    seed : int
        Random seed to use
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


def mkdir_p(path, log=True):
    """Create a directory for the specified path.

    Parameters
    ----------
    path : str
        Path name
    log : bool
        Whether to print result for directory creation
    """
    try:
        os.makedirs(path)
        if log:
            print('Created directory {}'.format(path))
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path) and log:
            print('Directory {} already exists.'.format(path))
        else:
            raise


def get_date_postfix():
    """Get a date based postfix for directory name.

    Returns
    -------
    post_fix : str
    """
    dt = datetime.datetime.now()
    post_fix = '{}_{:02d}-{:02d}-{:02d}'.format(
        dt.date(), dt.hour, dt.minute, dt.second)
    return post_fix


def setup_log_dir(args, sampling=False):
    """Name and create directory for logging.

    Parameters
    ----------
    args : dict
        Configuration
    sampling : bool
        Whether we are using sampling based training

    Returns
    -------
    log_dir : str
        Path for logging directory
    """
    date_postfix = get_date_postfix()
    log_dir = os.path.join(
        args['log_dir'],
        '{}_{}'.format(args['dataset'], date_postfix))

    if sampling:
        log_dir = log_dir + '_sampling'

    mkdir_p(log_dir)
    return log_dir


# The configuration below is from the paper.
default_configure = {
    'lr': 0.005,              # Learning rate
    'num_heads': [8],         # Number of attention heads for node-level attention
    'hidden_units': 8,
    'dropout': 0.6,
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100
}

sampling_configure = {
    'batch_size': 20
}


def setup(args):
    args.update(default_configure)
    set_random_seed(args['seed'])
    args['dataset'] = 'ACMRaw' if args['hetero'] else 'ACM'
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args)
    return args


def setup_for_sampling(args):
    args.update(default_configure)
    args.update(sampling_configure)
    set_random_seed()
    args['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args, sampling=True)
    return args


def get_binary_mask(total_size, indices):
    mask = torch.zeros(total_size)
    mask[indices] = 1
    return mask.byte()

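
# Illustrative sketch (not part of the original utilities): how the seed and
# mask helpers above are typically combined. The indices below are made up
# for demonstration only.
def _example_binary_mask():
    set_random_seed(0)
    example_indices = torch.tensor([0, 2, 5])
    # 10-element byte mask with positions 0, 2 and 5 set to 1
    return get_binary_mask(10, example_indices)
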

def load_acm(remove_self_loop):
    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    download(_get_dgl_url(url), path=data_path)

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.graph(data['PAP'], ntype='paper', etype='author')
    subject_g = dgl.graph(data['PLP'], ntype='paper', etype='subject')
    gs = [author_g, subject_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']       # paper-field?
    p_vs_a = data['PvsA']       # paper-author
    p_vs_t = data['PvsT']       # paper-term, bag of words
    p_vs_c = data['PvsC']       # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    pa = dgl.bipartite(p_vs_a, 'paper', 'pa', 'author')
    ap = dgl.bipartite(p_vs_a.transpose(), 'author', 'ap', 'paper')
    pl = dgl.bipartite(p_vs_l, 'paper', 'pf', 'field')
    lp = dgl.bipartite(p_vs_l.transpose(), 'field', 'fp', 'paper')
    hg = dgl.hetero_from_relations([pa, ap, pl, lp])

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask


def load_data(dataset, remove_self_loop=False):
    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    elif dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    else:
        raise NotImplementedError('Unsupported dataset {}'.format(dataset))

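
# Illustrative sketch (not part of the original utilities): a typical call
# sequence for the loaders above. The `args` keys mirror what `setup` expects;
# in the original example `seed`, `hetero` and `log_dir` come from the command
# line, so the literal values below are placeholders.
def _example_load():
    args = setup({'seed': 0, 'hetero': True, 'log_dir': 'results'})
    g, features, labels, num_classes, train_idx, val_idx, test_idx, \
        train_mask, val_mask, test_mask = load_data(args['dataset'])
    return g, features, labels
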

class EarlyStopping(object):
    def __init__(self, patience=10):
        dt = datetime.datetime.now()
        self.filename = 'early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            dt.date(), dt.hour, dt.minute, dt.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        if self.best_loss is None:
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
        elif (loss > self.best_loss) and (acc < self.best_acc):
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            if (loss <= self.best_loss) and (acc >= self.best_acc):
                self.save_checkpoint(model)
            self.best_loss = np.min((loss, self.best_loss))
            self.best_acc = np.max((acc, self.best_acc))
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Saves model when validation loss decreases."""
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the latest checkpoint."""
        model.load_state_dict(torch.load(self.filename))

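
# Illustrative sketch (not part of the original utilities): how EarlyStopping
# is typically driven from a training loop. `model`, `g`, `features`, `labels`,
# the masks, `loss_fn` and `optimizer` are hypothetical stand-ins for the
# caller's objects.
def _example_early_stopping(model, g, features, labels, train_mask, val_mask,
                            loss_fn, optimizer, num_epochs=200):
    stopper = EarlyStopping(patience=100)
    for epoch in range(num_epochs):
        model.train()
        logits = model(g, features)
        loss = loss_fn(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validation loss and accuracy, computed however the caller prefers
        model.eval()
        with torch.no_grad():
            logits = model(g, features)
            val_loss = loss_fn(logits[val_mask], labels[val_mask])
            val_acc = (logits[val_mask].argmax(1) == labels[val_mask]).float().mean()

        # step() returns True once neither loss nor accuracy has improved
        # for `patience` consecutive epochs
        if stopper.step(val_loss.item(), val_acc.item(), model):
            break

    # Restore the best weights observed during training
    stopper.load_checkpoint(model)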