help='Save folder name if any special information is to be included.')
### General Training Parameters
# NOTE(review): in the original file this option block was duplicated wholesale
# (every add_argument appeared two or three times, e.g. --kernels three times and
# the whole second half repeated the first). argparse raises
# "ArgumentError: conflicting option string" on the second registration, so the
# script could not start. Exactly one copy of each option is kept below. Where
# the duplicates disagreed on a default (--decay: 0.001 vs 0.0004) the later
# copy's value was kept -- TODO confirm the intended default.
parser.add_argument('--kernels',default=8,type=int,help='Number of workers for pytorch dataloader.')
parser.add_argument('--bs',default=112,type=int,help='Mini-Batchsize to use.')
parser.add_argument('--samples_per_class',default=4,type=int,help='Number of samples in one class drawn before choosing the next class. Set to >1 for losses other than ProxyNCA.')
####### Main Parameter: Dataset to use for Training
parser.add_argument('--dataset',default='Inaturalist',type=str,help='Dataset to use.',choices=['Inaturalist','semi_fungi'])
### General Training Parameters
parser.add_argument('--lr',default=0.00001,type=float,help='Learning Rate for network parameters.')
parser.add_argument('--fc_lr_mul',default=5,type=float,help='OPTIONAL: Multiply the embedding layer learning rate by this value. If set to 0, the embedding layer shares the same learning rate.')
parser.add_argument('--n_epochs',default=400,type=int,help='Number of training epochs.')
parser.add_argument('--seed',default=1,type=int,help='Random seed for reproducibility.')
parser.add_argument('--scheduler',default='step',type=str,help='Type of learning rate scheduling. Currently: step & exp.')
parser.add_argument('--gamma',default=0.3,type=float,help='Learning rate reduction after tau epochs.')
parser.add_argument('--decay',default=0.0004,type=float,help='Weight decay for optimizer.')
parser.add_argument('--tau',default=[200,300],nargs='+',type=int,help='Stepsize(s) before reducing learning rate.')
parser.add_argument('--infrequent_eval',default=0,type=int,help='only compute evaluation metrics every 10 epochs')
parser.add_argument('--opt',default='adam',help='adam or sgd')
parser.add_argument('--resume',default='',type=str,help='path to checkpoint to load weights from (if empty then ImageNet pre-trained weights are loaded')
##### Network parameters
parser.add_argument('--embed_dim',default=512,type=int,help='Embedding dimensionality of the network')
parser.add_argument('--grad_measure',action='store_true',help='If added, gradients passed from embedding layer to the last conv-layer are stored in each iteration.')
parser.add_argument('--dist_measure',action='store_true',help='If added, the ratio between intra- and interclass distances is stored after each epoch.')
parser.add_argument('--not_pretrained',action='store_true',help='If added, the network will be trained WITHOUT ImageNet-pretrained weights.')
##### Setup Parameters
parser.add_argument('--gpu',default=0,type=int,help='GPU-id for GPU to use.')
# NOTE(review): --savename is deliberately omitted here -- it is already
# registered immediately above this block (its help string closes on the
# preceding line), and re-registering it would conflict.
### Paths to datasets and storage folder
parser.add_argument('--source_path',default='/scratch/shared/beegfs/abrown/datasets',type=str,help='Path to data')
parser.add_argument('--save_path',default=os.getcwd()+'/Training_Results',type=str,help='Where to save the checkpoints')
The following options can be specified via command line arguments:
```
optional arguments:
-h, --help show this help message and exit
--dropout DROPOUT dropout probability
--n-hidden N_HIDDEN number of hidden units
--lr LR learning rate
-e N_EPOCHS, --n-epochs N_EPOCHS
number of training epochs
--runs RUNS
```
...
...
ParameterDict(
The input features are passed to a modified version of the R-GCN architecture. As in the R-GCN paper, each _edge-type_ has its own linear projection matrix (the "weight" ModuleDict below). Different from the original paper, however, each _node-type_ has its own "self" linear projection matrix (the "loop_weights" ModuleDict below). There are 7 edge-types: 4 natural edge-types ("cites", "affiliated_with", "has_topic" and "writes") and 3 manufactured reverse edge-types ("rev-affiliated_with", "rev-has_topic", "rev-writes"). As mentioned above, note that there is _not_ a reverse edge type like "rev-cites", and instead the reverse edges are given the same type of "cites". This exception was presumably made because the source and destination nodes are of type "paper". Whereas the 7 "relation" linear layers do not have a bias, the 4 "self" linear layers do.
With two of these layers, a hidden dimension size of 64 and 349 output classes, we end up with 337,460 R-GCN model parameters.