####### Main Parameter: Dataset to use for Training
parser.add_argument('--dataset',default='Inaturalist',type=str,help='Dataset to use.',choices=['Inaturalist','semi_fungi'])
### General Training Parameters
parser.add_argument('--lr',default=0.00001,type=float,help='Learning Rate for network parameters.')
parser.add_argument('--fc_lr_mul',default=5,type=float,help='OPTIONAL: Multiply the embedding layer learning rate by this value. If set to 0, the embedding layer shares the same learning rate.')
parser.add_argument('--n_epochs',default=400,type=int,help='Number of training epochs.')
parser.add_argument('--kernels',default=8,type=int,help='Number of workers for pytorch dataloader.')
parser.add_argument('--bs',default=112,type=int,help='Mini-batch size to use.')
parser.add_argument('--samples_per_class',default=4,type=int,help='Number of samples in one class drawn before choosing the next class. Set to >1 for losses other than ProxyNCA.')
parser.add_argument('--seed',default=1,type=int,help='Random seed for reproducibility.')
parser.add_argument('--scheduler',default='step',type=str,help='Type of learning rate scheduling. Currently: step & exp.')
parser.add_argument('--gamma',default=0.3,type=float,help='Learning rate reduction after tau epochs.')
parser.add_argument('--decay',default=0.001,type=float,help='Weight decay for optimizer.')
parser.add_argument('--tau',default=[200,300],nargs='+',type=int,help='Stepsize(s) before reducing learning rate.')
parser.add_argument('--infrequent_eval',default=0,type=int,help='If set to 1, only compute evaluation metrics every 10 epochs.')
parser.add_argument('--opt',default='adam',help='Optimizer to use: adam or sgd.')
parser.add_argument('--resume',default='',type=str,help='Path to a checkpoint to load weights from (if empty, ImageNet pre-trained weights are loaded).')
##### Network parameters
parser.add_argument('--embed_dim',default=512,type=int,help='Embedding dimensionality of the network')
parser.add_argument('--grad_measure',action='store_true',help='If added, gradients passed from embedding layer to the last conv-layer are stored in each iteration.')
parser.add_argument('--dist_measure',action='store_true',help='If added, the ratio between intra- and interclass distances is stored after each epoch.')
parser.add_argument('--not_pretrained',action='store_true',help='If added, the network will be trained WITHOUT ImageNet-pretrained weights.')
##### Setup Parameters
parser.add_argument('--gpu',default=0,type=int,help='GPU-id for GPU to use.')
parser.add_argument('--savename',default='',type=str,help='Save folder name if any special information is to be included.')
### Paths to datasets and storage folder
parser.add_argument('--source_path',default='/scratch/shared/beegfs/abrown/datasets',type=str,help='Path to data')
parser.add_argument('--save_path',default=os.getcwd()+'/Training_Results',type=str,help='Where to save the checkpoints')
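# Illustrative sketch only (not the repository's training code): how these options
# typically fit together. The `model` object below and the assumption that the
# embedding layer's parameter names contain 'embedding' are hypothetical.
import torch

opt = parser.parse_args()

# Give the embedding layer its own learning rate via a separate parameter group.
embed_params    = [p for n, p in model.named_parameters() if 'embedding' in n]
backbone_params = [p for n, p in model.named_parameters() if 'embedding' not in n]
embed_lr = opt.lr * opt.fc_lr_mul if opt.fc_lr_mul > 0 else opt.lr
param_groups = [{'params': backbone_params, 'lr': opt.lr},
                {'params': embed_params, 'lr': embed_lr}]

if opt.opt == 'adam':
    optimizer = torch.optim.Adam(param_groups, weight_decay=opt.decay)
else:
    optimizer = torch.optim.SGD(param_groups, weight_decay=opt.decay, momentum=0.9)  # momentum value assumed

# 'step' scheduler: multiply the learning rate by gamma at each epoch listed in tau.
if opt.scheduler == 'step':
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.tau, gamma=opt.gamma)
elif opt.scheduler == 'exp':
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=opt.gamma)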
The following options can be specified via command line arguments:
```
optional arguments:
-h, --help show this help message and exit
--dropout DROPOUT dropout probability
--n-hidden N_HIDDEN number of hidden units
--lr LR learning rate
-e N_EPOCHS, --n-epochs N_EPOCHS
number of training epochs
--runs RUNS
```
...
...
Some examples of such differences:
- Instead of reversing `(paper, cites, paper)` into a new relation like `(paper, rev-cites, paper)`, the PyG implementation simply treats these edges as undirected ([code](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/sampler.py#L54)); see the sketch after this list for a toy illustration
- In the PyG implementation there's a separate "self" linear projection matrix for each _node-type_ ([code](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/sampler.py#L106)). This is different from the R-GCN [paper](https://arxiv.org/abs/1703.06103), which has a single "self" linear projection matrix for each R-GCN layer, not a different one for each node-type.
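To make the reverse-relation convention concrete, here is a toy sketch (hypothetical edge lists; not code from either repository): explicit `rev-*` relations are added for the heterogeneous edge types, while the flipped `cites` edges are folded into the existing relation, i.e. `cites` is effectively made undirected instead of getting a `rev-cites` type.
```
import dgl
import torch

# Toy edge lists (hypothetical node IDs, for illustration only).
authors, institutions = torch.tensor([0, 1, 2]), torch.tensor([0, 0, 1])
citing, cited = torch.tensor([0, 1]), torch.tensor([1, 2])

g = dgl.heterograph({
    # A natural relation and its explicit, manufactured reverse relation.
    ("author", "affiliated_with", "institution"): (authors, institutions),
    ("institution", "rev-affiliated_with", "author"): (institutions, authors),
    # "cites" has paper nodes on both ends, so the flipped edges are added to the
    # same relation rather than to a separate "rev-cites" type.
    ("paper", "cites", "paper"): (torch.cat([citing, cited]), torch.cat([cited, citing])),
})
print(g)
```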
### Neighborhood sampling differences
Although the model architectures, hyperparameter values and initialization methods are identical between the implementation here and the PyG one as of this writing, the two still differ significantly in how neighbors are sampled. As a result, the DGL implementation overfits the training dataset considerably faster and achieves slightly better Test performance.
In DGL, sampling on a heterogeneous graph with `fanout = N` draws N neighbors _per incoming relation type_. In the PyG implementation, the heterogeneous graph is represented as a homogeneous graph and N neighbors are drawn in total, regardless of relation type. For the same `fanout` value, DGL therefore samples R times as many neighbors as PyG, where R is the number of edge types pointing into a node. Since significantly more nodes are involved in the computation, more nodes receive gradient updates, and overfitting is more pronounced for the same number of epochs.
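As a minimal sketch (assuming a recent DGL release; this is not the exact sampling code of either implementation), the fanout settings compared in this section can be expressed as follows:
```
import dgl

# With DGL's heterogeneous neighbor sampling, an integer fanout means
# "N neighbors per incoming relation type per layer": a node with 4 incoming
# edge types can receive up to 4 * 25 = 100 sampled neighbors at the layer
# whose fanout is 25.
sampler_full = dgl.dataloading.MultiLayerNeighborSampler([25, 20])

# Reduced fanouts that roughly match PyG's "N neighbors in total, regardless
# of edge type": 4 * 6 = 24 ~ 25 and 4 * 5 = 20. This is the [6, 5] setting
# discussed below.
sampler_reduced = dgl.dataloading.MultiLayerNeighborSampler([6, 5])

# DGL also accepts, per layer, a dictionary mapping each canonical edge type
# to its own budget, which would allow uneven fanouts across relations.
```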
An effort was made to mitigate this increase by reducing the fanout from `[25, 20]` to `[6, 5]`, which gives roughly the same number of sampled neighbors for PyG and DGL and similar final training performance. However, the DGL implementation has significantly worse Test performance in this case, likely because sampling, say, 5 neighbors from each of 4 edge types is not equivalent to sampling 20 neighbors while ignoring edge type unless the edge types are uniformly distributed.
### Input features
The `paper` nodes have 128-dimensional features that are derived from word embeddings of the words found in the title and abstract of the papers. Following the PyG implementation, all node types except `paper` receive 128-dimensional learnable embeddings as node features. This results in 154,029,312 learnable parameters for just the node features.
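As a rough illustration (not the exact code used here), the embedding tables could be set up as follows; the node counts are the standard ogbn-mag statistics, and they reproduce the parameter count quoted above:
```
import torch
import torch.nn as nn

# Standard ogbn-mag node counts for the node types without input features.
num_nodes = {"author": 1134649, "institution": 8740, "field_of_study": 59965}
embed_dim = 128

# One learnable 128-dimensional embedding per node of each featureless node type.
embeds = nn.ParameterDict({
    ntype: nn.Parameter(torch.empty(n, embed_dim))
    for ntype, n in num_nodes.items()
})
for p in embeds.values():
    nn.init.xavier_uniform_(p)

# (1,134,649 + 8,740 + 59,965) * 128 = 154,029,312 learnable parameters.
print(sum(p.numel() for p in embeds.values()))  # 154029312
```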
...
...
### Model architecture
The input features are passed to a modified version of the R-GCN architecture. As in the R-GCN paper, each _edge-type_ has its own linear projection matrix (the "weight" ModuleDict below). Different from the original paper, however, each _node-type_ has its own "self" linear projection matrix (the "loop_weights" ModuleDict below). There are 7 edge-types: 4 natural edge-types ("cites", "affiliated_with", "has_topic" and "writes") and 3 manufactured reverse edge-types ("rev-affiliated_with", "rev-has_topic", "rev-writes"). As mentioned above, note that there is _not_ a reverse edge type like "rev-cites"; the reverse edges are instead given the existing "cites" type, presumably because the source and destination nodes are both of type "paper". Whereas the 7 "relation" linear layers do not have a bias, the 4 "self" linear layers do.
With two of these layers, a hidden dimension size of 64 and 349 output classes, we end up with 337,460 R-GCN model parameters.
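Below is a condensed, illustrative skeleton of this layer structure (message passing omitted; not the exact module code), which also reproduces the parameter count:
```
import torch.nn as nn

EDGE_TYPES = ["cites", "affiliated_with", "has_topic", "writes",
              "rev-affiliated_with", "rev-has_topic", "rev-writes"]   # 7 edge-types, no "rev-cites"
NODE_TYPES = ["paper", "author", "institution", "field_of_study"]     # 4 node-types

class RelGraphLayer(nn.Module):
    """Skeleton of one modified R-GCN layer: per-edge-type and per-node-type projections."""
    def __init__(self, in_dim, out_dim):
        super().__init__()
        # One projection per edge-type, without bias.
        self.weight = nn.ModuleDict(
            {etype: nn.Linear(in_dim, out_dim, bias=False) for etype in EDGE_TYPES})
        # One "self" projection per node-type, with bias.
        self.loop_weights = nn.ModuleDict(
            {ntype: nn.Linear(in_dim, out_dim, bias=True) for ntype in NODE_TYPES})

# Two layers: 128-dimensional input features -> 64 hidden units -> 349 classes.
layers = nn.ModuleList([RelGraphLayer(128, 64), RelGraphLayer(64, 349)])
# Layer 1: 7*128*64 + 4*(128*64 + 64)  =  57,344 + 33,024 =  90,368
# Layer 2: 7*64*349 + 4*(64*349 + 349) = 156,352 + 90,740 = 247,092
print(sum(p.numel() for p in layers.parameters()))  # 337460
```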