"comfy/ldm/modules/vscode:/vscode.git/clone" did not exist on "8ddd081b093f2774982f261ecb7d23d616328323"
Commit 5ed5979f authored by bailuo's avatar bailuo
Browse files

readme

parents
Pipeline #3043 failed with stages
in 0 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
opencv_python==4.4.0.46
albumentations==0.5.1 --no-binary=imgaug,albumentations
ray>=1.0.1
einops==0.3.0
kornia==0.4.1
loguru==0.5.3
yacs>=0.1.8
tqdm
autopep8
pylint
ipython
jupyterlab
matplotlib
h5py==3.1.0
pytorch-lightning==1.3.5
torchmetrics==0.6.0 # version problem: https://github.com/NVIDIA/DeepLearningExamples/issues/1113#issuecomment-1102969461
joblib>=1.0.1
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/scannet_test_1500.py"
main_cfg_path="configs/loftr/indoor/scannet/loftr_ds_eval.py"
ckpt_path="weights/indoor_ds.ckpt"
dump_dir="dump/loftr_ds_indoor"
profiler_name="inference"
n_nodes=1 # manually keep this in sync with --num_nodes below
n_gpus_per_node=-1 # -1 lets PyTorch Lightning use all visible GPUs on this node
torch_num_workers=4
batch_size=1 # per gpu
python -u ./test.py \
${data_cfg_path} \
${main_cfg_path} \
--ckpt_path=${ckpt_path} \
--dump_dir=${dump_dir} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} \
--profiler_name=${profiler_name} \
--benchmark
#!/bin/bash -l
# an indoor_ds model with the pos_enc implementation bug fixed.
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/scannet_test_1500.py"
main_cfg_path="configs/loftr/indoor/scannet/loftr_ds_eval_new.py"
ckpt_path="weights/indoor_ds_new.ckpt"
dump_dir="dump/loftr_ds_indoor_new"
profiler_name="inference"
n_nodes=1 # manually keep this in sync with --num_nodes below
n_gpus_per_node=-1
torch_num_workers=4
batch_size=1 # per gpu
python -u ./test.py \
${data_cfg_path} \
${main_cfg_path} \
--ckpt_path=${ckpt_path} \
--dump_dir=${dump_dir} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} \
--profiler_name=${profiler_name} \
--benchmark
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/scannet_test_1500.py"
main_cfg_path="configs/loftr/indoor/buggy_pos_enc/loftr_ot.py"
ckpt_path="weights/indoor_ot.ckpt"
dump_dir="dump/loftr_ot_indoor"
profiler_name="inference"
n_nodes=1 # manually keep this in sync with --num_nodes below
n_gpus_per_node=-1
torch_num_workers=4
batch_size=1 # per gpu
python -u ./test.py \
${data_cfg_path} \
${main_cfg_path} \
--ckpt_path=${ckpt_path} \
--dump_dir=${dump_dir} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} \
--profiler_name=${profiler_name} \
--benchmark
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/megadepth_test_1500.py"
main_cfg_path="configs/loftr/outdoor/buggy_pos_enc/loftr_ds.py"
ckpt_path="weights/outdoor_ds.ckpt"
dump_dir="dump/loftr_ds_outdoor"
profiler_name="inference"
n_nodes=1 # manually keep this in sync with --num_nodes below
n_gpus_per_node=-1
torch_num_workers=4
batch_size=1 # per gpu
python -u ./test.py \
${data_cfg_path} \
${main_cfg_path} \
--ckpt_path=${ckpt_path} \
--dump_dir=${dump_dir} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} \
--profiler_name=${profiler_name} \
--benchmark
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/megadepth_test_1500.py"
main_cfg_path="configs/loftr/outdoor/buggy_pos_enc/loftr_ot.py"
ckpt_path="weights/outdoor_ot.ckpt"
dump_dir="dump/loftr_ot_outdoor"
profiler_name="inference"
n_nodes=1 # manually keep this in sync with --num_nodes below
n_gpus_per_node=-1
torch_num_workers=4
batch_size=1 # per gpu
python -u ./test.py \
${data_cfg_path} \
${main_cfg_path} \
--ckpt_path=${ckpt_path} \
--dump_dir=${dump_dir} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} \
--profiler_name=${profiler_name} \
--benchmark
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/scannet_trainval.py"
main_cfg_path="configs/loftr/indoor/loftr_ds_dense.py"
n_nodes=1
n_gpus_per_node=4
torch_num_workers=4
batch_size=1
pin_memory=true
exp_name="indoor-ds-bs=$(($n_gpus_per_node * $n_nodes * $batch_size))"
python -u ./train.py \
${data_cfg_path} \
${main_cfg_path} \
--exp_name=${exp_name} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} --pin_memory=${pin_memory} \
--check_val_every_n_epoch=1 \
--log_every_n_steps=100 \
--flush_logs_every_n_steps=100 \
--limit_val_batches=1. \
--num_sanity_val_steps=10 \
--benchmark=True \
--max_epochs=30 \
--parallel_load_data
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
data_cfg_path="configs/data/scannet_trainval.py"
main_cfg_path="configs/loftr/indoor/loftr_ot_dense.py"
n_nodes=1
n_gpus_per_node=4
torch_num_workers=4
batch_size=1
pin_memory=true
exp_name="indoor-ot-bs=$(($n_gpus_per_node * $n_nodes * $batch_size))"
python -u ./train.py \
${data_cfg_path} \
${main_cfg_path} \
--exp_name=${exp_name} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} --pin_memory=${pin_memory} \
--check_val_every_n_epoch=1 \
--log_every_n_steps=100 \
--flush_logs_every_n_steps=100 \
--limit_val_batches=1. \
--num_sanity_val_steps=10 \
--benchmark=True \
--max_epochs=30 \
--parallel_load_data
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
TRAIN_IMG_SIZE=640
# to reproduce the results in our paper, please use:
# TRAIN_IMG_SIZE=840
data_cfg_path="configs/data/megadepth_trainval_${TRAIN_IMG_SIZE}.py"
main_cfg_path="configs/loftr/outdoor/loftr_ds_dense.py"
n_nodes=1
n_gpus_per_node=4
torch_num_workers=4
batch_size=1
pin_memory=true
exp_name="outdoor-ds-${TRAIN_IMG_SIZE}-bs=$(($n_gpus_per_node * $n_nodes * $batch_size))"
python -u ./train.py \
${data_cfg_path} \
${main_cfg_path} \
--exp_name=${exp_name} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} --pin_memory=${pin_memory} \
--check_val_every_n_epoch=1 \
--log_every_n_steps=1 \
--flush_logs_every_n_steps=1 \
--limit_val_batches=1. \
--num_sanity_val_steps=10 \
--benchmark=True \
--max_epochs=30
#!/bin/bash -l
SCRIPTPATH=$(dirname $(readlink -f "$0"))
PROJECT_DIR="${SCRIPTPATH}/../../"
# conda activate loftr
export PYTHONPATH=$PROJECT_DIR:$PYTHONPATH
cd $PROJECT_DIR
TRAIN_IMG_SIZE=640
# to reproduce the results in our paper, please use:
# TRAIN_IMG_SIZE=840
data_cfg_path="configs/data/megadepth_trainval_${TRAIN_IMG_SIZE}.py"
main_cfg_path="configs/loftr/outdoor/loftr_ot_dense.py"
n_nodes=1
n_gpus_per_node=4
torch_num_workers=4
batch_size=1
pin_memory=true
exp_name="outdoor-ot-${TRAIN_IMG_SIZE}-bs=$(($n_gpus_per_node * $n_nodes * $batch_size))"
python -u ./train.py \
${data_cfg_path} \
${main_cfg_path} \
--exp_name=${exp_name} \
--gpus=${n_gpus_per_node} --num_nodes=${n_nodes} --accelerator="ddp" \
--batch_size=${batch_size} --num_workers=${torch_num_workers} --pin_memory=${pin_memory} \
--check_val_every_n_epoch=1 \
--log_every_n_steps=1 \
--flush_logs_every_n_steps=1 \
--limit_val_batches=1. \
--num_sanity_val_steps=10 \
--benchmark=True \
--max_epochs=30
from yacs.config import CfgNode as CN
_CN = CN()
############## ↓ LoFTR Pipeline ↓ ##############
_CN.LOFTR = CN()
_CN.LOFTR.BACKBONE_TYPE = 'ResNetFPN'
_CN.LOFTR.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)]
_CN.LOFTR.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd
_CN.LOFTR.FINE_CONCAT_COARSE_FEAT = True
# 1. LoFTR-backbone (local feature CNN) config
_CN.LOFTR.RESNETFPN = CN()
_CN.LOFTR.RESNETFPN.INITIAL_DIM = 128
_CN.LOFTR.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3
# 2. LoFTR-coarse module config
_CN.LOFTR.COARSE = CN()
_CN.LOFTR.COARSE.D_MODEL = 256
_CN.LOFTR.COARSE.D_FFN = 256
_CN.LOFTR.COARSE.NHEAD = 8
_CN.LOFTR.COARSE.LAYER_NAMES = ['self', 'cross'] * 4
_CN.LOFTR.COARSE.ATTENTION = 'linear' # options: ['linear', 'full']
_CN.LOFTR.COARSE.TEMP_BUG_FIX = True
# 3. Coarse-Matching config
_CN.LOFTR.MATCH_COARSE = CN()
_CN.LOFTR.MATCH_COARSE.THR = 0.2
_CN.LOFTR.MATCH_COARSE.BORDER_RM = 2
_CN.LOFTR.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax', 'sinkhorn']
_CN.LOFTR.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1
_CN.LOFTR.MATCH_COARSE.SKH_ITERS = 3
_CN.LOFTR.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
_CN.LOFTR.MATCH_COARSE.SKH_PREFILTER = False
_CN.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.2 # training tricks: save GPU memory
_CN.LOFTR.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock
_CN.LOFTR.MATCH_COARSE.SPARSE_SPVS = True
# 4. LoFTR-fine module config
_CN.LOFTR.FINE = CN()
_CN.LOFTR.FINE.D_MODEL = 128
_CN.LOFTR.FINE.D_FFN = 128
_CN.LOFTR.FINE.NHEAD = 8
_CN.LOFTR.FINE.LAYER_NAMES = ['self', 'cross'] * 1
_CN.LOFTR.FINE.ATTENTION = 'linear'
# 5. LoFTR Losses
# -- # coarse-level
_CN.LOFTR.LOSS = CN()
_CN.LOFTR.LOSS.COARSE_TYPE = 'focal' # ['focal', 'cross_entropy']
_CN.LOFTR.LOSS.COARSE_WEIGHT = 1.0
# _CN.LOFTR.LOSS.SPARSE_SPVS = False
# -- - -- # focal loss (coarse)
_CN.LOFTR.LOSS.FOCAL_ALPHA = 0.25
_CN.LOFTR.LOSS.FOCAL_GAMMA = 2.0
_CN.LOFTR.LOSS.POS_WEIGHT = 1.0
_CN.LOFTR.LOSS.NEG_WEIGHT = 1.0
# _CN.LOFTR.LOSS.DUAL_SOFTMAX = False # whether coarse-level use dual-softmax or not.
# use `_CN.LOFTR.MATCH_COARSE.MATCH_TYPE`
# -- # fine-level
_CN.LOFTR.LOSS.FINE_TYPE = 'l2_with_std' # ['l2_with_std', 'l2']
_CN.LOFTR.LOSS.FINE_WEIGHT = 1.0
_CN.LOFTR.LOSS.FINE_CORRECT_THR = 1.0 # for filtering valid fine-level gts (some gt matches might fall out of the fine-level window)
############## Dataset ##############
_CN.DATASET = CN()
# 1. data config
# training and validating
_CN.DATASET.TRAINVAL_DATA_SOURCE = None # options: ['ScanNet', 'MegaDepth']
_CN.DATASET.TRAIN_DATA_ROOT = None
_CN.DATASET.TRAIN_POSE_ROOT = None # (optional directory for poses)
_CN.DATASET.TRAIN_NPZ_ROOT = None
_CN.DATASET.TRAIN_LIST_PATH = None
_CN.DATASET.TRAIN_INTRINSIC_PATH = None
_CN.DATASET.VAL_DATA_ROOT = None
_CN.DATASET.VAL_POSE_ROOT = None # (optional directory for poses)
_CN.DATASET.VAL_NPZ_ROOT = None
_CN.DATASET.VAL_LIST_PATH = None # None if val data from all scenes are bundled into a single npz file
_CN.DATASET.VAL_INTRINSIC_PATH = None
# testing
_CN.DATASET.TEST_DATA_SOURCE = None
_CN.DATASET.TEST_DATA_ROOT = None
_CN.DATASET.TEST_POSE_ROOT = None # (optional directory for poses)
_CN.DATASET.TEST_NPZ_ROOT = None
_CN.DATASET.TEST_LIST_PATH = None # None if test data from all scenes are bundled into a single npz file
_CN.DATASET.TEST_INTRINSIC_PATH = None
# 2. dataset config
# general options
_CN.DATASET.MIN_OVERLAP_SCORE_TRAIN = 0.4 # discard data with overlap_score < min_overlap_score
_CN.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0
_CN.DATASET.AUGMENTATION_TYPE = None # options: [None, 'dark', 'mobile']
# MegaDepth options
_CN.DATASET.MGDPT_IMG_RESIZE = 640 # resize the longer side, zero-pad bottom-right to square.
_CN.DATASET.MGDPT_IMG_PAD = True # pad img to square with size = MGDPT_IMG_RESIZE
_CN.DATASET.MGDPT_DEPTH_PAD = True # pad depthmap to square with size = 2000
_CN.DATASET.MGDPT_DF = 8
############## Trainer ##############
_CN.TRAINER = CN()
_CN.TRAINER.WORLD_SIZE = 1
_CN.TRAINER.CANONICAL_BS = 64
_CN.TRAINER.CANONICAL_LR = 6e-3
_CN.TRAINER.SCALING = None # this will be calculated automatically
_CN.TRAINER.FIND_LR = False # use learning rate finder from pytorch-lightning
# optimizer
_CN.TRAINER.OPTIMIZER = "adamw" # [adam, adamw]
_CN.TRAINER.TRUE_LR = None # this will be calculated automatically at runtime
_CN.TRAINER.ADAM_DECAY = 0. # ADAM: for adam
_CN.TRAINER.ADAMW_DECAY = 0.1
# step-based warm-up
_CN.TRAINER.WARMUP_TYPE = 'linear' # [linear, constant]
_CN.TRAINER.WARMUP_RATIO = 0.
_CN.TRAINER.WARMUP_STEP = 4800
# learning rate scheduler
_CN.TRAINER.SCHEDULER = 'MultiStepLR' # [MultiStepLR, CosineAnnealing, ExponentialLR]
_CN.TRAINER.SCHEDULER_INTERVAL = 'epoch' # [epoch, step]
_CN.TRAINER.MSLR_MILESTONES = [3, 6, 9, 12] # MSLR: MultiStepLR
_CN.TRAINER.MSLR_GAMMA = 0.5
_CN.TRAINER.COSA_TMAX = 30 # COSA: CosineAnnealing
_CN.TRAINER.ELR_GAMMA = 0.999992 # ELR: ExponentialLR, this value for 'step' interval
# plotting related
_CN.TRAINER.ENABLE_PLOTTING = True
_CN.TRAINER.N_VAL_PAIRS_TO_PLOT = 32 # number of val/test pairs for plotting
_CN.TRAINER.PLOT_MODE = 'evaluation' # ['evaluation', 'confidence']
_CN.TRAINER.PLOT_MATCHES_ALPHA = 'dynamic'
# geometric metrics and pose solver
_CN.TRAINER.EPI_ERR_THR = 5e-4 # recommendation: 5e-4 for ScanNet, 1e-4 for MegaDepth (from SuperGlue)
_CN.TRAINER.POSE_GEO_MODEL = 'E' # ['E', 'F', 'H']
_CN.TRAINER.POSE_ESTIMATION_METHOD = 'RANSAC' # [RANSAC, DEGENSAC, MAGSAC]
_CN.TRAINER.RANSAC_PIXEL_THR = 0.5
_CN.TRAINER.RANSAC_CONF = 0.99999
_CN.TRAINER.RANSAC_MAX_ITERS = 10000
_CN.TRAINER.USE_MAGSACPP = False
# data sampler for train_dataloader
_CN.TRAINER.DATA_SAMPLER = 'scene_balance' # options: ['scene_balance', 'random', 'normal']
# 'scene_balance' config
_CN.TRAINER.N_SAMPLES_PER_SUBSET = 200
_CN.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT = True # whether to sample each scene with replacement
_CN.TRAINER.SB_SUBSET_SHUFFLE = True # after sampling from scenes, whether to shuffle within the epoch
_CN.TRAINER.SB_REPEAT = 1 # repeat the sampled data N times for training
# 'random' config
_CN.TRAINER.RDM_REPLACEMENT = True
_CN.TRAINER.RDM_NUM_SAMPLES = None
# gradient clipping
_CN.TRAINER.GRADIENT_CLIPPING = 0.5
# reproducibility
# This seed affects data sampling. With the same seed, the sampled data are guaranteed
# to be identical. When resuming training from a checkpoint, it is better to use a
# different seed; otherwise the sampled data would repeat exactly what was drawn before
# resuming, reducing the number of unique data items seen over the whole training run.
# Different seed values may also affect the final result, since not all data items are
# used when training on ScanNet (~60M image pairs are sampled during training out of 230M in total).
_CN.TRAINER.SEED = 66
def get_cfg_defaults():
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
return _CN.clone()
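# --- Usage sketch (added; hedged) ---------------------------------------------
# A minimal example of how these defaults are typically consumed. The override
# key is a real entry from `_CN` above; the LR computation restates the linear
# scaling rule implied by CANONICAL_BS / CANONICAL_LR (TRUE_LR and SCALING are
# documented above as "calculated automatically at runtime").
if __name__ == "__main__":
    cfg = get_cfg_defaults()                               # clone; safe to mutate
    cfg.merge_from_list(['LOFTR.MATCH_COARSE.THR', 0.25])  # CLI-style override
    world_size, batch_size = 4, 1
    cfg.TRAINER.SCALING = world_size * batch_size / cfg.TRAINER.CANONICAL_BS
    cfg.TRAINER.TRUE_LR = cfg.TRAINER.CANONICAL_LR * cfg.TRAINER.SCALING
    cfg.freeze()                                           # lock against accidental edits
    print(cfg.TRAINER.TRUE_LR)                             # 6e-3 * 4/64 = 3.75e-4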
import os.path as osp
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from loguru import logger
from src.utils.dataset import read_megadepth_gray, read_megadepth_depth
class MegaDepthDataset(Dataset):
def __init__(self,
root_dir,
npz_path,
mode='train',
min_overlap_score=0.4,
img_resize=None,
df=None,
img_padding=False,
depth_padding=False,
augment_fn=None,
**kwargs):
"""
Manage one scene (npz_path) of the MegaDepth dataset.
Args:
root_dir (str): megadepth root directory that has `phoenix`.
npz_path (str): {scene_id}.npz path. This contains image pair information of a scene.
mode (str): options are ['train', 'val', 'test']
min_overlap_score (float): minimum required overlap between a pair, in the range [0, 1]. Set to 0 when testing.
img_resize (int, optional): the longer edge of resized images. None for no resize. 640 is recommended.
This is useful during training with batches and testing with memory intensive algorithms.
df (int, optional): image size division factor. NOTE: this will change the final image size after img_resize.
img_padding (bool): If set to 'True', zero-pad the image to squared size. This is useful during training.
depth_padding (bool): If set to 'True', zero-pad depthmap to (2000, 2000). This is useful during training.
augment_fn (callable, optional): augments images with pre-defined visual effects.
"""
super().__init__()
self.root_dir = root_dir
self.mode = mode
self.scene_id = npz_path.split('.')[0]
# prepare scene_info and pair_info
if mode == 'test' and min_overlap_score != 0:
logger.warning("You are using `min_overlap_score`!=0 in test mode. Set to 0.")
min_overlap_score = 0
self.scene_info = np.load(npz_path, allow_pickle=True)
self.pair_infos = self.scene_info['pair_infos'].copy()
del self.scene_info['pair_infos']
self.pair_infos = [pair_info for pair_info in self.pair_infos if pair_info[1] > min_overlap_score]
# parameters for image resizing, padding and depthmap padding
if mode == 'train':
assert img_resize is not None and img_padding and depth_padding
self.img_resize = img_resize
self.df = df
self.img_padding = img_padding
self.depth_max_size = 2000 if depth_padding else None # the upper bound of depthmap size in MegaDepth.
# for training LoFTR
self.augment_fn = augment_fn if mode == 'train' else None
self.coarse_scale = kwargs.get('coarse_scale', 0.125)  # bug fix: kwargs is a dict, so getattr() always returned the default
def __len__(self):
return len(self.pair_infos)
def __getitem__(self, idx):
(idx0, idx1), overlap_score, central_matches = self.pair_infos[idx]
# read grayscale image and mask. (1, h, w) and (h, w)
img_name0 = osp.join(self.root_dir, self.scene_info['image_paths'][idx0])
img_name1 = osp.join(self.root_dir, self.scene_info['image_paths'][idx1])
# TODO: Support augmentation & handle seeds for each worker correctly.
image0, mask0, scale0 = read_megadepth_gray(
img_name0, self.img_resize, self.df, self.img_padding, None)
# np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
image1, mask1, scale1 = read_megadepth_gray(
img_name1, self.img_resize, self.df, self.img_padding, None)
# np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
# read depth. shape: (h, w)
if self.mode in ['train', 'val']:
depth0 = read_megadepth_depth(
osp.join(self.root_dir, self.scene_info['depth_paths'][idx0]), pad_to=self.depth_max_size)
depth1 = read_megadepth_depth(
osp.join(self.root_dir, self.scene_info['depth_paths'][idx1]), pad_to=self.depth_max_size)
else:
depth0 = depth1 = torch.tensor([])
# read intrinsics of original size
K_0 = torch.tensor(self.scene_info['intrinsics'][idx0].copy(), dtype=torch.float).reshape(3, 3)
K_1 = torch.tensor(self.scene_info['intrinsics'][idx1].copy(), dtype=torch.float).reshape(3, 3)
# read and compute relative poses
T0 = self.scene_info['poses'][idx0]
T1 = self.scene_info['poses'][idx1]
T_0to1 = torch.tensor(np.matmul(T1, np.linalg.inv(T0)), dtype=torch.float)[:4, :4] # (4, 4)
T_1to0 = T_0to1.inverse()
data = {
'image0': image0, # (1, h, w)
'depth0': depth0, # (h, w)
'image1': image1,
'depth1': depth1,
'T_0to1': T_0to1, # (4, 4)
'T_1to0': T_1to0,
'K0': K_0, # (3, 3)
'K1': K_1,
'scale0': scale0, # [scale_w, scale_h]
'scale1': scale1,
'dataset_name': 'MegaDepth',
'scene_id': self.scene_id,
'pair_id': idx,
'pair_names': (self.scene_info['image_paths'][idx0], self.scene_info['image_paths'][idx1]),
}
# for LoFTR training
if mask0 is not None: # img_padding is True
if self.coarse_scale:
[ts_mask_0, ts_mask_1] = F.interpolate(torch.stack([mask0, mask1], dim=0)[None].float(),
scale_factor=self.coarse_scale,
mode='nearest',
recompute_scale_factor=False)[0].bool()
data.update({'mask0': ts_mask_0, 'mask1': ts_mask_1})
return data
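# --- Usage sketch (added; hedged) ---------------------------------------------
# Placeholder paths for a local MegaDepth layout; the npz file name and roots
# below are illustrative, not part of this repository.
if __name__ == "__main__":
    dataset = MegaDepthDataset(
        root_dir="data/megadepth/train",               # must contain `phoenix`
        npz_path="data/megadepth/scene_info/0015.npz", # hypothetical scene file
        mode="train",
        min_overlap_score=0.4,
        img_resize=640,
        df=8,
        img_padding=True,
        depth_padding=True)
    sample = dataset[0]
    print(sample['image0'].shape, sample['depth0'].shape, sample['K0'].shape)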
import torch
from torch.utils.data import Sampler, ConcatDataset
class RandomConcatSampler(Sampler):
""" Random sampler for ConcatDataset. At each epoch, `n_samples_per_subset` samples will be draw from each subset
in the ConcatDataset. If `subset_replacement` is ``True``, sampling within each subset will be done with replacement.
However, it is impossible to sample data without replacement between epochs, unless bulding a stateful sampler lived along the entire training phase.
For current implementation, the randomness of sampling is ensured no matter the sampler is recreated across epochs or not and call `torch.manual_seed()` or not.
Args:
shuffle (bool): shuffle the random sampled indices across all sub-datsets.
repeat (int): repeatedly use the sampled indices multiple times for training.
[arXiv:1902.05509, arXiv:1901.09335]
NOTE: Don't re-initialize the sampler between epochs (will lead to repeated samples)
NOTE: This sampler behaves differently with DistributedSampler.
It assume the dataset is splitted across ranks instead of replicated.
TODO: Add a `set_epoch()` method to fullfill sampling without replacement across epochs.
ref: https://github.com/PyTorchLightning/pytorch-lightning/blob/e9846dd758cfb1500eb9dba2d86f6912eb487587/pytorch_lightning/trainer/training_loop.py#L373
"""
def __init__(self,
data_source: ConcatDataset,
n_samples_per_subset: int,
subset_replacement: bool=True,
shuffle: bool=True,
repeat: int=1,
seed: int=None):
if not isinstance(data_source, ConcatDataset):
raise TypeError("data_source should be torch.utils.data.ConcatDataset")
self.data_source = data_source
self.n_subset = len(self.data_source.datasets)
self.n_samples_per_subset = n_samples_per_subset
self.n_samples = self.n_subset * self.n_samples_per_subset * repeat
self.subset_replacement = subset_replacement
self.repeat = repeat
self.shuffle = shuffle
self.generator = torch.manual_seed(seed)  # seeds the global RNG; reuses the returned default Generator
assert self.repeat >= 1
def __len__(self):
return self.n_samples
def __iter__(self):
indices = []
# sample from each sub-dataset
for d_idx in range(self.n_subset):
low = 0 if d_idx==0 else self.data_source.cumulative_sizes[d_idx-1]
high = self.data_source.cumulative_sizes[d_idx]
if self.subset_replacement:
rand_tensor = torch.randint(low, high, (self.n_samples_per_subset, ),
generator=self.generator, dtype=torch.int64)
else: # sample without replacement
len_subset = len(self.data_source.datasets[d_idx])
rand_tensor = torch.randperm(len_subset, generator=self.generator) + low
if len_subset >= self.n_samples_per_subset:
rand_tensor = rand_tensor[:self.n_samples_per_subset]
else: # padding with replacement
rand_tensor_replacement = torch.randint(low, high, (self.n_samples_per_subset - len_subset, ),
generator=self.generator, dtype=torch.int64)
rand_tensor = torch.cat([rand_tensor, rand_tensor_replacement])
indices.append(rand_tensor)
indices = torch.cat(indices)
if self.shuffle: # shuffle the sampled dataset (from multiple subsets)
rand_tensor = torch.randperm(len(indices), generator=self.generator)
indices = indices[rand_tensor]
# repeat the sampled indices (can be used for RepeatAugmentation or pure RepeatSampling)
if self.repeat > 1:
repeat_indices = [indices.clone() for _ in range(self.repeat - 1)]
if self.shuffle:
_choice = lambda x: x[torch.randperm(len(x), generator=self.generator)]
repeat_indices = map(_choice, repeat_indices)
indices = torch.cat([indices, *repeat_indices], 0)
assert indices.shape[0] == self.n_samples
return iter(indices.tolist())
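# --- Usage sketch (added; hedged) ---------------------------------------------
# A toy sanity check: two subsets of different sizes, sampled scene-balanced.
if __name__ == "__main__":
    from torch.utils.data import TensorDataset
    concat = ConcatDataset([TensorDataset(torch.arange(10)),
                            TensorDataset(torch.arange(100))])
    sampler = RandomConcatSampler(concat, n_samples_per_subset=5,
                                  subset_replacement=True, shuffle=True,
                                  repeat=1, seed=66)
    indices = list(sampler)
    assert len(indices) == 10                  # 2 subsets * 5 samples * repeat=1
    assert sum(i < 10 for i in indices) == 5   # exactly 5 draws from the first subset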
from os import path as osp
from typing import Dict
import numpy as np
import torch
import torch.utils as utils
from numpy.linalg import inv
from src.utils.dataset import (
read_scannet_gray,
read_scannet_depth,
read_scannet_pose,
read_scannet_intrinsic
)
class ScanNetDataset(utils.data.Dataset):
def __init__(self,
root_dir,
npz_path,
intrinsic_path,
mode='train',
min_overlap_score=0.4,
augment_fn=None,
pose_dir=None,
**kwargs):
"""Manage one scene of ScanNet Dataset.
Args:
root_dir (str): ScanNet root directory that contains scene folders.
npz_path (str): {scene_id}.npz path. This contains image pair information of a scene.
intrinsic_path (str): path to depth-camera intrinsic file.
mode (str): options are ['train', 'val', 'test'].
augment_fn (callable, optional): augments images with pre-defined visual effects.
pose_dir (str): ScanNet root directory that contains all poses.
(we use a separate (optional) pose_dir since we store images and poses separately.)
"""
super().__init__()
self.root_dir = root_dir
self.pose_dir = pose_dir if pose_dir is not None else root_dir
self.mode = mode
# prepare data_names, intrinsics and extrinsics(T)
with np.load(npz_path) as data:
self.data_names = data['name']
if 'score' in data.keys() and mode not in ['val', 'test']:  # bug fix: `['val' or 'test']` evaluates to ['val']
kept_mask = data['score'] > min_overlap_score
self.data_names = self.data_names[kept_mask]
self.intrinsics = dict(np.load(intrinsic_path))
# for training LoFTR
self.augment_fn = augment_fn if mode == 'train' else None
def __len__(self):
return len(self.data_names)
def _read_abs_pose(self, scene_name, name):
pth = osp.join(self.pose_dir,
scene_name,
'pose', f'{name}.txt')
return read_scannet_pose(pth)
def _compute_rel_pose(self, scene_name, name0, name1):
pose0 = self._read_abs_pose(scene_name, name0)
pose1 = self._read_abs_pose(scene_name, name1)
return np.matmul(pose1, inv(pose0)) # (4, 4)
def __getitem__(self, idx):
data_name = self.data_names[idx]
scene_name, scene_sub_name, stem_name_0, stem_name_1 = data_name
scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}'
# read the grayscale image which will be resized to (1, 480, 640)
img_name0 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_0}.jpg')
img_name1 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_1}.jpg')
# TODO: Support augmentation & handle seeds for each worker correctly.
image0 = read_scannet_gray(img_name0, resize=(640, 480), augment_fn=None)
# augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
image1 = read_scannet_gray(img_name1, resize=(640, 480), augment_fn=None)
# augment_fn=np.random.choice([self.augment_fn, None], p=[0.5, 0.5]))
# read the depthmap which is stored as (480, 640)
if self.mode in ['train', 'val']:
depth0 = read_scannet_depth(osp.join(self.root_dir, scene_name, 'depth', f'{stem_name_0}.png'))
depth1 = read_scannet_depth(osp.join(self.root_dir, scene_name, 'depth', f'{stem_name_1}.png'))
else:
depth0 = depth1 = torch.tensor([])
# read the intrinsic of depthmap
K_0 = K_1 = torch.tensor(self.intrinsics[scene_name].copy(), dtype=torch.float).reshape(3, 3)
# read and compute relative poses
T_0to1 = torch.tensor(self._compute_rel_pose(scene_name, stem_name_0, stem_name_1),
dtype=torch.float32)
T_1to0 = T_0to1.inverse()
data = {
'image0': image0, # (1, h, w)
'depth0': depth0, # (h, w)
'image1': image1,
'depth1': depth1,
'T_0to1': T_0to1, # (4, 4)
'T_1to0': T_1to0,
'K0': K_0, # (3, 3)
'K1': K_1,
'dataset_name': 'ScanNet',
'scene_id': scene_name,
'pair_id': idx,
'pair_names': (osp.join(scene_name, 'color', f'{stem_name_0}.jpg'),
osp.join(scene_name, 'color', f'{stem_name_1}.jpg'))
}
return data
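# --- Sanity-check sketch (added; hedged) ---------------------------------------
# Round-trip check of the relative-pose math in `_compute_rel_pose`:
# T_0to1 = pose1 @ inv(pose0), so T_0to1 @ pose0 recovers pose1 for any rigid poses.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    def random_pose():
        q, _ = np.linalg.qr(rng.normal(size=(3, 3)))  # random orthonormal basis
        T = np.eye(4)
        T[:3, :3] = q * np.sign(np.linalg.det(q))     # make it a proper rotation
        T[:3, 3] = rng.normal(size=3)
        return T

    pose0, pose1 = random_pose(), random_pose()
    T_0to1 = np.matmul(pose1, inv(pose0))
    assert np.allclose(np.matmul(T_0to1, pose0), pose1, atol=1e-10)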
import os
import math
from collections import abc
from loguru import logger
from torch.utils.data.dataset import Dataset
from tqdm import tqdm
from os import path as osp
from pathlib import Path
from joblib import Parallel, delayed
import pytorch_lightning as pl
from torch import distributed as dist
from torch.utils.data import (
Dataset,
DataLoader,
ConcatDataset,
DistributedSampler,
RandomSampler,
dataloader
)
from src.utils.augment import build_augmentor
from src.utils.dataloader import get_local_split
from src.utils.misc import tqdm_joblib
from src.utils import comm
from src.datasets.megadepth import MegaDepthDataset
from src.datasets.scannet import ScanNetDataset
from src.datasets.sampler import RandomConcatSampler
class MultiSceneDataModule(pl.LightningDataModule):
"""
For distributed training, each training process is assgined
only a part of the training scenes to reduce memory overhead.
"""
def __init__(self, args, config):
super().__init__()
# 1. data config
# Train and Val should come from the same data source
self.trainval_data_source = config.DATASET.TRAINVAL_DATA_SOURCE
self.test_data_source = config.DATASET.TEST_DATA_SOURCE
# training and validating
self.train_data_root = config.DATASET.TRAIN_DATA_ROOT
self.train_pose_root = config.DATASET.TRAIN_POSE_ROOT # (optional)
self.train_npz_root = config.DATASET.TRAIN_NPZ_ROOT
self.train_list_path = config.DATASET.TRAIN_LIST_PATH
self.train_intrinsic_path = config.DATASET.TRAIN_INTRINSIC_PATH
self.val_data_root = config.DATASET.VAL_DATA_ROOT
self.val_pose_root = config.DATASET.VAL_POSE_ROOT # (optional)
self.val_npz_root = config.DATASET.VAL_NPZ_ROOT
self.val_list_path = config.DATASET.VAL_LIST_PATH
self.val_intrinsic_path = config.DATASET.VAL_INTRINSIC_PATH
# testing
self.test_data_root = config.DATASET.TEST_DATA_ROOT
self.test_pose_root = config.DATASET.TEST_POSE_ROOT # (optional)
self.test_npz_root = config.DATASET.TEST_NPZ_ROOT
self.test_list_path = config.DATASET.TEST_LIST_PATH
self.test_intrinsic_path = config.DATASET.TEST_INTRINSIC_PATH
# 2. dataset config
# general options
self.min_overlap_score_test = config.DATASET.MIN_OVERLAP_SCORE_TEST # 0.0 at test time; pairs with overlap_score < this are discarded
self.min_overlap_score_train = config.DATASET.MIN_OVERLAP_SCORE_TRAIN
self.augment_fn = build_augmentor(config.DATASET.AUGMENTATION_TYPE) # None, options: [None, 'dark', 'mobile']
# MegaDepth options
self.mgdpt_img_resize = config.DATASET.MGDPT_IMG_RESIZE # 840
self.mgdpt_img_pad = config.DATASET.MGDPT_IMG_PAD # True
self.mgdpt_depth_pad = config.DATASET.MGDPT_DEPTH_PAD # True
self.mgdpt_df = config.DATASET.MGDPT_DF # 8
self.coarse_scale = 1 / config.LOFTR.RESOLUTION[0] # 0.125. for training loftr.
# 3.loader parameters
self.train_loader_params = {
'batch_size': args.batch_size,
'num_workers': args.num_workers,
'pin_memory': getattr(args, 'pin_memory', True)
}
self.val_loader_params = {
'batch_size': 1,
'shuffle': False,
'num_workers': args.num_workers,
'pin_memory': getattr(args, 'pin_memory', True)
}
self.test_loader_params = {
'batch_size': 1,
'shuffle': False,
'num_workers': args.num_workers,
'pin_memory': True
}
# 4. sampler
self.data_sampler = config.TRAINER.DATA_SAMPLER
self.n_samples_per_subset = config.TRAINER.N_SAMPLES_PER_SUBSET
self.subset_replacement = config.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT
self.shuffle = config.TRAINER.SB_SUBSET_SHUFFLE
self.repeat = config.TRAINER.SB_REPEAT
# (optional) RandomSampler for debugging
# misc configurations
self.parallel_load_data = getattr(args, 'parallel_load_data', False)
self.seed = config.TRAINER.SEED # 66
def setup(self, stage=None):
"""
Setup train / val / test dataset. This method will be called by PL automatically.
Args:
stage (str): 'fit' in training phase, and 'test' in testing phase.
"""
assert stage in ['fit', 'test'], "stage must be either fit or test"
try:
self.world_size = dist.get_world_size()
self.rank = dist.get_rank()
logger.info(f"[rank:{self.rank}] world_size: {self.world_size}")
except AssertionError as ae:
self.world_size = 1
self.rank = 0
logger.warning(str(ae) + " (set world_size=1 and rank=0)")
if stage == 'fit':
self.train_dataset = self._setup_dataset(
self.train_data_root,
self.train_npz_root,
self.train_list_path,
self.train_intrinsic_path,
mode='train',
min_overlap_score=self.min_overlap_score_train,
pose_dir=self.train_pose_root)
# setup multiple (optional) validation subsets
if isinstance(self.val_list_path, (list, tuple)):
self.val_dataset = []
if not isinstance(self.val_npz_root, (list, tuple)):
self.val_npz_root = [self.val_npz_root for _ in range(len(self.val_list_path))]
for npz_list, npz_root in zip(self.val_list_path, self.val_npz_root):
self.val_dataset.append(self._setup_dataset(
self.val_data_root,
npz_root,
npz_list,
self.val_intrinsic_path,
mode='val',
min_overlap_score=self.min_overlap_score_test,
pose_dir=self.val_pose_root))
else:
self.val_dataset = self._setup_dataset(
self.val_data_root,
self.val_npz_root,
self.val_list_path,
self.val_intrinsic_path,
mode='val',
min_overlap_score=self.min_overlap_score_test,
pose_dir=self.val_pose_root)
logger.info(f'[rank:{self.rank}] Train & Val Dataset loaded!')
else:  # stage == 'test'
self.test_dataset = self._setup_dataset(
self.test_data_root,
self.test_npz_root,
self.test_list_path,
self.test_intrinsic_path,
mode='test',
min_overlap_score=self.min_overlap_score_test,
pose_dir=self.test_pose_root)
logger.info(f'[rank:{self.rank}]: Test Dataset loaded!')
def _setup_dataset(self,
data_root,
split_npz_root,
scene_list_path,
intri_path,
mode='train',
min_overlap_score=0.,
pose_dir=None):
""" Setup train / val / test set"""
with open(scene_list_path, 'r') as f:
npz_names = [name.split()[0] for name in f.readlines()]
if mode == 'train':
local_npz_names = get_local_split(npz_names, self.world_size, self.rank, self.seed)
else:
local_npz_names = npz_names
logger.info(f'[rank {self.rank}]: {len(local_npz_names)} scene(s) assigned.')
dataset_builder = self._build_concat_dataset_parallel \
if self.parallel_load_data \
else self._build_concat_dataset
return dataset_builder(data_root, local_npz_names, split_npz_root, intri_path,
mode=mode, min_overlap_score=min_overlap_score, pose_dir=pose_dir)
def _build_concat_dataset(
self,
data_root,
npz_names,
npz_dir,
intrinsic_path,
mode,
min_overlap_score=0.,
pose_dir=None
):
datasets = []
augment_fn = self.augment_fn if mode == 'train' else None
data_source = self.trainval_data_source if mode in ['train', 'val'] else self.test_data_source
if str(data_source).lower() == 'megadepth':
npz_names = [f'{n}.npz' for n in npz_names]
for npz_name in tqdm(npz_names,
desc=f'[rank:{self.rank}] loading {mode} datasets',
disable=int(self.rank) != 0):
# `ScanNetDataset`/`MegaDepthDataset` load all data from npz_path when initialized, which might take time.
npz_path = osp.join(npz_dir, npz_name)
if data_source == 'ScanNet':
datasets.append(
ScanNetDataset(data_root,
npz_path,
intrinsic_path,
mode=mode,
min_overlap_score=min_overlap_score,
augment_fn=augment_fn,
pose_dir=pose_dir))
elif data_source == 'MegaDepth':
datasets.append(
MegaDepthDataset(data_root,
npz_path,
mode=mode,
min_overlap_score=min_overlap_score,
img_resize=self.mgdpt_img_resize,
df=self.mgdpt_df,
img_padding=self.mgdpt_img_pad,
depth_padding=self.mgdpt_depth_pad,
augment_fn=augment_fn,
coarse_scale=self.coarse_scale))
else:
raise NotImplementedError()
return ConcatDataset(datasets)
def _build_concat_dataset_parallel(
self,
data_root,
npz_names,
npz_dir,
intrinsic_path,
mode,
min_overlap_score=0.,
pose_dir=None,
):
augment_fn = self.augment_fn if mode == 'train' else None
data_source = self.trainval_data_source if mode in ['train', 'val'] else self.test_data_source
if str(data_source).lower() == 'megadepth':
npz_names = [f'{n}.npz' for n in npz_names]
with tqdm_joblib(tqdm(desc=f'[rank:{self.rank}] loading {mode} datasets',
total=len(npz_names), disable=int(self.rank) != 0)):
if data_source == 'ScanNet':
datasets = Parallel(n_jobs=math.floor(len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()))(
delayed(lambda x: _build_dataset(
ScanNetDataset,
data_root,
osp.join(npz_dir, x),
intrinsic_path,
mode=mode,
min_overlap_score=min_overlap_score,
augment_fn=augment_fn,
pose_dir=pose_dir))(name)
for name in npz_names)
elif data_source == 'MegaDepth':
# TODO: _pickle.PicklingError: Could not pickle the task to send it to the workers.
raise NotImplementedError()
datasets = Parallel(n_jobs=math.floor(len(os.sched_getaffinity(0)) * 0.9 / comm.get_local_size()))(
delayed(lambda x: _build_dataset(
MegaDepthDataset,
data_root,
osp.join(npz_dir, x),
mode=mode,
min_overlap_score=min_overlap_score,
img_resize=self.mgdpt_img_resize,
df=self.mgdpt_df,
img_padding=self.mgdpt_img_pad,
depth_padding=self.mgdpt_depth_pad,
augment_fn=augment_fn,
coarse_scale=self.coarse_scale))(name)
for name in npz_names)
else:
raise ValueError(f'Unknown dataset: {data_source}')
return ConcatDataset(datasets)
def train_dataloader(self):
""" Build training dataloader for ScanNet / MegaDepth. """
assert self.data_sampler in ['scene_balance']
logger.info(f'[rank:{self.rank}/{self.world_size}]: Train Sampler and DataLoader re-init (should not re-init between epochs!).')
if self.data_sampler == 'scene_balance':
sampler = RandomConcatSampler(self.train_dataset,
self.n_samples_per_subset,
self.subset_replacement,
self.shuffle, self.repeat, self.seed)
else:
sampler = None
dataloader = DataLoader(self.train_dataset, sampler=sampler, **self.train_loader_params)
return dataloader
def val_dataloader(self):
""" Build validation dataloader for ScanNet / MegaDepth. """
logger.info(f'[rank:{self.rank}/{self.world_size}]: Val Sampler and DataLoader re-init.')
if not isinstance(self.val_dataset, abc.Sequence):
sampler = DistributedSampler(self.val_dataset, shuffle=False)
return DataLoader(self.val_dataset, sampler=sampler, **self.val_loader_params)
else:
dataloaders = []
for dataset in self.val_dataset:
sampler = DistributedSampler(dataset, shuffle=False)
dataloaders.append(DataLoader(dataset, sampler=sampler, **self.val_loader_params))
return dataloaders
def test_dataloader(self, *args, **kwargs):
logger.info(f'[rank:{self.rank}/{self.world_size}]: Test Sampler and DataLoader re-init.')
sampler = DistributedSampler(self.test_dataset, shuffle=False)
return DataLoader(self.test_dataset, sampler=sampler, **self.test_loader_params)
def _build_dataset(dataset: Dataset, *args, **kwargs):
return dataset(*args, **kwargs)
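# --- Wiring sketch (added; hedged) ----------------------------------------------
# How this DataModule is presumably plugged into a PyTorch Lightning 1.3 Trainer,
# mirroring the flags passed by the reproduce_* scripts. The config module path
# and the `PL_LoFTR` model are assumptions based on the rest of this dump.
if __name__ == "__main__":
    from types import SimpleNamespace
    from src.config.default import get_cfg_defaults  # assumed module path

    args = SimpleNamespace(batch_size=1, num_workers=4, pin_memory=True)
    config = get_cfg_defaults()  # DATASET.* roots must be filled in before `setup()`
    data_module = MultiSceneDataModule(args, config)
    trainer = pl.Trainer(gpus=-1, num_nodes=1, accelerator='ddp', max_epochs=30)
    # trainer.fit(model, datamodule=data_module)  # `model` is a PL_LoFTR instance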
from collections import defaultdict
import pprint
from loguru import logger
from pathlib import Path
import torch
import numpy as np
import pytorch_lightning as pl
from matplotlib import pyplot as plt
from src.loftr import LoFTR
from src.loftr.utils.supervision import compute_supervision_coarse, compute_supervision_fine
from src.losses.loftr_loss import LoFTRLoss
from src.optimizers import build_optimizer, build_scheduler
from src.utils.metrics import (
compute_symmetrical_epipolar_errors,
compute_pose_errors,
aggregate_metrics
)
from src.utils.plotting import make_matching_figures
from src.utils.comm import gather, all_gather
from src.utils.misc import lower_config, flattenList
from src.utils.profiler import PassThroughProfiler
class PL_LoFTR(pl.LightningModule):
def __init__(self, config, pretrained_ckpt=None, profiler=None, dump_dir=None):
"""
TODO:
- use the new version of PL logging API.
"""
super().__init__()
# Misc
self.config = config # full config
_config = lower_config(self.config)
self.loftr_cfg = lower_config(_config['loftr'])
self.profiler = profiler or PassThroughProfiler()
self.n_vals_plot = max(config.TRAINER.N_VAL_PAIRS_TO_PLOT // config.TRAINER.WORLD_SIZE, 1)
# Matcher: LoFTR
self.matcher = LoFTR(config=_config['loftr'])
self.loss = LoFTRLoss(_config)
# Pretrained weights
if pretrained_ckpt:
state_dict = torch.load(pretrained_ckpt, map_location='cpu')['state_dict']
self.matcher.load_state_dict(state_dict, strict=True)
logger.info(f"Load \'{pretrained_ckpt}\' as pretrained checkpoint")
# Testing
self.dump_dir = dump_dir
def configure_optimizers(self):
# FIXME: The scheduler did not work properly when `--resume_from_checkpoint`
optimizer = build_optimizer(self, self.config)
scheduler = build_scheduler(self.config, optimizer)
return [optimizer], [scheduler]
def optimizer_step(
self, epoch, batch_idx, optimizer, optimizer_idx,
optimizer_closure, on_tpu, using_native_amp, using_lbfgs):
# learning rate warm up
warmup_step = self.config.TRAINER.WARMUP_STEP
if self.trainer.global_step < warmup_step:
if self.config.TRAINER.WARMUP_TYPE == 'linear':
base_lr = self.config.TRAINER.WARMUP_RATIO * self.config.TRAINER.TRUE_LR
lr = base_lr + \
(self.trainer.global_step / self.config.TRAINER.WARMUP_STEP) * \
abs(self.config.TRAINER.TRUE_LR - base_lr)
for pg in optimizer.param_groups:
pg['lr'] = lr
elif self.config.TRAINER.WARMUP_TYPE == 'constant':
pass
else:
raise ValueError(f'Unknown lr warm-up strategy: {self.config.TRAINER.WARMUP_TYPE}')
# update params
optimizer.step(closure=optimizer_closure)
optimizer.zero_grad()
def _trainval_inference(self, batch):
with self.profiler.profile("Compute coarse supervision"):
compute_supervision_coarse(batch, self.config)
with self.profiler.profile("LoFTR"):
self.matcher(batch)
with self.profiler.profile("Compute fine supervision"):
compute_supervision_fine(batch, self.config)
with self.profiler.profile("Compute losses"):
self.loss(batch)
def _compute_metrics(self, batch):
with self.profiler.profile("Copmute metrics"):
compute_symmetrical_epipolar_errors(batch) # compute epi_errs for each match
compute_pose_errors(batch, self.config) # compute R_errs, t_errs, pose_errs for each pair
rel_pair_names = list(zip(*batch['pair_names']))
bs = batch['image0'].size(0)
metrics = {
# to filter duplicate pairs caused by DistributedSampler
'identifiers': ['#'.join(rel_pair_names[b]) for b in range(bs)],
'epi_errs': [batch['epi_errs'][batch['m_bids'] == b].cpu().numpy() for b in range(bs)],
'R_errs': batch['R_errs'],
't_errs': batch['t_errs'],
'inliers': batch['inliers']}
ret_dict = {'metrics': metrics}
return ret_dict, rel_pair_names
def training_step(self, batch, batch_idx):
self._trainval_inference(batch)
# logging
if self.trainer.global_rank == 0 and self.global_step % self.trainer.log_every_n_steps == 0:
# scalars
for k, v in batch['loss_scalars'].items():
self.logger.experiment.add_scalar(f'train/{k}', v, self.global_step)
# net-params
if self.config.LOFTR.MATCH_COARSE.MATCH_TYPE == 'sinkhorn':
self.logger.experiment.add_scalar(
f'skh_bin_score', self.matcher.coarse_matching.bin_score.clone().detach().cpu().data, self.global_step)
# figures
if self.config.TRAINER.ENABLE_PLOTTING:
compute_symmetrical_epipolar_errors(batch) # compute epi_errs for each match
figures = make_matching_figures(batch, self.config, self.config.TRAINER.PLOT_MODE)
for k, v in figures.items():
self.logger.experiment.add_figure(f'train_match/{k}', v, self.global_step)
return {'loss': batch['loss']}
def training_epoch_end(self, outputs):
avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
if self.trainer.global_rank == 0:
self.logger.experiment.add_scalar(
'train/avg_loss_on_epoch', avg_loss,
global_step=self.current_epoch)
def validation_step(self, batch, batch_idx):
self._trainval_inference(batch)
ret_dict, _ = self._compute_metrics(batch)
val_plot_interval = max(self.trainer.num_val_batches[0] // self.n_vals_plot, 1)
figures = {self.config.TRAINER.PLOT_MODE: []}
if batch_idx % val_plot_interval == 0:
figures = make_matching_figures(batch, self.config, mode=self.config.TRAINER.PLOT_MODE)
return {
**ret_dict,
'loss_scalars': batch['loss_scalars'],
'figures': figures,
}
def validation_epoch_end(self, outputs):
# handle multiple validation sets
multi_outputs = [outputs] if not isinstance(outputs[0], (list, tuple)) else outputs
multi_val_metrics = defaultdict(list)
for valset_idx, outputs in enumerate(multi_outputs):
# since pl performs a sanity check at the very beginning of training
cur_epoch = self.trainer.current_epoch
if not self.trainer.resume_from_checkpoint and self.trainer.running_sanity_check:
cur_epoch = -1
# 1. loss_scalars: dict of list, on cpu
_loss_scalars = [o['loss_scalars'] for o in outputs]
loss_scalars = {k: flattenList(all_gather([_ls[k] for _ls in _loss_scalars])) for k in _loss_scalars[0]}
# 2. val metrics: dict of list, numpy
_metrics = [o['metrics'] for o in outputs]
metrics = {k: flattenList(all_gather(flattenList([_me[k] for _me in _metrics]))) for k in _metrics[0]}
# NOTE: all ranks need to run `aggregate_metrics`, but only rank-0 logs
val_metrics_4tb = aggregate_metrics(metrics, self.config.TRAINER.EPI_ERR_THR)
for thr in [5, 10, 20]:
multi_val_metrics[f'auc@{thr}'].append(val_metrics_4tb[f'auc@{thr}'])
# 3. figures
_figures = [o['figures'] for o in outputs]
figures = {k: flattenList(gather(flattenList([_me[k] for _me in _figures]))) for k in _figures[0]}
# tensorboard records only on rank 0
if self.trainer.global_rank == 0:
for k, v in loss_scalars.items():
mean_v = torch.stack(v).mean()
self.logger.experiment.add_scalar(f'val_{valset_idx}/avg_{k}', mean_v, global_step=cur_epoch)
for k, v in val_metrics_4tb.items():
self.logger.experiment.add_scalar(f"metrics_{valset_idx}/{k}", v, global_step=cur_epoch)
for k, v in figures.items():
if self.trainer.global_rank == 0:
for plot_idx, fig in enumerate(v):
self.logger.experiment.add_figure(
f'val_match_{valset_idx}/{k}/pair-{plot_idx}', fig, cur_epoch, close=True)
plt.close('all')
for thr in [5, 10, 20]:
# log on all ranks for ModelCheckpoint callback to work properly
self.log(f'auc@{thr}', torch.tensor(np.mean(multi_val_metrics[f'auc@{thr}']))) # ckpt monitors on this
def test_step(self, batch, batch_idx):
with self.profiler.profile("LoFTR"):
self.matcher(batch)
ret_dict, rel_pair_names = self._compute_metrics(batch)
with self.profiler.profile("dump_results"):
if self.dump_dir is not None:
# dump results for further analysis
keys_to_save = {'mkpts0_f', 'mkpts1_f', 'mconf', 'epi_errs'}
pair_names = list(zip(*batch['pair_names']))
bs = batch['image0'].shape[0]
dumps = []
for b_id in range(bs):
item = {}
mask = batch['m_bids'] == b_id
item['pair_names'] = pair_names[b_id]
item['identifier'] = '#'.join(rel_pair_names[b_id])
for key in keys_to_save:
item[key] = batch[key][mask].cpu().numpy()
for key in ['R_errs', 't_errs', 'inliers']:
item[key] = batch[key][b_id]
dumps.append(item)
ret_dict['dumps'] = dumps
return ret_dict
def test_epoch_end(self, outputs):
# metrics: dict of list, numpy
_metrics = [o['metrics'] for o in outputs]
metrics = {k: flattenList(gather(flattenList([_me[k] for _me in _metrics]))) for k in _metrics[0]}
# [{key: [{...}, *#bs]}, *#batch]
if self.dump_dir is not None:
Path(self.dump_dir).mkdir(parents=True, exist_ok=True)
_dumps = flattenList([o['dumps'] for o in outputs]) # [{...}, #bs*#batch]
dumps = flattenList(gather(_dumps)) # [{...}, #proc*#bs*#batch]
logger.info(f'Prediction and evaluation results will be saved to: {self.dump_dir}')
if self.trainer.global_rank == 0:
print(self.profiler.summary())
val_metrics_4tb = aggregate_metrics(metrics, self.config.TRAINER.EPI_ERR_THR)
logger.info('\n' + pprint.pformat(val_metrics_4tb))
if self.dump_dir is not None:
np.save(Path(self.dump_dir) / 'LoFTR_pred_eval', dumps)
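# --- Warm-up sketch (added; hedged) ----------------------------------------------
# A standalone restatement of the linear warm-up in `optimizer_step` above,
# useful for inspecting the schedule without a Trainer.
def warmup_lr(step: int, warmup_step: int, true_lr: float, warmup_ratio: float) -> float:
    if step >= warmup_step:
        return true_lr  # afterwards the scheduler (e.g. MultiStepLR) takes over
    base_lr = warmup_ratio * true_lr
    return base_lr + (step / warmup_step) * abs(true_lr - base_lr)

# e.g. with WARMUP_RATIO=0., WARMUP_STEP=4800 and TRUE_LR=3.75e-4:
# warmup_lr(0, 4800, 3.75e-4, 0.) == 0.0 and warmup_lr(2400, 4800, 3.75e-4, 0.) == 1.875e-4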
from .loftr import LoFTR
from .utils.cvpr_ds_config import default_cfg
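# --- Quick-start sketch (added; hedged) -------------------------------------------
# Minimal inference using the public exports above; the checkpoint path matches the
# reproduce_test scripts, and the `['state_dict']` key and `mkpts*_f` / `mconf`
# output keys match PL_LoFTR's loading and dumping code earlier in this dump.
import torch
from src.loftr import LoFTR
from src.loftr.utils.cvpr_ds_config import default_cfg

matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds.ckpt", map_location='cpu')['state_dict'])
matcher = matcher.eval()

batch = {'image0': torch.rand(1, 1, 480, 640),  # grayscale pair, (N, 1, H, W)
         'image1': torch.rand(1, 1, 480, 640)}
with torch.no_grad():
    matcher(batch)  # LoFTR writes matches into `batch` in-place
mkpts0, mkpts1, mconf = batch['mkpts0_f'], batch['mkpts1_f'], batch['mconf']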