Commit 5ed5979f authored by bailuo's avatar bailuo
Browse files

readme

parents
Pipeline #3043 failed with stages
in 0 seconds
# Training config for LoFTR with the optimal-transport (Sinkhorn) coarse matcher.
# Imports the project-wide default config and overrides a subset of fields;
# anything not set here keeps its value from src/config/default.py.
from src.config.default import _CN as cfg
# NOTE(review): presumably keeps the legacy (pre-fix) coarse-matching temperature
# behaviour for checkpoint compatibility -- confirm against src/config/default.py.
cfg.LOFTR.COARSE.TEMP_BUG_FIX = False
# Coarse-level matcher: differentiable optimal transport (Sinkhorn iterations).
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'sinkhorn'
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Training config for LoFTR: optimal-transport (Sinkhorn) coarse matcher
# with SPARSE_SPVS disabled.  Overrides a subset of src/config/default.py.
from src.config.default import _CN as cfg
# NOTE(review): presumably keeps the legacy (pre-fix) coarse-matching temperature
# behaviour for checkpoint compatibility -- confirm against src/config/default.py.
cfg.LOFTR.COARSE.TEMP_BUG_FIX = False
# Coarse-level matcher: differentiable optimal transport (Sinkhorn iterations).
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'sinkhorn'
# Toggles the coarse-supervision variant; see "Updated Training Strategy" in
# the training docs for the semantics of this flag.
cfg.LOFTR.MATCH_COARSE.SPARSE_SPVS = False
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Training config for LoFTR with the dual-softmax coarse matcher.
# Imports the project-wide default config and overrides a subset of fields;
# anything not set here keeps its value from src/config/default.py.
from src.config.default import _CN as cfg
# Coarse-level matcher: dual-softmax over the score matrix.
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Training config for LoFTR: dual-softmax coarse matcher with SPARSE_SPVS
# disabled.  Overrides a subset of src/config/default.py.
from src.config.default import _CN as cfg
# Coarse-level matcher: dual-softmax over the score matrix.
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'
# Toggles the coarse-supervision variant; see "Updated Training Strategy" in
# the training docs for the semantics of this flag.
cfg.LOFTR.MATCH_COARSE.SPARSE_SPVS = False
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Training config for LoFTR with the optimal-transport (Sinkhorn) coarse matcher.
# Unlike the similar config earlier in this dump, this one does NOT override
# TEMP_BUG_FIX, so that flag keeps its default from src/config/default.py.
from src.config.default import _CN as cfg
# Coarse-level matcher: differentiable optimal transport (Sinkhorn iterations).
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'sinkhorn'
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Training config for LoFTR: optimal-transport (Sinkhorn) coarse matcher with
# SPARSE_SPVS disabled.  TEMP_BUG_FIX is not overridden here and keeps its
# default from src/config/default.py.
from src.config.default import _CN as cfg
# Coarse-level matcher: differentiable optimal transport (Sinkhorn iterations).
cfg.LOFTR.MATCH_COARSE.MATCH_TYPE = 'sinkhorn'
# Toggles the coarse-supervision variant; see "Updated Training Strategy" in
# the training docs for the semantics of this flag.
cfg.LOFTR.MATCH_COARSE.SPARSE_SPVS = False
# Base learning rate; presumably scaled by the trainer with the actual
# batch size / GPU count -- TODO confirm in the trainer code.
cfg.TRAINER.CANONICAL_LR = 8e-3
cfg.TRAINER.WARMUP_STEP = 1875 # 3 epochs
cfg.TRAINER.WARMUP_RATIO = 0.1
# Epoch milestones for the multi-step LR scheduler.
cfg.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24]
# pose estimation
cfg.TRAINER.RANSAC_PIXEL_THR = 0.5
cfg.TRAINER.OPTIMIZER = "adamw"
cfg.TRAINER.ADAMW_DECAY = 0.1
# Keep only 30% of coarse-level ground-truth matches during training,
# presumably to bound memory use -- TODO confirm semantics.
cfg.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3
# Ignore everything in this directory
*
# Except this file
!.gitignore
# Ignore everything in this directory
*
# Except this file
!.gitignore
# Ignore everything in this directory
*
# Except this file
!.gitignore
/mnt/lustre/share/3dv/dataset/scannet/out/output
# Banner printed when the demo starts (markdown links render on the project page).
front_matter = """
------------------------------------------------------------------------
Online demo for [LoFTR](https://zju3dv.github.io/loftr/).
This demo is heavily inspired by [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork/).
We thank the authors for their excellent work.
------------------------------------------------------------------------
"""

import os
import sys
import argparse
from pathlib import Path

import cv2
import torch
import numpy as np
import matplotlib.cm as cm

sys.path.append("../")  # Add the project directory so `src` and `demo` are importable

from src.loftr import LoFTR, default_cfg
from src.config.default import get_cfg_defaults

try:
    # utils.py is fetched from the SuperGlue repo by run_demo.sh (licence
    # restrictions prevent shipping it with this project).
    from demo.utils import (AverageTimer, VideoStreamer,
                            make_matching_plot_fast, make_matching_plot, frame2tensor)
except ImportError:
    # Catch only ImportError (the original bare `except` also swallowed
    # KeyboardInterrupt/SystemExit) and re-raise with setup instructions.
    raise ImportError("This demo requires utils.py from SuperGlue, please use run_demo.sh to start this script.")

# Inference-only script: disable autograd globally.
torch.set_grad_enabled(False)
# Main entry point: parse CLI options, load the LoFTR matcher, then stream
# frames and visualize matches between a reference frame and the live frame.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='LoFTR online demo',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--weight', type=str, help="Path to the checkpoint.")
    parser.add_argument(
        '--input', type=str, default='0',
        help='ID of a USB webcam, URL of an IP camera, '
             'or path to an image directory or movie file')
    parser.add_argument(
        '--output_dir', type=str, default=None,
        help='Directory where to write output frames (If None, no output)')
    parser.add_argument(
        '--image_glob', type=str, nargs='+', default=['*.png', '*.jpg', '*.jpeg'],
        help='Glob if a directory of images is specified')
    parser.add_argument(
        '--skip', type=int, default=1,
        help='Images to skip if input is a movie or directory')
    parser.add_argument(
        '--max_length', type=int, default=1000000,
        help='Maximum length if input is a movie or directory')
    parser.add_argument(
        '--resize', type=int, nargs='+', default=[640, 480],
        help='Resize the input image before running inference. If two numbers, '
             'resize to the exact dimensions, if one number, resize the max '
             'dimension, if -1, do not resize')
    parser.add_argument(
        '--no_display', action='store_true',
        help='Do not display images to screen. Useful if running remotely')
    parser.add_argument(
        '--save_video', action='store_true',
        help='Save output (with match visualizations) to a video.')
    parser.add_argument(
        '--save_input', action='store_true',
        help='Save the input images to a video (for gathering repeatable input source).')
    parser.add_argument(
        '--skip_frames', type=int, default=1,
        help="Skip frames from webcam input.")
    parser.add_argument(
        '--top_k', type=int, default=2000, help="The max vis_range (please refer to the code).")
    parser.add_argument(
        '--bottom_k', type=int, default=0, help="The min vis_range (please refer to the code).")
    opt = parser.parse_args()
    print(front_matter)
    parser.print_help()

    # Normalize --resize into [W, H], [max_dim], or [-1]/[0] meaning "no resize".
    if len(opt.resize) == 2 and opt.resize[1] == -1:
        opt.resize = opt.resize[0:1]
    if len(opt.resize) == 2:
        print('Will resize to {}x{} (WxH)'.format(
            opt.resize[0], opt.resize[1]))
    elif len(opt.resize) == 1 and opt.resize[0] > 0:
        print('Will resize max dimension to {}'.format(opt.resize[0]))
    elif len(opt.resize) == 1:
        print('Will not resize images')
    else:
        raise ValueError('Cannot specify more than two integers for --resize')

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        raise RuntimeError("GPU is required to run this demo.")

    # Initialize LoFTR with the default config and the given checkpoint.
    matcher = LoFTR(config=default_cfg)
    matcher.load_state_dict(torch.load(opt.weight)['state_dict'])
    matcher = matcher.eval().to(device=device)

    # Configure I/O: optional writers for the match video and the raw input video.
    if opt.save_video:
        print('Writing video to loftr-matches.mp4...')
        writer = cv2.VideoWriter('loftr-matches.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 15, (640*2 + 10, 480))
    if opt.save_input:
        print('Writing video to demo-input.mp4...')
        input_writer = cv2.VideoWriter('demo-input.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 15, (640, 480))

    vs = VideoStreamer(opt.input, opt.resize, opt.skip,
                       opt.image_glob, opt.max_length)
    frame, ret = vs.next_frame()
    assert ret, 'Error when reading the first frame (try different --input?)'

    frame_id = 0
    last_image_id = 0
    # The first frame becomes the initial reference ("image0", left image).
    frame_tensor = frame2tensor(frame, device)
    last_data = {'image0': frame_tensor}
    last_frame = frame

    if opt.output_dir is not None:
        print('==> Will write outputs to {}'.format(opt.output_dir))
        Path(opt.output_dir).mkdir(exist_ok=True)

    # Create a window to display the demo.
    if not opt.no_display:
        window_name = 'LoFTR Matches'
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(window_name, (640*2, 480))
    else:
        print('Skipping visualization, will not show a GUI.')

    # Print the keyboard help menu.
    print('==> Keyboard control:\n'
          '\tn: select the current frame as the reference image (left)\n'
          '\td/f: move the range of the matches (ranked by confidence) to visualize\n'
          '\tc/v: increase/decrease the length of the visualization range (i.e., total number of matches) to show\n'
          '\tq: quit')

    timer = AverageTimer()
    # [low, high) slice of the confidence-ranked matches to draw.
    vis_range = [opt.bottom_k, opt.top_k]

    while True:
        frame_id += 1
        frame, ret = vs.next_frame()
        # BUGFIX: check end-of-stream *before* touching `frame`; the original
        # ran the --save_input path first and crashed on a None frame.
        if not ret:
            print('Finished demo_loftr.py')
            break
        if frame_id % opt.skip_frames != 0:
            continue
        if opt.save_input:
            # The streamer yields grayscale frames; expand to 3 channels for the writer.
            input_writer.write(cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB))
        timer.update('data')
        stem0, stem1 = last_image_id, vs.i - 1

        frame_tensor = frame2tensor(frame, device)
        last_data = {**last_data, 'image1': frame_tensor}
        # LoFTR writes its outputs (mkpts0_f/mkpts1_f/mconf) into this dict in place.
        matcher(last_data)
        total_n_matches = len(last_data['mkpts0_f'])
        mkpts0 = last_data['mkpts0_f'].cpu().numpy()[vis_range[0]:vis_range[1]]
        mkpts1 = last_data['mkpts1_f'].cpu().numpy()[vis_range[0]:vis_range[1]]
        mconf = last_data['mconf'].cpu().numpy()[vis_range[0]:vis_range[1]]

        # Normalize confidence for colormapping.
        # BUGFIX: give conf_min/conf_max defaults so the overlay text below
        # doesn't raise NameError when no matches fall inside vis_range.
        conf_min, conf_max = 0., 0.
        if len(mconf) > 0:
            conf_vis_min = 0.
            conf_min = mconf.min()
            conf_max = mconf.max()
            mconf = (mconf - conf_vis_min) / (conf_max - conf_vis_min + 1e-5)

        timer.update('forward')

        alpha = 0
        color = cm.jet(mconf, alpha=alpha)
        text = [
            f'LoFTR',
            '# Matches (showing/total): {}/{}'.format(len(mkpts0), total_n_matches),
        ]
        small_text = [
            f'Showing matches from {vis_range[0]}:{vis_range[1]}',
            f'Confidence Range: {conf_min:.2f}:{conf_max:.2f}',
            'Image Pair: {:06}:{:06}'.format(stem0, stem1),
        ]
        out = make_matching_plot_fast(
            last_frame, frame, mkpts0, mkpts1, mkpts0, mkpts1, color, text,
            path=None, show_keypoints=False, small_text=small_text)

        # Save high quality png, optionally with dynamic alpha support (unreleased yet).
        # save_path = 'demo_vid/{:06}'.format(frame_id)
        # make_matching_plot(
        #     last_frame, frame, mkpts0, mkpts1, mkpts0, mkpts1, color, text,
        #     path=save_path, show_keypoints=opt.show_keypoints, small_text=small_text)

        if not opt.no_display:
            if opt.save_video:
                writer.write(out)
            cv2.imshow('LoFTR Matches', out)
            key = chr(cv2.waitKey(1) & 0xFF)
            if key == 'q':
                print('Exiting...')
                break
            elif key == 'n':
                # Promote the current frame to the new reference (left) image.
                last_data['image0'] = frame_tensor
                last_frame = frame
                last_image_id = (vs.i - 1)
            elif key in ['d', 'f']:
                # Shift the visualized confidence-rank window by 200 matches.
                if key == 'd':
                    if vis_range[0] >= 0:
                        vis_range[0] -= 200
                        vis_range[1] -= 200
                if key == 'f':
                    vis_range[0] += 200
                    vis_range[1] += 200
                print(f'\nChanged the vis_range to {vis_range[0]}:{vis_range[1]}')
            elif key in ['c', 'v']:
                # Shrink/grow the visualized window by 50 matches.
                if key == 'c':
                    vis_range[1] -= 50
                if key == 'v':
                    vis_range[1] += 50
                print(f'\nChanged the vis_range[1] to {vis_range[1]}')
        elif opt.output_dir is not None:
            # Headless mode: dump the visualization to disk instead.
            stem = 'matches_{:06}_{:06}'.format(stem0, stem1)
            out_file = str(Path(opt.output_dir, stem + '.png'))
            print('\nWriting image to {}'.format(out_file))
            cv2.imwrite(out_file, out)
        else:
            raise ValueError("output_dir is required when no display is given.")

        timer.update('viz')
        timer.print()

    # BUGFIX: release the writers on every exit path (the original released
    # them only on 'q', leaking the files on natural end-of-stream).
    # VideoWriter.release() is safe to call even if already released.
    if opt.save_video:
        writer.release()
    if opt.save_input:
        input_writer.release()
    cv2.destroyAllWindows()
    vs.cleanup()
#!/bin/bash
# Launcher for the LoFTR online demo (demo_loftr.py): fetches the SuperGlue
# helper module on first use, activates the conda environment, and starts the
# demo with the selected checkpoint and input source.
set -e
# set -x
# demo_loftr.py imports AverageTimer/VideoStreamer/plotting helpers from utils.py.
if [ ! -f utils.py ]; then
echo "Downloading utils.py from the SuperGlue repo."
echo "We cannot provide this file directly due to its strict licence."
wget https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/master/models/utils.py
fi
# Use webcam 0 as input source.
input=0
# or use a pre-recorded video given the path.
# input=/home/sunjiaming/Downloads/scannet_test/$scene_name.mp4
# Toggle indoor/outdoor model here.
model_ckpt=../weights/indoor_ds.ckpt
# model_ckpt=../weights/outdoor_ds.ckpt
# Optionally assign the GPU ID.
# export CUDA_VISIBLE_DEVICES=0
echo "Running LoFTR demo.."
# Make `conda activate` work in a non-interactive shell.
eval "$(conda shell.bash hook)"
conda activate loftr
python demo_loftr.py --weight $model_ckpt --input $input
# To save the input video and output match visualizations.
# python demo_loftr.py --weight $model_ckpt --input $input --save_video --save_input
# Running on remote GPU servers with no GUI.
# Save images first.
# python demo_loftr.py --weight $model_ckpt --input $input --no_display --output_dir="./demo_images/"
# Then convert them to a video.
# ffmpeg -framerate 15 -pattern_type glob -i '*.png' -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
# Training LoFTR
## Dataset setup
Generally, two parts of data are needed for training LoFTR, the original dataset, i.e., ScanNet and MegaDepth, and the offline generated dataset indices. The dataset indices store scenes, image pairs, and other metadata within each dataset used for training/validation/testing. For the MegaDepth dataset, the relative poses between images used for training are directly cached in the indexing files. However, the relative poses of ScanNet image pairs are not stored due to the enormous resulting file size.
### Download datasets
#### MegaDepth
We use depth maps provided in the [original MegaDepth dataset](https://www.cs.cornell.edu/projects/megadepth/) as well as undistorted images, corresponding camera intrinsics and extrinsics preprocessed by [D2-Net](https://github.com/mihaidusmanu/d2-net#downloading-and-preprocessing-the-megadepth-dataset). You can download them separately from the following links.
- [MegaDepth undistorted images and processed depths](https://www.cs.cornell.edu/projects/megadepth/dataset/Megadepth_v1/MegaDepth_v1.tar.gz)
- Note that we only use depth maps.
    - Path of the download data will be referred to as `/path/to/megadepth`
- [D2-Net preprocessed images](https://drive.google.com/drive/folders/1hxpOsqOZefdrba_BqnW490XpNX_LgXPB)
- Images are undistorted manually in D2-Net since the undistorted images from MegaDepth do not come with corresponding intrinsics.
    - Path of the download data will be referred to as `/path/to/megadepth_d2net`
#### ScanNet
Please set up the ScanNet dataset following [the official guide](https://github.com/ScanNet/ScanNet#scannet-data)
> NOTE: We use the [python exported data](https://github.com/ScanNet/ScanNet/tree/master/SensReader/python),
instead of the [c++ exported one](https://github.com/ScanNet/ScanNet/tree/master/SensReader/c%2B%2B).
### Download the dataset indices
You can download the required dataset indices from the [following link](https://drive.google.com/drive/folders/1DOcOPZb3-5cWxLqn256AhwUVjBPifhuf).
After downloading, unzip the required files.
```shell
unzip downloaded-file.zip
# extract dataset indices
tar xf train-data/megadepth_indices.tar
tar xf train-data/scannet_indices.tar
# extract testing data (optional)
tar xf testdata/megadepth_test_1500.tar
tar xf testdata/scannet_test_1500.tar
```
### Build the dataset symlinks
We symlink the datasets to the `data` directory under the main LoFTR project directory.
```shell
# scannet
# -- # train and test dataset
ln -s /path/to/scannet_train/* /path/to/LoFTR/data/scannet/train
ln -s /path/to/scannet_test/* /path/to/LoFTR/data/scannet/test
# -- # dataset indices
ln -s /path/to/scannet_indices/* /path/to/LoFTR/data/scannet/index
# megadepth
# -- # train and test dataset (train and test share the same dataset)
ln -sv /path/to/megadepth/phoenix /path/to/megadepth_d2net/Undistorted_SfM /path/to/LoFTR/data/megadepth/train
ln -sv /path/to/megadepth/phoenix /path/to/megadepth_d2net/Undistorted_SfM /path/to/LoFTR/data/megadepth/test
# -- # dataset indices
ln -s /path/to/megadepth_indices/* /path/to/LoFTR/data/megadepth/index
```
## Training
We provide training scripts of ScanNet and MegaDepth. The results in the LoFTR paper can be reproduced with 32/64 GPUs with at least 11GB of RAM for ScanNet, and 8/16 GPUs with at least 24GB of RAM for MegaDepth. For a different setup (e.g., training with 4 gpus on ScanNet), we scale the learning rate and its warm-up linearly, but the final evaluation results might vary due to the different batch size & learning rate used. Thus the reproduction of results in our paper is not guaranteed.
Training scripts of the optimal-transport matcher end with "_ot" and ones of the dual-softmax matcher end with "_ds".
The released training scripts use smaller setups compared to the ones used for training the released models. You could manually scale the setup (e.g., using 32 gpus instead of 4) to reproduce our results.
### Training on ScanNet
``` shell
scripts/reproduce_train/indoor_ds.sh
```
> NOTE: It uses 4 gpus only. Reproduction of paper results is not guaranteed under this setup.
### Training on MegaDepth
``` shell
scripts/reproduce_train/outdoor_ds.sh
```
> NOTE: It uses 4 gpus only, with smaller image sizes of 640x640. Reproduction of paper results is not guaranteed under this setup.
## Updated Training Strategy
In the released training code, we use a slightly modified version of the coarse-level training supervision compared to the one described in our paper.
For example, as described in our paper, we only supervise the ground-truth positive matches when training the dual-softmax model. However, the entire confidence matrix produced by the dual-softmax matcher is supervised by default in the released code, regardless of the use of softmax operators. This implementation is counter-intuitive and unusual but leads to better evaluation results on estimating relative camera poses. The same phenomenon applies to the optimal-transport matcher version as well. Note that we don't supervise the dustbin rows and columns under the dense supervision setup.
> NOTE: To use the sparse supervision described in our paper, set `_CN.LOFTR.MATCH_COARSE.SPARSE_SPVS = False`.
name: loftr
channels:
# - https://dx-mirrors.sensetime.com/anaconda/cloud/pytorch
- pytorch
- conda-forge
- defaults
dependencies:
- python=3.8
- cudatoolkit=10.2
- pytorch=1.8.1
- pip
- pip:
- -r requirements.txt
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment