add preprocessing

3d92aebb · bailuo · fcc0bcf3 · 3d92aebb · 3d92aebb · 3d92aebb
Commit 3d92aebb authored Jul 16, 2024 by bailuo
20 changed files
--- a/preprocessing/RAFT/filter_raft.py
+++ b/preprocessing/RAFT/filter_raft.py
+"""
+This script filters the raft optical flow using cycle consistency and appearance consistency
+checks (using dino features), and produces the following files:
+
+raft masks: h x w x 3 for each pair of flows, first channel stores the mask for cycle consistency,
+            second channel stores the mask for occlusion (i.e., regions that are detected as occluded
+            but where the prediction is likely to be reliable, using double cycle consistency checks).
+count_maps: h x w for each frame (uint16), contains the number of valid correspondences for each pixel
+            across all frames.
+flow_stats.json: contains the total number of valid correspondences between each pair of frames.
+"""
+
+import json
+import argparse
+import os
+import glob
+import imageio
+import numpy as np
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+from chain_raft import gen_grid, normalize_coords
+import warnings
+
+warnings.filterwarnings("ignore")  # silence all Python warnings for this script
+
+DEVICE = 'cuda'  # torch device used for the pixel grid and the feature tensors in run_filtering
+
+
+def run_filtering(args):
+    feature_name = 'dino'
+    scene_dir = args.data_dir
+    print('flitering raft optical flow for {}....'.format(scene_dir))
+
+    img_files = sorted(glob.glob(os.path.join(scene_dir, 'color', '*')))
+    num_imgs = len(img_files)
+    pbar = tqdm(total=num_imgs * (num_imgs - 1))
+
+    out_flow_stats_file = os.path.join(scene_dir, 'flow_stats.json')
+    out_dir = os.path.join(scene_dir, 'raft_masks')
+    os.makedirs(out_dir, exist_ok=True)
+
+    count_out_dir = os.path.join(scene_dir, 'count_maps')
+    os.makedirs(count_out_dir, exist_ok=True)
+
+    h, w = imageio.imread(img_files[0]).shape[:2]
+    grid = gen_grid(h, w, device=DEVICE).permute(2, 0, 1)[None]
+    grid_normed = normalize_coords(grid.squeeze().permute(1, 2, 0), h, w)  # [h, w, 2]
+
+    features = [torch.from_numpy(np.load(os.path.join(scene_dir, 'features', feature_name,
+                                                      os.path.basename(img_file) + '.npy'))).float().to(DEVICE)
+                for img_file in img_files]
+
+    flow_stats = {}
+    count_maps = np.zeros((num_imgs, h, w), dtype=np.uint16)
+    for i in range(num_imgs):
+        imgname_i = os.path.basename(img_files[i])
+        feature_i = features[i].permute(2, 0, 1)[None]
+        feature_i_sampled = F.grid_sample(feature_i, grid_normed[None], align_corners=True)[0].permute(1, 2, 0)
+
+        for j in range(num_imgs):
+            if i == j:
+                continue
+            frame_interval = abs(i - j)
+            imgname_j = os.path.basename(img_files[j])
+            flow_f = np.load(os.path.join(scene_dir, 'raft_exhaustive', '{}_{}.npy'.format(imgname_i, imgname_j)))
+            flow_f = torch.from_numpy(flow_f).float().permute(2, 0, 1)[None].cuda()
+            flow_b = np.load(os.path.join(scene_dir, 'raft_exhaustive', '{}_{}.npy'.format(imgname_j, imgname_i)))
+            flow_b = torch.from_numpy(flow_b).float().permute(2, 0, 1)[None].cuda()
+
+            coord2 = flow_f + grid
+            coord2_normed = normalize_coords(coord2.squeeze().permute(1, 2, 0), h, w)  # [h, w, 2]
+            flow_21_sampled = F.grid_sample(flow_b, coord2_normed[None], align_corners=True)
+            map_i = flow_f + flow_21_sampled
+            fb_discrepancy = torch.norm(map_i.squeeze(), dim=0)
+            mask_cycle = fb_discrepancy < args.cycle_th
+
+            feature_j = features[j].permute(2, 0, 1)[None]
+            feature_j_sampled = F.grid_sample(feature_j, coord2_normed[None], align_corners=True)[0].permute(1, 2, 0)
+            feature_sim = torch.cosine_similarity(feature_i_sampled, feature_j_sampled, dim=-1)
+            feature_mask = feature_sim > 0.5
+
+            mask_cycle = mask_cycle * feature_mask if frame_interval >= 3 else mask_cycle
+
+            # only keep correspondences for occluded pixels if the correspondences are
+            # inconsistent in the first cycle but consistent in the second cycle
+            # and if the two frames are adjacent enough (interval < 3)
+            if frame_interval < 3:
+                coord_21 = grid + map_i  # [1, 2, h, w]
+                coord_21_normed = normalize_coords(coord_21.squeeze().permute(1, 2, 0), h, w)  # [h, w, 2]
+                flow_22 = F.grid_sample(flow_f, coord_21_normed[None], align_corners=True)
+                fbf_discrepancy = torch.norm((coord_21 + flow_22 - flow_f - grid).squeeze(), dim=0)
+                mask_in_range = (coord2_normed.min(dim=-1)[0] >= -1) * (coord2_normed.max(dim=-1)[0] <= 1)
+                mask_occluded = (fbf_discrepancy < args.cycle_th) * (fb_discrepancy > args.cycle_th * 1.5)
+                mask_occluded *= mask_in_range
+            else:
+                mask_occluded = torch.zeros_like(mask_cycle)
+
+            out_mask = torch.stack([mask_cycle, mask_occluded, torch.zeros_like(mask_cycle)], dim=-1).cpu().numpy()
+            imageio.imwrite('{}/{}_{}.png'.format(out_dir, imgname_i, imgname_j), (255 * out_mask.astype(np.uint8)))
+
+            if not imgname_i in flow_stats.keys():
+                flow_stats[imgname_i] = {}
+            flow_stats[imgname_i][imgname_j] = np.sum(out_mask).item()
+            count_maps[i] += out_mask.sum(axis=-1).astype(np.uint16)
+            pbar.update(1)
+
+    pbar.close()
+    with open(out_flow_stats_file, 'w') as fp:
+        json.dump(flow_stats, fp)
+
+    for i in range(num_imgs):
+        img_name = os.path.basename(img_files[i])
+        imageio.imwrite(os.path.join(count_out_dir, img_name.replace('.jpg', '.png')), count_maps[i])
+
+    print('filtering raft optical flow for {} is done\n'.format(scene_dir))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', type=str, default='', help='dataset dir')
+    parser.add_argument('--cycle_th', type=float, default=3., help='threshold for cycle consistency error')
+    args = parser.parse_args()
+
+    run_filtering(args)
--- a/preprocessing/RAFT/models-20240527T075142Z-001.zip
+++ b/preprocessing/RAFT/models-20240527T075142Z-001.zip
--- a/preprocessing/RAFT/models/raft-chairs.pth
+++ b/preprocessing/RAFT/models/raft-chairs.pth
--- a/preprocessing/RAFT/models/raft-kitti.pth
+++ b/preprocessing/RAFT/models/raft-kitti.pth
--- a/preprocessing/RAFT/models/raft-sintel.pth
+++ b/preprocessing/RAFT/models/raft-sintel.pth
--- a/preprocessing/RAFT/models/raft-small.pth
+++ b/preprocessing/RAFT/models/raft-small.pth
--- a/preprocessing/RAFT/models/raft-things.pth
+++ b/preprocessing/RAFT/models/raft-things.pth
--- a/preprocessing/RAFT/train.py
+++ b/preprocessing/RAFT/train.py
+from __future__ import print_function, division
+import sys
+sys.path.append('core')
+
+import argparse
+import os
+import cv2
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+
+from torch.utils.data import DataLoader
+from raft import RAFT
+import evaluate
+import datasets
+
+from torch.utils.tensorboard import SummaryWriter
+
+try:
+    from torch.cuda.amp import GradScaler
+except:
+    # dummy GradScaler for PyTorch < 1.6
+    class GradScaler:
+        def __init__(self):
+            pass
+        def scale(self, loss):
+            return loss
+        def unscale_(self, optimizer):
+            pass
+        def step(self, optimizer):
+            optimizer.step()
+        def update(self):
+            pass
+
+
+# exclude extremely large displacements
+MAX_FLOW = 400
+SUM_FREQ = 100  # Logger prints and resets its running metrics every SUM_FREQ steps
+VAL_FREQ = 5000  # intended checkpoint/validation interval, in training steps
+
+
+def sequence_loss(flow_preds, flow_gt, valid, gamma=0.8, max_flow=MAX_FLOW):
+    """ Loss function defined over sequence of flow predictions """
+
+    n_predictions = len(flow_preds)    
+    flow_loss = 0.0
+
+    # exlude invalid pixels and extremely large diplacements
+    mag = torch.sum(flow_gt**2, dim=1).sqrt()
+    valid = (valid >= 0.5) & (mag < max_flow)
+
+    for i in range(n_predictions):
+        i_weight = gamma**(n_predictions - i - 1)
+        i_loss = (flow_preds[i] - flow_gt).abs()
+        flow_loss += i_weight * (valid[:, None] * i_loss).mean()
+
+    epe = torch.sum((flow_preds[-1] - flow_gt)**2, dim=1).sqrt()
+    epe = epe.view(-1)[valid.view(-1)]
+
+    metrics = {
+        'epe': epe.mean().item(),
+        '1px': (epe < 1).float().mean().item(),
+        '3px': (epe < 3).float().mean().item(),
+        '5px': (epe < 5).float().mean().item(),
+    }
+
+    return flow_loss, metrics
+
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def fetch_optimizer(args, model):
+    """ Create the optimizer and learning rate scheduler """
+    optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.wdecay, eps=args.epsilon)
+
+    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, args.lr, args.num_steps+100,
+        pct_start=0.05, cycle_momentum=False, anneal_strategy='linear')
+
+    return optimizer, scheduler
+    
+
+class Logger:
+    def __init__(self, model, scheduler):
+        self.model = model
+        self.scheduler = scheduler
+        self.total_steps = 0
+        self.running_loss = {}
+        self.writer = None
+
+    def _print_training_status(self):
+        metrics_data = [self.running_loss[k]/SUM_FREQ for k in sorted(self.running_loss.keys())]
+        training_str = "[{:6d}, {:10.7f}] ".format(self.total_steps+1, self.scheduler.get_last_lr()[0])
+        metrics_str = ("{:10.4f}, "*len(metrics_data)).format(*metrics_data)
+        
+        # print the training status
+        print(training_str + metrics_str)
+
+        if self.writer is None:
+            self.writer = SummaryWriter()
+
+        for k in self.running_loss:
+            self.writer.add_scalar(k, self.running_loss[k]/SUM_FREQ, self.total_steps)
+            self.running_loss[k] = 0.0
+
+    def push(self, metrics):
+        self.total_steps += 1
+
+        for key in metrics:
+            if key not in self.running_loss:
+                self.running_loss[key] = 0.0
+
+            self.running_loss[key] += metrics[key]
+
+        if self.total_steps % SUM_FREQ == SUM_FREQ-1:
+            self._print_training_status()
+            self.running_loss = {}
+
+    def write_dict(self, results):
+        if self.writer is None:
+            self.writer = SummaryWriter()
+
+        for key in results:
+            self.writer.add_scalar(key, results[key], self.total_steps)
+
+    def close(self):
+        self.writer.close()
+
+
+def train(args):
+
+    model = nn.DataParallel(RAFT(args), device_ids=args.gpus)
+    print("Parameter Count: %d" % count_parameters(model))
+
+    if args.restore_ckpt is not None:
+        model.load_state_dict(torch.load(args.restore_ckpt), strict=False)
+
+    model.cuda()
+    model.train()
+
+    if args.stage != 'chairs':
+        model.module.freeze_bn()
+
+    train_loader = datasets.fetch_dataloader(args)
+    optimizer, scheduler = fetch_optimizer(args, model)
+
+    total_steps = 0
+    scaler = GradScaler(enabled=args.mixed_precision)
+    logger = Logger(model, scheduler)
+
+    VAL_FREQ = 5000
+    add_noise = True
+
+    should_keep_training = True
+    while should_keep_training:
+
+        for i_batch, data_blob in enumerate(train_loader):
+            optimizer.zero_grad()
+            image1, image2, flow, valid = [x.cuda() for x in data_blob]
+
+            if args.add_noise:
+                stdv = np.random.uniform(0.0, 5.0)
+                image1 = (image1 + stdv * torch.randn(*image1.shape).cuda()).clamp(0.0, 255.0)
+                image2 = (image2 + stdv * torch.randn(*image2.shape).cuda()).clamp(0.0, 255.0)
+
+            flow_predictions = model(image1, image2, iters=args.iters)            
+
+            loss, metrics = sequence_loss(flow_predictions, flow, valid, args.gamma)
+            scaler.scale(loss).backward()
+            scaler.unscale_(optimizer)                
+            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
+            
+            scaler.step(optimizer)
+            scheduler.step()
+            scaler.update()
+
+            logger.push(metrics)
+
+            if total_steps % VAL_FREQ == VAL_FREQ - 1:
+                PATH = 'checkpoints/%d_%s.pth' % (total_steps+1, args.name)
+                torch.save(model.state_dict(), PATH)
+
+                results = {}
+                for val_dataset in args.validation:
+                    if val_dataset == 'chairs':
+                        results.update(evaluate.validate_chairs(model.module))
+                    elif val_dataset == 'sintel':
+                        results.update(evaluate.validate_sintel(model.module))
+                    elif val_dataset == 'kitti':
+                        results.update(evaluate.validate_kitti(model.module))
+
+                logger.write_dict(results)
+                
+                model.train()
+                if args.stage != 'chairs':
+                    model.module.freeze_bn()
+            
+            total_steps += 1
+
+            if total_steps > args.num_steps:
+                should_keep_training = False
+                break
+
+    logger.close()
+    PATH = 'checkpoints/%s.pth' % args.name
+    torch.save(model.state_dict(), PATH)
+
+    return PATH
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--name', default='raft', help="name your experiment")
+    parser.add_argument('--stage', help="determines which dataset to use for training") 
+    parser.add_argument('--restore_ckpt', help="restore checkpoint")
+    parser.add_argument('--small', action='store_true', help='use small model')
+    parser.add_argument('--validation', type=str, nargs='+')
+
+    parser.add_argument('--lr', type=float, default=0.00002)
+    parser.add_argument('--num_steps', type=int, default=100000)
+    parser.add_argument('--batch_size', type=int, default=6)
+    parser.add_argument('--image_size', type=int, nargs='+', default=[384, 512])
+    parser.add_argument('--gpus', type=int, nargs='+', default=[0,1])
+    parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision')
+
+    parser.add_argument('--iters', type=int, default=12)
+    parser.add_argument('--wdecay', type=float, default=.00005)
+    parser.add_argument('--epsilon', type=float, default=1e-8)
+    parser.add_argument('--clip', type=float, default=1.0)
+    parser.add_argument('--dropout', type=float, default=0.0)
+    parser.add_argument('--gamma', type=float, default=0.8, help='exponential weighting')
+    parser.add_argument('--add_noise', action='store_true')
+    args = parser.parse_args()
+
+    torch.manual_seed(1234)
+    np.random.seed(1234)
+
+    if not os.path.isdir('checkpoints'):
+        os.mkdir('checkpoints')
+
+    train(args)
\ No newline at end of file
--- a/preprocessing/RAFT/train_mixed.sh
+++ b/preprocessing/RAFT/train_mixed.sh
+#!/bin/bash
+mkdir -p checkpoints
+python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 --num_steps 120000 --batch_size 8 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --mixed_precision 
+python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 400 720 --wdecay 0.0001 --mixed_precision
+python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 --mixed_precision
+python -u train.py --name raft-kitti  --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 --num_steps 50000 --batch_size 5 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 --mixed_precision
--- a/preprocessing/RAFT/train_standard.sh
+++ b/preprocessing/RAFT/train_standard.sh
+#!/bin/bash
+mkdir -p checkpoints
+python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 1 --num_steps 100000 --batch_size 10 --lr 0.0004 --image_size 368 496 --wdecay 0.0001
+python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 400 720 --wdecay 0.0001
+python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma=0.85
+python -u train.py --name raft-kitti  --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 1 --num_steps 50000 --batch_size 6 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
+# Data processing
+
+This README file contains instructions to compute and process RAFT optical flows for optimizing OmniMotion.
+
+## Data format
+The input video data should be organized in the following format:
+```
+├──sequence_name/
+    ├──color/
+        ├──00000.jpg
+        ├──00001.jpg
+        .....
+    ├──mask/ (optional; only used for visualization purposes)
+        ├──00000.png
+        ├──00001.png
+        ..... 
+```
+If you want to run on [DAVIS](https://davischallenge.org/index.html) video sequences, you can run `python get_davis.py <out_dir>` 
+which will download the original dataset and organize it in our format for processing. Alternatively, you can 
+download some of our processed sequences [here](https://omnimotion.cs.cornell.edu/dataset/) to skip processing and directly start training.
+
+If you want to train on your own video sequence, we recommend that you start with
+shorter sequences (< 60 frames) and lower resolution (<= 480p) to manage computational cost. 
+You may use `ffmpeg` to extract frames from the video.
+
+
+## Preparation
+The commands below move files to the correct locations and download the pretrained models (this only needs to be run once).
+```
+cd preprocessing/  
+
+mv exhaustive_raft.py filter_raft.py chain_raft.py RAFT/;
+cd RAFT; ./download_models.sh; cd ../
+
+mv extract_dino_features.py dino/
+```
+
+## Computing and processing flow
+
+Run the following command to process the input video sequence. Please use an absolute path for the sequence directory.
+```
+conda activate omnimotion
+python main_processing.py --data_dir <sequence directory> --chain
+```
+The processing contains several steps:
+- computing all pairwise optical flows using `exhaustive_raft.py`
+- computing dino features for each frame using `extract_dino_features.py`
+- filtering flows using cycle consistency and appearance consistency checks using `filter_raft.py`
+- (optional) chaining only cycle consistent flows to create denser correspondences using `chain_raft.py`. 
+  We found this to be helpful for handling sequences with rapid motion and large displacements. 
+  For simple motion, this may be skipped by omitting `--chain` to save processing time. 
+
+After processing, the folder should look like the following:
+```
+├──sequence_name/
+    ├──color/
+    ├──mask/ (optional; only used for visualization purposes)
+    ├──count_maps/
+    ├──features/
+    ├──raft_exhaustive/
+    ├──raft_masks/
+    ├──flow_stats.json
+```
+
+## Discussion
+This processing pipeline is designed to filter and process RAFT optical flow for training our method. 
+Our method can also take as input correspondences from other methods, e.g., [TAPIR](https://deepmind-tapir.github.io/) and
+[CoTracker](https://co-tracker.github.io/). 
+If you want to use different correspondences as input supervision, note that their error patterns might be different from
+those of RAFT optical flow, and you may need to devise new filtering methods that are effective for the specific correspondences
+you are working with.
--- a/preprocessing/dino/LICENSE
+++ b/preprocessing/dino/LICENSE
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/preprocessing/dino/README.md
+++ b/preprocessing/dino/README.md
--- a/preprocessing/dino/__pycache__/utils.cpython-310.pyc
+++ b/preprocessing/dino/__pycache__/utils.cpython-310.pyc
--- a/preprocessing/dino/__pycache__/vision_transformer.cpython-310.pyc
+++ b/preprocessing/dino/__pycache__/vision_transformer.cpython-310.pyc
--- a/preprocessing/dino/eval_copy_detection.py
+++ b/preprocessing/dino/eval_copy_detection.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import pickle
+import argparse
+
+import torch
+from torch import nn
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torchvision import models as torchvision_models
+from torchvision import transforms as pth_transforms
+from PIL import Image, ImageFile
+import numpy as np
+
+import utils
+import vision_transformer as vits
+from eval_knn import extract_features
+
+
+class CopydaysDataset():
+    """File layout and evaluation protocol of the Copydays benchmark.
+
+    Images are looked up under ``basedir/<block name>``; block 0
+    ('original') is the database and every block is used as a query set.
+    """
+
+    def __init__(self, basedir):
+        self.basedir = basedir
+        jpeg_levels = [3, 5, 8, 10, 15, 20, 30, 50, 75]
+        crop_levels = [10, 15, 20, 30, 40, 50, 60, 70, 80]
+        names = ['original', 'strong']
+        names += ['jpegqual/%d' % level for level in jpeg_levels]
+        names += ['crops/%d' % level for level in crop_levels]
+        self.block_names = names
+        self.nblocks = len(names)
+
+        self.query_blocks = range(self.nblocks)
+        # every block holds 157 queries, except 'strong' (index 1) with 229
+        sizes = np.full(self.nblocks, 157, dtype=int)
+        sizes[1] = 229
+        self.q_block_sizes = sizes
+        # search only among originals
+        self.database_blocks = [0]
+
+    def get_block(self, i):
+        """Return the sorted '.jpg' paths (prefixed with the block directory) of block ``i``."""
+        folder = f"{self.basedir}/{self.block_names[i]}"
+        jpgs = [entry for entry in sorted(os.listdir(folder)) if entry.endswith('.jpg')]
+        return [f"{folder}/{entry}" for entry in jpgs]
+
+    def get_block_filenames(self, subdir_name):
+        """Return the sorted bare '.jpg' file names found in ``basedir/subdir_name``."""
+        folder = f"{self.basedir}/{subdir_name}"
+        listing = sorted(os.listdir(folder))
+        return [entry for entry in listing if entry.endswith('.jpg')]
+
+    def eval_result(self, ids, distances):
+        """Print the mAP of every query block.
+
+        ``ids`` holds one row of retrieved database indices per query, with
+        the query blocks stacked in ``block_names`` order. ``distances`` is
+        accepted but not used. Nothing is returned.
+        """
+        start = 0
+        for block_idx in range(self.nblocks):
+            stop = start + self.q_block_sizes[block_idx]
+            block_name = self.block_names[block_idx]
+            block_ids = ids[start:stop]   # one row per query of this block
+            n_queries = stop - start
+
+            if block_name == 'strong':
+                # a 'strong' query matches every original sharing its 4-char prefix
+                originals = self.get_block_filenames('original')
+                strongs = self.get_block_filenames('strong')
+                positives_per_query = []
+                for qname in strongs:
+                    positives_per_query.append(
+                        [j for j, bname in enumerate(originals) if bname[:4] == qname[:4]])
+            else:
+                # 1:1 mapping of files to names
+                positives_per_query = [[q] for q in range(n_queries)]
+
+            sum_AP = 0
+            for qno, retrieved_row in enumerate(block_ids):
+                positives = positives_per_query[qno]
+                hit_ranks = [rank for rank, bno in enumerate(retrieved_row) if bno in positives]
+                sum_AP += score_ap_from_ranks_1(hit_ranks, len(positives))
+
+            print("eval on %s mAP=%.3f" % (block_name, sum_AP / n_queries))
+            start = stop
+
+
+# from the Holidays evaluation package
+def score_ap_from_ranks_1(ranks, nres):
+    """Compute the average precision of one search.
+
+    The AP is the area under the precision/recall curve, accumulated as one
+    trapezoid per true positive.
+
+    Args:
+        ranks: ordered list of 0-based ranks of the true positives.
+        nres: total number of positives in the dataset for this query.
+
+    Returns:
+        The average precision as a float. A query without any positive
+        (``nres <= 0``) scores 0.0 instead of dividing by zero.
+    """
+    if nres <= 0:
+        # no positives to find: the recall step is undefined, score nothing
+        return 0.0
+
+    # accumulate trapezoids in PR-plot
+    ap = 0.0
+
+    # All have an x-size of:
+    recall_step = 1.0 / nres
+
+    for ntp, rank in enumerate(ranks):
+
+        # y-size on left side of trapezoid:
+        # ntp = nb of true positives so far
+        # rank = nb of retrieved items so far
+        precision_0 = 1.0 if rank == 0 else ntp / float(rank)
+
+        # y-size on right side of trapezoid:
+        # ntp and rank are increased by one
+        precision_1 = (ntp + 1) / float(rank + 1)
+
+        ap += (precision_1 + precision_0) * recall_step / 2.0
+
+    return ap
+
+
+class ImgListDataset(torch.utils.data.Dataset):
+    """Dataset over a plain list of image paths, yielding ``(image, position)``."""
+
+    def __init__(self, img_list, transform=None):
+        self.samples = img_list
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, i):
+        # decode to RGB while the file handle is still open
+        with open(self.samples[i], 'rb') as handle:
+            img = Image.open(handle).convert('RGB')
+        if self.transform is None:
+            return img, i
+        return self.transform(img), i
+
+
+def is_image_file(s):
+    """Return True when file name ``s`` ends with a known image extension.
+
+    The comparison ignores case, so ``IMG_001.JPG`` is accepted. A name with
+    no dot at all (e.g. a file literally called ``jpg``) is rejected.
+    """
+    _, dot, ext = s.rpartition(".")
+    if not dot:
+        # no extension separator: the whole name must not count as an extension
+        return False
+    return ext.lower() in ('jpg', 'jpeg', 'png', 'ppm', 'bmp', 'pgm', 'tif', 'tiff', 'webp')
+
+
+@torch.no_grad()
+def extract_features(image_list, model, args):
+    """Compute one global descriptor per image and collect them on rank 0.
+
+    A descriptor is the [CLS] token concatenated with the GeM-pooled
+    (exponent 4) patch tokens of ``model.get_intermediate_layers(x, n=1)[0]``.
+    The function uses ``torch.distributed`` collectives, so the process group
+    must already be initialised.
+
+    NOTE: this definition shadows the ``extract_features`` imported from
+    ``eval_knn`` at the top of this file.
+
+    Args:
+        image_list: image file paths (wrapped in an ``ImgListDataset``).
+        model: network exposing ``get_intermediate_layers`` and
+            ``patch_embed.patch_size``.
+        args: namespace providing ``imsize``, ``batch_size_per_gpu``,
+            ``num_workers`` and ``use_cuda``.
+
+    Returns:
+        On rank 0, a tensor with one row per entry of ``image_list``;
+        ``None`` on every other rank.
+    """
+    # interpolation=3 -- presumably PIL's bicubic filter code; TODO confirm
+    transform = pth_transforms.Compose([
+        pth_transforms.Resize((args.imsize, args.imsize), interpolation=3),
+        pth_transforms.ToTensor(),
+        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    tempdataset = ImgListDataset(image_list, transform=transform)
+    # unshuffled DistributedSampler: each rank processes its own share of the list
+    data_loader = torch.utils.data.DataLoader(tempdataset, batch_size=args.batch_size_per_gpu,
+        num_workers=args.num_workers, drop_last=False,
+        sampler=torch.utils.data.DistributedSampler(tempdataset, shuffle=False))
+    features = None
+    for samples, index in utils.MetricLogger(delimiter="  ").log_every(data_loader, 10):
+        # inputs always go to the GPU here; args.use_cuda only decides where
+        # the gathered feature matrix is stored
+        samples, index = samples.cuda(non_blocking=True), index.cuda(non_blocking=True)
+        feats = model.get_intermediate_layers(samples, n=1)[0].clone()
+
+        cls_output_token = feats[:, 0, :]  #  [CLS] token
+        # GeM with exponent 4 for output patch tokens
+        b, h, w, d = len(samples), int(samples.shape[-2] / model.patch_embed.patch_size), int(samples.shape[-1] / model.patch_embed.patch_size), feats.shape[-1]
+        feats = feats[:, 1:, :].reshape(b, h, w, d)
+        # clamp keeps values positive before the 4th power / 4th root
+        feats = feats.clamp(min=1e-6).permute(0, 3, 1, 2)
+        feats = nn.functional.avg_pool2d(feats.pow(4), (h, w)).pow(1. / 4).reshape(b, -1)
+        # concatenate [CLS] token and GeM pooled patch tokens
+        feats = torch.cat((cls_output_token, feats), dim=1)
+
+        # init storage feature matrix
+        if dist.get_rank() == 0 and features is None:
+            features = torch.zeros(len(data_loader.dataset), feats.shape[-1])
+            if args.use_cuda:
+                features = features.cuda(non_blocking=True)
+
+        # get indexes from all processes
+        y_all = torch.empty(dist.get_world_size(), index.size(0), dtype=index.dtype, device=index.device)
+        # y_l holds views into y_all, one per rank, filled in place by all_gather
+        y_l = list(y_all.unbind(0))
+        y_all_reduce = torch.distributed.all_gather(y_l, index, async_op=True)
+        y_all_reduce.wait()
+        index_all = torch.cat(y_l)
+
+        # share features between processes
+        feats_all = torch.empty(dist.get_world_size(), feats.size(0), feats.size(1),
+                                dtype=feats.dtype, device=feats.device)
+        output_l = list(feats_all.unbind(0))
+        output_all_reduce = torch.distributed.all_gather(output_l, feats, async_op=True)
+        output_all_reduce.wait()
+
+        # update storage feature matrix
+        if dist.get_rank() == 0:
+            # rows are written at their dataset index, so arrival order does not matter
+            if args.use_cuda:
+                features.index_copy_(0, index_all, torch.cat(output_l))
+            else:
+                features.index_copy_(0, index_all.cpu(), torch.cat(output_l).cpu())
+    return features  # features is still None for every rank which is not 0 (main)
+
+
+if __name__ == '__main__':
+    # Entry point: copy-detection evaluation on Copydays. Features are
+    # extracted on every rank; concatenation, whitening and scoring run on
+    # rank 0 only.
+    parser = argparse.ArgumentParser('Copy detection on Copydays')
+    parser.add_argument('--data_path', default='/path/to/copydays/', type=str,
+        help="See https://lear.inrialpes.fr/~jegou/data.php#copydays")
+    parser.add_argument('--whitening_path', default='/path/to/whitening_data/', type=str,
+        help="""Path to directory with images used for computing the whitening operator.
+        In our paper, we use 20k random images from YFCC100M.""")
+    parser.add_argument('--distractors_path', default='/path/to/distractors/', type=str,
+        help="Path to directory with distractors images. In our paper, we use 10k random images from YFCC100M.")
+    parser.add_argument('--imsize', default=320, type=int, help='Image size (square image)')
+    parser.add_argument('--batch_size_per_gpu', default=16, type=int, help='Per-GPU batch-size')
+    parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.")
+    parser.add_argument('--use_cuda', default=True, type=utils.bool_flag)
+    parser.add_argument('--arch', default='vit_base', type=str, help='Architecture')
+    parser.add_argument('--patch_size', default=8, type=int, help='Patch resolution of the model.')
+    parser.add_argument("--checkpoint_key", default="teacher", type=str,
+        help='Key to use in the checkpoint (example: "teacher")')
+    parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.')
+    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
+        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
+    parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.")
+    args = parser.parse_args()
+
+    # project helper; presumably sets up torch.distributed -- see utils
+    utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(utils.get_sha()))
+    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+    cudnn.benchmark = True
+
+    # ============ building network ... ============
+    # only architectures whose name contains "vit" are accepted by this script
+    if "vit" in args.arch:
+        model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
+        print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
+    else:
+        print(f"Architecture {args.arch} non supported")
+        sys.exit(1)
+    if args.use_cuda:
+        model.cuda()
+    model.eval()
+    utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size)
+
+    dataset = CopydaysDataset(args.data_path)
+
+    # ============ Extract features ... ============
+    # extract features for queries
+    # (one extract_features call per block; each returns None on ranks != 0)
+    queries = []
+    for q in dataset.query_blocks:
+        queries.append(extract_features(dataset.get_block(q), model, args))
+    if utils.get_rank() == 0:
+        queries = torch.cat(queries)
+        print(f"Extraction of queries features done. Shape: {queries.shape}")
+
+    # extract features for database
+    database = []
+    for b in dataset.database_blocks:
+        database.append(extract_features(dataset.get_block(b), model, args))
+
+    # extract features for distractors
+    # (appended after the originals, so database indices of originals are unchanged)
+    if os.path.isdir(args.distractors_path):
+        print("Using distractors...")
+        list_distractors = [os.path.join(args.distractors_path, s) for s in os.listdir(args.distractors_path) if is_image_file(s)]
+        database.append(extract_features(list_distractors, model, args))
+    if utils.get_rank() == 0:
+        database = torch.cat(database)
+        print(f"Extraction of database and distractors features done. Shape: {database.shape}")
+
+    # ============ Whitening ... ============
+    # skipped entirely when --whitening_path is not an existing directory
+    if os.path.isdir(args.whitening_path):
+        print(f"Extracting features on images from {args.whitening_path} for learning the whitening operator.")
+        list_whit = [os.path.join(args.whitening_path, s) for s in os.listdir(args.whitening_path) if is_image_file(s)]
+        features_for_whitening = extract_features(list_whit, model, args)
+        if utils.get_rank() == 0:
+            # center
+            mean_feature = torch.mean(features_for_whitening, dim=0)
+            database -= mean_feature
+            queries -= mean_feature
+            pca = utils.PCA(dim=database.shape[-1], whit=0.5)
+            # compute covariance
+            # NOTE(review): the mean is subtracted from database/queries only;
+            # features_for_whitening itself is used uncentered here -- confirm intended
+            cov = torch.mm(features_for_whitening.T, features_for_whitening) / features_for_whitening.shape[0]
+            pca.train_pca(cov.cpu().numpy())
+            database = pca.apply(database)
+            queries = pca.apply(queries)
+
+    # ============ Copy detection ... ============
+    if utils.get_rank() == 0:
+        # l2 normalize the features
+        database = nn.functional.normalize(database, dim=1, p=2)
+        queries = nn.functional.normalize(queries, dim=1, p=2)
+
+        # similarity
+        # (dot product of unit vectors; keep the 20 best database entries per query)
+        similarity = torch.mm(queries, database.T)
+        distances, indices = similarity.topk(20, largest=True, sorted=True)
+
+        # evaluate
+        # eval_result prints the per-block mAP and has no return statement,
+        # so `retrieved` is always None
+        retrieved = dataset.eval_result(indices, distances)
+    # all ranks wait here until rank 0 has finished evaluating
+    dist.barrier()
+
--- a/preprocessing/dino/eval_image_retrieval.py
+++ b/preprocessing/dino/eval_image_retrieval.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import pickle
+import argparse
+
+import torch
+from torch import nn
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torchvision import models as torchvision_models
+from torchvision import transforms as pth_transforms
+from PIL import Image, ImageFile
+import numpy as np
+
+import utils
+import vision_transformer as vits
+from eval_knn import extract_features
+
+
+class OxfordParisDataset(torch.utils.data.Dataset):
+    """Revisited Oxford / Paris images served as ``(image, index)`` pairs.
+
+    The ground-truth pickle ``<dir_main>/<dataset>/gnd_<dataset>.pkl`` is
+    loaded into ``self.cfg`` and extended with path information; images are
+    read from ``<dir_main>/<dataset>/jpg``.
+
+    Args:
+        dir_main: root directory containing one sub-directory per dataset.
+        dataset: ``'roxford5k'`` or ``'rparis6k'``.
+        split: ``"query"`` selects ``cfg["qimlist"]``; any other value
+            selects ``cfg["imlist"]``.
+        transform: optional callable applied to the PIL image.
+        imsize: optional maximum side length; the image is shrunk in place
+            with ``thumbnail`` before ``transform`` runs.
+
+    Raises:
+        ValueError: if ``dataset`` is not one of the two supported names.
+    """
+
+    def __init__(self, dir_main, dataset, split, transform=None, imsize=None):
+        if dataset not in ['roxford5k', 'rparis6k']:
+            raise ValueError('Unknown dataset: {}!'.format(dataset))
+
+        # loading imlist, qimlist, and gnd, in cfg as a dict
+        gnd_fname = os.path.join(dir_main, dataset, 'gnd_{}.pkl'.format(dataset))
+        with open(gnd_fname, 'rb') as f:
+            # NOTE(security): pickle.load can execute arbitrary code; only
+            # use ground-truth files obtained from a trusted source.
+            cfg = pickle.load(f)
+        cfg['gnd_fname'] = gnd_fname
+        cfg['ext'] = '.jpg'
+        cfg['qext'] = '.jpg'
+        cfg['dir_data'] = os.path.join(dir_main, dataset)
+        cfg['dir_images'] = os.path.join(cfg['dir_data'], 'jpg')
+        cfg['n'] = len(cfg['imlist'])
+        cfg['nq'] = len(cfg['qimlist'])
+        cfg['im_fname'] = config_imname
+        cfg['qim_fname'] = config_qimname
+        cfg['dataset'] = dataset
+        self.cfg = cfg
+
+        self.samples = cfg["qimlist"] if split == "query" else cfg["imlist"]
+        self.transform = transform
+        self.imsize = imsize
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, index):
+        path = os.path.join(self.cfg["dir_images"], self.samples[index] + ".jpg")
+        # tolerate partially written JPEGs instead of raising while decoding
+        ImageFile.LOAD_TRUNCATED_IMAGES = True
+        with open(path, 'rb') as f:
+            img = Image.open(f)
+            img = img.convert('RGB')
+        if self.imsize is not None:
+            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
+            img.thumbnail((self.imsize, self.imsize), Image.LANCZOS)
+        if self.transform is not None:
+            img = self.transform(img)
+        return img, index
+
+
+def config_imname(cfg, i):
+    """Return the path of database image ``i``: ``dir_images/<imlist[i]><ext>``."""
+    file_name = cfg['imlist'][i] + cfg['ext']
+    return os.path.join(cfg['dir_images'], file_name)
+
+
+def config_qimname(cfg, i):
+    """Return the path of query image ``i``: ``dir_images/<qimlist[i]><qext>``."""
+    file_name = cfg['qimlist'][i] + cfg['qext']
+    return os.path.join(cfg['dir_images'], file_name)
+
+
+if __name__ == '__main__':
+    # Entry point: image-retrieval evaluation on revisited Oxford / Paris.
+    # Features are extracted on every rank; ranking and mAP run on rank 0.
+    parser = argparse.ArgumentParser('Image Retrieval on revisited Paris and Oxford')
+    parser.add_argument('--data_path', default='/path/to/revisited_paris_oxford/', type=str)
+    parser.add_argument('--dataset', default='roxford5k', type=str, choices=['roxford5k', 'rparis6k'])
+    parser.add_argument('--multiscale', default=False, type=utils.bool_flag)
+    parser.add_argument('--imsize', default=224, type=int, help='Image size')
+    parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.")
+    parser.add_argument('--use_cuda', default=True, type=utils.bool_flag)
+    parser.add_argument('--arch', default='vit_small', type=str, help='Architecture')
+    parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.')
+    parser.add_argument("--checkpoint_key", default="teacher", type=str,
+        help='Key to use in the checkpoint (example: "teacher")')
+    parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.')
+    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
+        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
+    parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.")
+    args = parser.parse_args()
+
+    # project helper; presumably sets up torch.distributed -- see utils
+    utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(utils.get_sha()))
+    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+    cudnn.benchmark = True
+
+    # ============ preparing data ... ============
+    transform = pth_transforms.Compose([
+        pth_transforms.ToTensor(),
+        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    # any split other than "query" selects cfg["imlist"], i.e. the database images
+    dataset_train = OxfordParisDataset(args.data_path, args.dataset, split="train", transform=transform, imsize=args.imsize)
+    dataset_query = OxfordParisDataset(args.data_path, args.dataset, split="query", transform=transform, imsize=args.imsize)
+    sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False)
+    # batch_size=1 on both loaders: no resize/crop transform is applied, so
+    # images presumably differ in size and cannot be stacked -- TODO confirm
+    data_loader_train = torch.utils.data.DataLoader(
+        dataset_train,
+        sampler=sampler,
+        batch_size=1,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=False,
+    )
+    # no sampler is passed here, so each process iterates over every query
+    data_loader_query = torch.utils.data.DataLoader(
+        dataset_query,
+        batch_size=1,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=False,
+    )
+    print(f"train: {len(dataset_train)} imgs / query: {len(dataset_query)} imgs")
+
+    # ============ building network ... ============
+    # branch order matters: "vit" is tested first, then "xcit", then torchvision
+    if "vit" in args.arch:
+        model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
+        print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
+    elif "xcit" in args.arch:
+        model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0)
+    elif args.arch in torchvision_models.__dict__.keys():
+        model = torchvision_models.__dict__[args.arch](num_classes=0)
+    else:
+        print(f"Architecture {args.arch} non supported")
+        sys.exit(1)
+    if args.use_cuda:
+        model.cuda()
+    model.eval()
+
+    # load pretrained weights
+    if os.path.isfile(args.pretrained_weights):
+        state_dict = torch.load(args.pretrained_weights, map_location="cpu")
+        if args.checkpoint_key is not None and args.checkpoint_key in state_dict:
+            print(f"Take key {args.checkpoint_key} in provided checkpoint dict")
+            state_dict = state_dict[args.checkpoint_key]
+        # remove `module.` prefix
+        state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
+        # remove `backbone.` prefix induced by multicrop wrapper
+        state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
+        # strict=False: key mismatches are reported in `msg` rather than raised
+        msg = model.load_state_dict(state_dict, strict=False)
+        print('Pretrained weights found at {} and loaded with msg: {}'.format(args.pretrained_weights, msg))
+    elif args.arch == "vit_small" and args.patch_size == 16:
+        print("Since no pretrained weights have been provided, we load pretrained DINO weights on Google Landmark v2.")
+        model.load_state_dict(torch.hub.load_state_dict_from_url(url="https://dl.fbaipublicfiles.com/dino/dino_vitsmall16_googlelandmark_pretrain/dino_vitsmall16_googlelandmark_pretrain.pth"))
+    else:
+        print("Warning: We use random weights.")
+
+    ############################################################################
+    # Step 1: extract features
+    # (extract_features is imported from eval_knn)
+    train_features = extract_features(model, data_loader_train, args.use_cuda, multiscale=args.multiscale)
+    query_features = extract_features(model, data_loader_query, args.use_cuda, multiscale=args.multiscale)
+
+    if utils.get_rank() == 0:  # only rank 0 will work from now on
+        # normalize features
+        train_features = nn.functional.normalize(train_features, dim=1, p=2)
+        query_features = nn.functional.normalize(query_features, dim=1, p=2)
+
+        ############################################################################
+        # Step 2: similarity
+        # sim has one row per database image and one column per query;
+        # sorting -sim along dim 0 ranks database images per query, best first
+        sim = torch.mm(train_features, query_features.T)
+        ranks = torch.argsort(-sim, dim=0).cpu().numpy()
+
+        ############################################################################
+        # Step 3: evaluate
+        gnd = dataset_train.cfg['gnd']
+        # evaluate ranks
+        ks = [1, 5, 10]
+        # search for easy & hard
+        # ("M" setting: easy and hard images are positives, junk is junk)
+        gnd_t = []
+        for i in range(len(gnd)):
+            g = {}
+            g['ok'] = np.concatenate([gnd[i]['easy'], gnd[i]['hard']])
+            g['junk'] = np.concatenate([gnd[i]['junk']])
+            gnd_t.append(g)
+        mapM, apsM, mprM, prsM = utils.compute_map(ranks, gnd_t, ks)
+        # search for hard
+        # ("H" setting: only hard images are positives; easy ones join the junk)
+        gnd_t = []
+        for i in range(len(gnd)):
+            g = {}
+            g['ok'] = np.concatenate([gnd[i]['hard']])
+            g['junk'] = np.concatenate([gnd[i]['junk'], gnd[i]['easy']])
+            gnd_t.append(g)
+        mapH, apsH, mprH, prsH = utils.compute_map(ranks, gnd_t, ks)
+        print('>> {}: mAP M: {}, H: {}'.format(args.dataset, np.around(mapM*100, decimals=2), np.around(mapH*100, decimals=2)))
+        print('>> {}: mP@k{} M: {}, H: {}'.format(args.dataset, np.array(ks), np.around(mprM*100, decimals=2), np.around(mprH*100, decimals=2)))
+    # all ranks wait here until rank 0 has finished evaluating
+    dist.barrier()
--- a/preprocessing/dino/eval_knn.py
+++ b/preprocessing/dino/eval_knn.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import argparse
+
+import torch
+from torch import nn
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torchvision import datasets
+from torchvision import transforms as pth_transforms
+from torchvision import models as torchvision_models
+
+import utils
+import vision_transformer as vits
+
+
+def extract_feature_pipeline(args):
+    """Build data loaders and model, then extract train/val features and labels.
+
+    Images are read from ``<args.data_path>/train`` and
+    ``<args.data_path>/val``. The model is always moved to the GPU. When
+    ``args.dump_features`` is set, rank 0 also saves the four tensors into
+    that directory.
+
+    Args:
+        args: parsed command-line namespace (``data_path``,
+            ``batch_size_per_gpu``, ``num_workers``, ``arch``, ``patch_size``,
+            ``pretrained_weights``, ``checkpoint_key``, ``use_cuda``,
+            ``dump_features``).
+
+    Returns:
+        ``(train_features, test_features, train_labels, test_labels)``. The
+        features are L2-normalised on rank 0; on other ranks they are whatever
+        ``extract_features`` returned there (``None``). Labels are built on
+        every rank.
+    """
+    # ============ preparing data ... ============
+    # interpolation=3 -- presumably PIL's bicubic filter code; TODO confirm
+    transform = pth_transforms.Compose([
+        pth_transforms.Resize(256, interpolation=3),
+        pth_transforms.CenterCrop(224),
+        pth_transforms.ToTensor(),
+        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    dataset_train = ReturnIndexDataset(os.path.join(args.data_path, "train"), transform=transform)
+    dataset_val = ReturnIndexDataset(os.path.join(args.data_path, "val"), transform=transform)
+    sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False)
+    data_loader_train = torch.utils.data.DataLoader(
+        dataset_train,
+        sampler=sampler,
+        batch_size=args.batch_size_per_gpu,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=False,
+    )
+    # no sampler is passed here, so each process iterates over the full val set
+    data_loader_val = torch.utils.data.DataLoader(
+        dataset_val,
+        batch_size=args.batch_size_per_gpu,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=False,
+    )
+    print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.")
+
+    # ============ building network ... ============
+    # branch order matters: "vit" is tested first, then "xcit", then torchvision
+    if "vit" in args.arch:
+        model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
+        print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
+    elif "xcit" in args.arch:
+        model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0)
+    elif args.arch in torchvision_models.__dict__.keys():
+        model = torchvision_models.__dict__[args.arch](num_classes=0)
+        # drop the classification head so the model outputs features
+        model.fc = nn.Identity()
+    else:
+        print(f"Architecture {args.arch} non supported")
+        sys.exit(1)
+    model.cuda()
+    utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size)
+    model.eval()
+
+    # ============ extract features ... ============
+    print("Extracting features for train set...")
+    train_features = extract_features(model, data_loader_train, args.use_cuda)
+    print("Extracting features for val set...")
+    test_features = extract_features(model, data_loader_val, args.use_cuda)
+
+    # only rank 0 holds feature tensors (extract_features allocates them there)
+    if utils.get_rank() == 0:
+        train_features = nn.functional.normalize(train_features, dim=1, p=2)
+        test_features = nn.functional.normalize(test_features, dim=1, p=2)
+
+    # the last element of each ImageFolder sample entry is used as the label
+    train_labels = torch.tensor([s[-1] for s in dataset_train.samples]).long()
+    test_labels = torch.tensor([s[-1] for s in dataset_val.samples]).long()
+    # save features and labels
+    if args.dump_features and dist.get_rank() == 0:
+        torch.save(train_features.cpu(), os.path.join(args.dump_features, "trainfeat.pth"))
+        torch.save(test_features.cpu(), os.path.join(args.dump_features, "testfeat.pth"))
+        torch.save(train_labels.cpu(), os.path.join(args.dump_features, "trainlabels.pth"))
+        torch.save(test_labels.cpu(), os.path.join(args.dump_features, "testlabels.pth"))
+    return train_features, test_features, train_labels, test_labels
+
+
+@torch.no_grad()
+def extract_features(model, data_loader, use_cuda=True, multiscale=False):
+    """Run ``model`` over ``data_loader`` and gather all features on rank 0.
+
+    Every batch must yield ``(samples, index)`` where ``index`` is the
+    position of each sample in ``data_loader.dataset``. The function uses
+    ``torch.distributed`` collectives, so the process group must already be
+    initialised.
+
+    Args:
+        model: callable mapping a batch of samples to a 2-D feature tensor.
+        data_loader: loader yielding ``(samples, index)`` batches.
+        use_cuda: keep the gathered feature matrix on the GPU when True,
+            on the CPU otherwise. Inputs are moved to the GPU either way.
+        multiscale: compute features with ``utils.multi_scale`` instead of a
+            plain forward pass.
+
+    Returns:
+        On rank 0, a ``(len(data_loader.dataset), feature_dim)`` tensor;
+        ``None`` on every other rank.
+    """
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    features = None
+    for samples, index in metric_logger.log_every(data_loader, 10):
+        samples = samples.cuda(non_blocking=True)
+        index = index.cuda(non_blocking=True)
+        if multiscale:
+            feats = utils.multi_scale(samples, model)
+        else:
+            feats = model(samples).clone()
+
+        # init storage feature matrix
+        # (lazily, because the feature width is only known after the first batch)
+        if dist.get_rank() == 0 and features is None:
+            features = torch.zeros(len(data_loader.dataset), feats.shape[-1])
+            if use_cuda:
+                features = features.cuda(non_blocking=True)
+            print(f"Storing features into tensor of shape {features.shape}")
+
+        # get indexes from all processes
+        y_all = torch.empty(dist.get_world_size(), index.size(0), dtype=index.dtype, device=index.device)
+        # y_l holds views into y_all, one per rank, filled in place by all_gather
+        y_l = list(y_all.unbind(0))
+        y_all_reduce = torch.distributed.all_gather(y_l, index, async_op=True)
+        y_all_reduce.wait()
+        index_all = torch.cat(y_l)
+
+        # share features between processes
+        feats_all = torch.empty(
+            dist.get_world_size(),
+            feats.size(0),
+            feats.size(1),
+            dtype=feats.dtype,
+            device=feats.device,
+        )
+        output_l = list(feats_all.unbind(0))
+        output_all_reduce = torch.distributed.all_gather(output_l, feats, async_op=True)
+        output_all_reduce.wait()
+
+        # update storage feature matrix
+        if dist.get_rank() == 0:
+            # rows are written at their dataset index, so arrival order does not matter
+            if use_cuda:
+                features.index_copy_(0, index_all, torch.cat(output_l))
+            else:
+                features.index_copy_(0, index_all.cpu(), torch.cat(output_l).cpu())
+    return features
+
+
+@torch.no_grad()
+def knn_classifier(train_features, train_labels, test_features, test_labels, k, T, num_classes=1000):
+    """Weighted k-NN classification of ``test_features`` against ``train_features``.
+
+    Each test sample retrieves its ``k`` most similar training samples (dot
+    product); every neighbour votes for its label with weight
+    ``exp(similarity / T)``. Test samples are processed in about 100 chunks
+    to bound the size of the similarity matrix.
+
+    Args:
+        train_features: ``(num_train, dim)`` feature matrix.
+        train_labels: ``(num_train,)`` integer labels.
+        test_features: ``(num_test, dim)`` feature matrix.
+        test_labels: ``(num_test,)`` integer labels.
+        k: number of neighbours that vote.
+        T: temperature of the voting weights.
+        num_classes: number of distinct labels.
+
+    Returns:
+        ``(top1, top5)`` accuracies in percent. ``top5`` is computed over
+        ``min(5, k)`` predictions.
+    """
+    top1, top5, total = 0.0, 0.0, 0
+    train_features = train_features.t()
+    num_test_images, num_chunks = test_labels.shape[0], 100
+    # With fewer than num_chunks test images the floor division gives 0 and
+    # range() rejects a zero step, so use a chunk of at least one image.
+    imgs_per_chunk = max(1, num_test_images // num_chunks)
+    retrieval_one_hot = torch.zeros(k, num_classes).to(train_features.device)
+    for idx in range(0, num_test_images, imgs_per_chunk):
+        # get the features for test images
+        chunk_end = min(idx + imgs_per_chunk, num_test_images)
+        features = test_features[idx:chunk_end, :]
+        targets = test_labels[idx:chunk_end]
+        batch_size = targets.shape[0]
+
+        # calculate the dot product and compute top-k neighbors
+        similarity = torch.mm(features, train_features)
+        distances, indices = similarity.topk(k, largest=True, sorted=True)
+        candidates = train_labels.view(1, -1).expand(batch_size, -1)
+        retrieved_neighbors = torch.gather(candidates, 1, indices)
+
+        # one one-hot row per (test sample, neighbour) pair; buffer is reused
+        retrieval_one_hot.resize_(batch_size * k, num_classes).zero_()
+        retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1)
+        distances_transform = distances.clone().div_(T).exp_()
+        # sum the weighted votes of the k neighbours per class
+        probs = torch.sum(
+            torch.mul(
+                retrieval_one_hot.view(batch_size, -1, num_classes),
+                distances_transform.view(batch_size, -1, 1),
+            ),
+            1,
+        )
+        _, predictions = probs.sort(1, True)
+
+        # find the predictions that match the target
+        correct = predictions.eq(targets.data.view(-1, 1))
+        top1 = top1 + correct.narrow(1, 0, 1).sum().item()
+        top5 = top5 + correct.narrow(1, 0, min(5, k)).sum().item()  # top5 does not make sense if k < 5
+        total += targets.size(0)
+    top1 = top1 * 100.0 / total
+    top5 = top5 * 100.0 / total
+    return top1, top5
+
+
+class ReturnIndexDataset(datasets.ImageFolder):
+    """ImageFolder whose items are ``(image, sample index)`` pairs.
+
+    The second value produced by the parent class is discarded and replaced
+    by ``idx``.
+    """
+
+    def __getitem__(self, idx):
+        image, _ = super().__getitem__(idx)
+        return image, idx
+
+
+if __name__ == '__main__':
+    # ---------------- command-line options ----------------
+    parser = argparse.ArgumentParser('Evaluation with weighted k-NN on ImageNet')
+    parser.add_argument(
+        '--batch_size_per_gpu', type=int, default=128, help='Per-GPU batch-size')
+    parser.add_argument(
+        '--nb_knn', type=int, nargs='+', default=[10, 20, 100, 200],
+        help='Number of NN to use. 20 is usually working the best.')
+    parser.add_argument(
+        '--temperature', type=float, default=0.07,
+        help='Temperature used in the voting coefficient')
+    parser.add_argument(
+        '--pretrained_weights', type=str, default='',
+        help="Path to pretrained weights to evaluate.")
+    parser.add_argument(
+        '--use_cuda', type=utils.bool_flag, default=True,
+        help="Should we store the features on GPU? We recommend setting this to False if you encounter OOM")
+    parser.add_argument('--arch', type=str, default='vit_small', help='Architecture')
+    parser.add_argument(
+        '--patch_size', type=int, default=16, help='Patch resolution of the model.')
+    parser.add_argument(
+        "--checkpoint_key", type=str, default="teacher",
+        help='Key to use in the checkpoint (example: "teacher")')
+    parser.add_argument(
+        '--dump_features', default=None,
+        help='Path where to save computed features, empty for no saving')
+    parser.add_argument(
+        '--load_features', default=None,
+        help="""If the features have
+        already been computed, where to find them.""")
+    parser.add_argument(
+        '--num_workers', type=int, default=10,
+        help='Number of data loading workers per GPU.')
+    parser.add_argument(
+        "--dist_url", type=str, default="env://",
+        help="""url used to set up
+        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
+    parser.add_argument(
+        "--local_rank", type=int, default=0,
+        help="Please ignore and do not set this argument.")
+    parser.add_argument('--data_path', type=str, default='/path/to/imagenet/')
+    args = parser.parse_args()
+
+    # ---------------- distributed setup and run banner ----------------
+    utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(utils.get_sha()))
+    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+    cudnn.benchmark = True
+
+    # ---------------- obtain features and labels ----------------
+    if not args.load_features:
+        # need to extract features !
+        train_features, test_features, train_labels, test_labels = extract_feature_pipeline(args)
+    else:
+        # Features were saved earlier: read the four tensors back from that directory.
+        train_features = torch.load(os.path.join(args.load_features, "trainfeat.pth"))
+        test_features = torch.load(os.path.join(args.load_features, "testfeat.pth"))
+        train_labels = torch.load(os.path.join(args.load_features, "trainlabels.pth"))
+        test_labels = torch.load(os.path.join(args.load_features, "testlabels.pth"))
+
+    # ---------------- k-NN evaluation (rank 0 only) ----------------
+    if utils.get_rank() == 0:
+        if args.use_cuda:
+            train_features = train_features.cuda()
+            test_features = test_features.cuda()
+            train_labels = train_labels.cuda()
+            test_labels = test_labels.cuda()
+
+        print("Features are ready!\nStart the k-NN classification.")
+        for k in args.nb_knn:
+            top1, top5 = knn_classifier(
+                train_features, train_labels, test_features, test_labels, k, args.temperature)
+            print(f"{k}-NN classifier result: Top1: {top1}, Top5: {top5}")
+    # Reached by every rank, including those that skipped the branch above.
+    dist.barrier()
--- a/preprocessing/dino/eval_linear.py
+++ b/preprocessing/dino/eval_linear.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import json
+from pathlib import Path
+
+import torch
+from torch import nn
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torchvision import datasets
+from torchvision import transforms as pth_transforms
+from torchvision import models as torchvision_models
+
+import utils
+import vision_transformer as vits
+
+
+def eval_linear(args):
+    """Train or evaluate a linear classifier on top of a frozen backbone.
+
+    Builds the backbone named by ``args.arch`` (a constructor found in
+    ``vision_transformer``, an XCiT model from torch.hub, or a torchvision
+    model whose ``fc`` is replaced by ``nn.Identity``), loads the pretrained
+    weights into it, and wraps a ``LinearClassifier`` in
+    ``DistributedDataParallel``.
+
+    When ``args.evaluate`` is set, pretrained linear weights are loaded, the
+    validation set is scored once and the function returns. Otherwise the
+    classifier is trained with SGD and cosine annealing for ``args.epochs``
+    epochs, validating every ``args.val_freq`` epochs and on the last epoch.
+    The main process appends one JSON line per epoch to ``log.txt`` and
+    overwrites ``checkpoint.pth.tar`` in ``args.output_dir``.
+
+    Args:
+        args: parsed command-line namespace (see the ``__main__`` block of
+            this file for the available options).
+
+    Raises:
+        SystemExit: with status 1 when ``args.arch`` matches none of the
+            supported model families.
+    """
+    utils.init_distributed_mode(args)
+    print("git:\n  {}\n".format(utils.get_sha()))
+    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+    cudnn.benchmark = True
+
+    # ============ building network ... ============
+    # if the network is a Vision Transformer (i.e. vit_tiny, vit_small, vit_base)
+    if args.arch in vits.__dict__.keys():
+        model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
+        # one embed_dim per concatenated block output, plus one more when the
+        # average-pooled patch tokens are appended
+        embed_dim = model.embed_dim * (args.n_last_blocks + int(args.avgpool_patchtokens))
+    # if the network is a XCiT
+    elif "xcit" in args.arch:
+        model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0)
+        embed_dim = model.embed_dim
+    # otherwise, we check if the architecture is in torchvision models
+    elif args.arch in torchvision_models.__dict__.keys():
+        model = torchvision_models.__dict__[args.arch]()
+        embed_dim = model.fc.weight.shape[1]
+        model.fc = nn.Identity()
+    else:
+        # ``sys`` is not imported at module level in this file, so it is
+        # imported here; previously this branch raised NameError instead of
+        # exiting with status 1.
+        import sys
+        print(f"Unknow architecture: {args.arch}")
+        sys.exit(1)
+    model.cuda()
+    model.eval()
+    # load weights to evaluate
+    utils.load_pretrained_weights(model, args.pretrained_weights, args.checkpoint_key, args.arch, args.patch_size)
+    print(f"Model {args.arch} built.")
+
+    linear_classifier = LinearClassifier(embed_dim, num_labels=args.num_labels)
+    linear_classifier = linear_classifier.cuda()
+    linear_classifier = nn.parallel.DistributedDataParallel(linear_classifier, device_ids=[args.gpu])
+
+    # ============ preparing data ... ============
+    val_transform = pth_transforms.Compose([
+        pth_transforms.Resize(256, interpolation=3),
+        pth_transforms.CenterCrop(224),
+        pth_transforms.ToTensor(),
+        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    dataset_val = datasets.ImageFolder(os.path.join(args.data_path, "val"), transform=val_transform)
+    # no sampler is given here, so each process iterates the whole validation set
+    val_loader = torch.utils.data.DataLoader(
+        dataset_val,
+        batch_size=args.batch_size_per_gpu,
+        num_workers=args.num_workers,
+        pin_memory=True,
+    )
+
+    if args.evaluate:
+        # evaluation-only mode: score once with pretrained linear weights and stop
+        utils.load_pretrained_linear_weights(linear_classifier, args.arch, args.patch_size)
+        test_stats = validate_network(val_loader, model, linear_classifier, args.n_last_blocks, args.avgpool_patchtokens)
+        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
+        return
+
+    train_transform = pth_transforms.Compose([
+        pth_transforms.RandomResizedCrop(224),
+        pth_transforms.RandomHorizontalFlip(),
+        pth_transforms.ToTensor(),
+        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ])
+    dataset_train = datasets.ImageFolder(os.path.join(args.data_path, "train"), transform=train_transform)
+    sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
+    train_loader = torch.utils.data.DataLoader(
+        dataset_train,
+        sampler=sampler,
+        batch_size=args.batch_size_per_gpu,
+        num_workers=args.num_workers,
+        pin_memory=True,
+    )
+    print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.")
+
+    # set optimizer
+    optimizer = torch.optim.SGD(
+        linear_classifier.parameters(),
+        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256., # linear scaling rule
+        momentum=0.9,
+        weight_decay=0, # we do not apply weight decay
+    )
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)
+
+    # Create the log/checkpoint directory up front so the first write cannot
+    # fail on a missing directory after a full epoch of training.
+    if utils.is_main_process():
+        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Optionally resume from a checkpoint
+    to_restore = {"epoch": 0, "best_acc": 0.}
+    utils.restart_from_checkpoint(
+        os.path.join(args.output_dir, "checkpoint.pth.tar"),
+        run_variables=to_restore,
+        state_dict=linear_classifier,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+    start_epoch = to_restore["epoch"]
+    best_acc = to_restore["best_acc"]
+
+    for epoch in range(start_epoch, args.epochs):
+        train_loader.sampler.set_epoch(epoch)
+
+        train_stats = train(model, linear_classifier, optimizer, train_loader, epoch, args.n_last_blocks, args.avgpool_patchtokens)
+        scheduler.step()
+
+        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+                     'epoch': epoch}
+        if epoch % args.val_freq == 0 or epoch == args.epochs - 1:
+            test_stats = validate_network(val_loader, model, linear_classifier, args.n_last_blocks, args.avgpool_patchtokens)
+            print(f"Accuracy at epoch {epoch} of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
+            best_acc = max(best_acc, test_stats["acc1"])
+            print(f'Max accuracy so far: {best_acc:.2f}%')
+            log_stats.update({f'test_{k}': v for k, v in test_stats.items()})
+        if utils.is_main_process():
+            with (Path(args.output_dir) / "log.txt").open("a") as f:
+                f.write(json.dumps(log_stats) + "\n")
+            # "epoch" stores the next epoch to run, which is what a resumed
+            # run reads back as its start epoch
+            save_dict = {
+                "epoch": epoch + 1,
+                "state_dict": linear_classifier.state_dict(),
+                "optimizer": optimizer.state_dict(),
+                "scheduler": scheduler.state_dict(),
+                "best_acc": best_acc,
+            }
+            torch.save(save_dict, os.path.join(args.output_dir, "checkpoint.pth.tar"))
+    print("Training of the supervised linear classifier on frozen features completed.\n"
+                "Top-1 test accuracy: {acc:.1f}".format(acc=best_acc))
+
+
+def train(model, linear_classifier, optimizer, loader, epoch, n, avgpool):
+    """Run one training epoch of the linear classifier on backbone features.
+
+    Args:
+        model: backbone; it is only called inside ``torch.no_grad()``.
+        linear_classifier: module whose parameters are being optimized.
+        optimizer: optimizer stepped once per batch.
+        loader: iterable yielding ``(input, target)`` batches.
+        epoch: epoch number, used only in the log header.
+        n: number of blocks requested from ``model.get_intermediate_layers``.
+        avgpool: when truthy, the mean over tokens ``1:`` of the last returned
+            block is appended to the features.
+
+    Returns:
+        dict mapping each meter name to its ``global_avg``.
+
+    Note:
+        Reads ``args.arch`` from the module-level ``args``, which is only
+        assigned in this file's ``__main__`` block.
+    """
+    linear_classifier.train()
+    logger = utils.MetricLogger(delimiter="  ")
+    logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    criterion = nn.CrossEntropyLoss()
+
+    for images, labels in logger.log_every(loader, 20, 'Epoch: [{}]'.format(epoch)):
+        images = images.cuda(non_blocking=True)
+        labels = labels.cuda(non_blocking=True)
+
+        # Backbone features are computed without gradient tracking.
+        with torch.no_grad():
+            if "vit" not in args.arch:
+                features = model(images)
+            else:
+                hidden = model.get_intermediate_layers(images, n)
+                # token 0 of every returned block, concatenated on the feature axis
+                features = torch.cat([tokens[:, 0] for tokens in hidden], dim=-1)
+                if avgpool:
+                    patch_mean = torch.mean(hidden[-1][:, 1:], dim=1)
+                    features = torch.cat((features.unsqueeze(-1), patch_mean.unsqueeze(-1)), dim=-1)
+                    features = features.reshape(features.shape[0], -1)
+
+        # Only the linear head takes part in the backward pass.
+        logits = linear_classifier(features)
+        loss = criterion(logits, labels)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # bookkeeping
+        torch.cuda.synchronize()
+        logger.update(loss=loss.item())
+        logger.update(lr=optimizer.param_groups[0]["lr"])
+
+    # gather the stats from all processes
+    logger.synchronize_between_processes()
+    print("Averaged stats:", logger)
+    return {name: meter.global_avg for name, meter in logger.meters.items()}
+
+
+@torch.no_grad()
+def validate_network(val_loader, model, linear_classifier, n, avgpool):
+    """Score the linear classifier on ``val_loader`` and return averaged meters.
+
+    Args:
+        val_loader: iterable yielding ``(input, target)`` batches.
+        model: backbone used to produce features.
+        linear_classifier: wrapped classifier; ``.module.num_labels`` decides
+            whether top-5 accuracy is tracked (only when it is at least 5).
+        n: number of blocks requested from ``model.get_intermediate_layers``.
+        avgpool: when truthy, the mean over tokens ``1:`` of the last returned
+            block is appended to the features.
+
+    Returns:
+        dict mapping each meter name to its ``global_avg``.
+
+    Note:
+        Reads ``args.arch`` from the module-level ``args``, which is only
+        assigned in this file's ``__main__`` block.
+    """
+    linear_classifier.eval()
+    logger = utils.MetricLogger(delimiter="  ")
+    criterion = nn.CrossEntropyLoss()
+    track_top5 = linear_classifier.module.num_labels >= 5
+
+    for images, labels in logger.log_every(val_loader, 20, 'Test:'):
+        images = images.cuda(non_blocking=True)
+        labels = labels.cuda(non_blocking=True)
+
+        # Gradient tracking is already disabled for the whole function by the decorator.
+        if "vit" not in args.arch:
+            features = model(images)
+        else:
+            hidden = model.get_intermediate_layers(images, n)
+            # token 0 of every returned block, concatenated on the feature axis
+            features = torch.cat([tokens[:, 0] for tokens in hidden], dim=-1)
+            if avgpool:
+                patch_mean = torch.mean(hidden[-1][:, 1:], dim=1)
+                features = torch.cat((features.unsqueeze(-1), patch_mean.unsqueeze(-1)), dim=-1)
+                features = features.reshape(features.shape[0], -1)
+        logits = linear_classifier(features)
+        loss = criterion(logits, labels)
+
+        if track_top5:
+            acc1, acc5 = utils.accuracy(logits, labels, topk=(1, 5))
+        else:
+            acc1, = utils.accuracy(logits, labels, topk=(1,))
+
+        # Accuracy meters are weighted by the number of samples in the batch.
+        num_samples = images.shape[0]
+        logger.update(loss=loss.item())
+        logger.meters['acc1'].update(acc1.item(), n=num_samples)
+        if track_top5:
+            logger.meters['acc5'].update(acc5.item(), n=num_samples)
+
+    if track_top5:
+        print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}'
+          .format(top1=logger.acc1, top5=logger.acc5, losses=logger.loss))
+    else:
+        print('* Acc@1 {top1.global_avg:.3f} loss {losses.global_avg:.3f}'
+          .format(top1=logger.acc1, losses=logger.loss))
+    return {name: meter.global_avg for name, meter in logger.meters.items()}
+
+
+class LinearClassifier(nn.Module):
+    """A single linear layer meant to be trained on top of frozen features.
+
+    Args:
+        dim: size of each flattened input feature vector.
+        num_labels: number of output classes (default 1000).
+    """
+
+    def __init__(self, dim, num_labels=1000):
+        super().__init__()
+        self.num_labels = num_labels
+        self.linear = nn.Linear(dim, num_labels)
+        # Re-initialise: weights drawn from N(0, 0.01^2), bias set to zero.
+        nn.init.normal_(self.linear.weight, mean=0.0, std=0.01)
+        nn.init.zeros_(self.linear.bias)
+
+    def forward(self, x):
+        """Flatten everything after the batch dimension and apply the linear layer."""
+        flat = x.view(x.size(0), -1)
+        return self.linear(flat)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('Evaluation with linear classification on ImageNet')
+
+    # ---------------- feature extraction ----------------
+    parser.add_argument(
+        '--n_last_blocks', type=int, default=4,
+        help="""Concatenate [CLS] tokens
+        for the `n` last blocks. We use `n=4` when evaluating ViT-Small and `n=1` with ViT-Base.""")
+    parser.add_argument(
+        '--avgpool_patchtokens', type=utils.bool_flag, default=False,
+        help="""Whether ot not to concatenate the global average pooled features to the [CLS] token.
+        We typically set this to False for ViT-Small and to True with ViT-Base.""")
+
+    # ---------------- backbone and weights ----------------
+    parser.add_argument('--arch', type=str, default='vit_small', help='Architecture')
+    parser.add_argument(
+        '--patch_size', type=int, default=16, help='Patch resolution of the model.')
+    parser.add_argument(
+        '--pretrained_weights', type=str, default='',
+        help="Path to pretrained weights to evaluate.")
+    parser.add_argument(
+        "--checkpoint_key", type=str, default="teacher",
+        help='Key to use in the checkpoint (example: "teacher")')
+
+    # ---------------- optimisation ----------------
+    parser.add_argument(
+        '--epochs', type=int, default=100, help='Number of epochs of training.')
+    parser.add_argument(
+        "--lr", type=float, default=0.001,
+        help="""Learning rate at the beginning of
+        training (highest LR used during training). The learning rate is linearly scaled
+        with the batch size, and specified here for a reference batch size of 256.
+        We recommend tweaking the LR depending on the checkpoint evaluated.""")
+    parser.add_argument(
+        '--batch_size_per_gpu', type=int, default=128, help='Per-GPU batch-size')
+
+    # ---------------- distributed / data / output ----------------
+    parser.add_argument(
+        "--dist_url", type=str, default="env://",
+        help="""url used to set up
+        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
+    parser.add_argument(
+        "--local_rank", type=int, default=0,
+        help="Please ignore and do not set this argument.")
+    parser.add_argument('--data_path', type=str, default='/path/to/imagenet/')
+    parser.add_argument(
+        '--num_workers', type=int, default=10,
+        help='Number of data loading workers per GPU.')
+    parser.add_argument(
+        '--val_freq', type=int, default=1, help="Epoch frequency for validation.")
+    parser.add_argument(
+        '--output_dir', default=".", help='Path to save logs and checkpoints')
+    parser.add_argument(
+        '--num_labels', type=int, default=1000,
+        help='Number of labels for linear classifier')
+    parser.add_argument(
+        '--evaluate', action='store_true', dest='evaluate',
+        help='evaluate model on validation set')
+
+    # ``args`` stays a module-level name: train() and validate_network() read args.arch from it.
+    args = parser.parse_args()
+    eval_linear(args)
--- a/preprocessing/dino/eval_video_segmentation.py
+++ b/preprocessing/dino/eval_video_segmentation.py