# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
import torchvision.transforms as T
from .transforms import *
from .autoaugment import AutoAugment
def build_transforms(cfg, is_train=True):
res = []
if is_train:
size_train = cfg.INPUT.SIZE_TRAIN
# crop
do_crop = cfg.INPUT.CROP.ENABLED
crop_size = cfg.INPUT.CROP.SIZE
crop_scale = cfg.INPUT.CROP.SCALE
crop_ratio = cfg.INPUT.CROP.RATIO
# augmix augmentation
do_augmix = cfg.INPUT.AUGMIX.ENABLED
augmix_prob = cfg.INPUT.AUGMIX.PROB
# auto augmentation
do_autoaug = cfg.INPUT.AUTOAUG.ENABLED
autoaug_prob = cfg.INPUT.AUTOAUG.PROB
# horizontal flip
do_flip = cfg.INPUT.FLIP.ENABLED
flip_prob = cfg.INPUT.FLIP.PROB
# padding
do_pad = cfg.INPUT.PADDING.ENABLED
padding_size = cfg.INPUT.PADDING.SIZE
padding_mode = cfg.INPUT.PADDING.MODE
# color jitter
do_cj = cfg.INPUT.CJ.ENABLED
cj_prob = cfg.INPUT.CJ.PROB
cj_brightness = cfg.INPUT.CJ.BRIGHTNESS
cj_contrast = cfg.INPUT.CJ.CONTRAST
cj_saturation = cfg.INPUT.CJ.SATURATION
cj_hue = cfg.INPUT.CJ.HUE
# random affine
do_affine = cfg.INPUT.AFFINE.ENABLED
# random erasing
do_rea = cfg.INPUT.REA.ENABLED
rea_prob = cfg.INPUT.REA.PROB
rea_value = cfg.INPUT.REA.VALUE
# random patch
do_rpt = cfg.INPUT.RPT.ENABLED
rpt_prob = cfg.INPUT.RPT.PROB
if do_autoaug:
res.append(T.RandomApply([AutoAugment()], p=autoaug_prob))
if size_train[0] > 0:
res.append(T.Resize(size_train[0] if len(size_train) == 1 else size_train, interpolation=3))
if do_crop:
res.append(T.RandomResizedCrop(size=crop_size[0] if len(crop_size) == 1 else crop_size,
interpolation=3,
scale=crop_scale, ratio=crop_ratio))
if do_pad:
res.extend([T.Pad(padding_size, padding_mode=padding_mode),
T.RandomCrop(size_train[0] if len(size_train) == 1 else size_train)])
if do_flip:
res.append(T.RandomHorizontalFlip(p=flip_prob))
if do_cj:
res.append(T.RandomApply([T.ColorJitter(cj_brightness, cj_contrast, cj_saturation, cj_hue)], p=cj_prob))
if do_affine:
res.append(T.RandomAffine(degrees=10, translate=None, scale=[0.9, 1.1], shear=0.1, resample=False,
fillcolor=0))
if do_augmix:
res.append(AugMix(prob=augmix_prob))
res.append(ToTensor())
if do_rea:
res.append(T.RandomErasing(p=rea_prob, value=rea_value))
if do_rpt:
res.append(RandomPatch(prob_happen=rpt_prob))
else:
size_test = cfg.INPUT.SIZE_TEST
do_crop = cfg.INPUT.CROP.ENABLED
crop_size = cfg.INPUT.CROP.SIZE
if size_test[0] > 0:
res.append(T.Resize(size_test[0] if len(size_test) == 1 else size_test, interpolation=3))
if do_crop:
res.append(T.CenterCrop(size=crop_size[0] if len(crop_size) == 1 else crop_size))
res.append(ToTensor())
return T.Compose(res)
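# Usage sketch (illustrative, not part of the original file): given a
# fastreid-style CfgNode `cfg` with the INPUT fields read above,
#
#   train_transforms = build_transforms(cfg, is_train=True)   # augmentation pipeline
#   test_transforms = build_transforms(cfg, is_train=False)   # resize / center crop / ToTensor
#   tensor_img = train_transforms(pil_image)                  # PIL.Image -> torch.Tensor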
# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
import numpy as np
import torch
from PIL import Image, ImageOps, ImageEnhance
def to_tensor(pic):
"""Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
See ``ToTensor`` for more details.
Args:
pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
Returns:
Tensor: Converted image.
"""
if isinstance(pic, np.ndarray):
assert len(pic.shape) in (2, 3)
# handle numpy array
if pic.ndim == 2:
pic = pic[:, :, None]
img = torch.from_numpy(pic.transpose((2, 0, 1)))
# backward compatibility
if isinstance(img, torch.ByteTensor):
return img.float()
else:
return img
# handle PIL Image
if pic.mode == 'I':
img = torch.from_numpy(np.array(pic, np.int32, copy=False))
elif pic.mode == 'I;16':
img = torch.from_numpy(np.array(pic, np.int16, copy=False))
elif pic.mode == 'F':
img = torch.from_numpy(np.array(pic, np.float32, copy=False))
elif pic.mode == '1':
img = 255 * torch.from_numpy(np.array(pic, np.uint8, copy=False))
else:
img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
# PIL image mode: L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK
if pic.mode == 'YCbCr':
nchannel = 3
elif pic.mode == 'I;16':
nchannel = 1
else:
nchannel = len(pic.mode)
img = img.view(pic.size[1], pic.size[0], nchannel)
# put it from HWC to CHW format
# yikes, this transpose takes 80% of the loading time/CPU
img = img.transpose(0, 1).transpose(0, 2).contiguous()
if isinstance(img, torch.ByteTensor):
return img.float()
else:
return img
def int_parameter(level, maxval):
"""Helper function to scale `val` between 0 and maxval .
Args:
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
maxval: Maximum value that the operation can have. This will be scaled to
level/PARAMETER_MAX.
Returns:
An int that results from scaling `maxval` according to `level`.
"""
return int(level * maxval / 10)
def float_parameter(level, maxval):
"""Helper function to scale `val` between 0 and maxval.
Args:
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
maxval: Maximum value that the operation can have. This will be scaled to
level/PARAMETER_MAX.
Returns:
A float that results from scaling `maxval` according to `level`.
"""
return float(level) * maxval / 10.
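# Worked example: both helpers map an augmentation level in [0, 10] linearly
# onto [0, maxval]. For instance, int_parameter(5, 30) == 15 and
# float_parameter(5, 0.3) == 0.15, i.e. a mid-range level yields half of the
# maximum operation strength.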
def sample_level(n):
return np.random.uniform(low=0.1, high=n)
def autocontrast(pil_img, *args):
return ImageOps.autocontrast(pil_img)
def equalize(pil_img, *args):
return ImageOps.equalize(pil_img)
def posterize(pil_img, level, *args):
level = int_parameter(sample_level(level), 4)
return ImageOps.posterize(pil_img, 4 - level)
def rotate(pil_img, level, *args):
degrees = int_parameter(sample_level(level), 30)
if np.random.uniform() > 0.5:
degrees = -degrees
return pil_img.rotate(degrees, resample=Image.BILINEAR)
def solarize(pil_img, level, *args):
level = int_parameter(sample_level(level), 256)
return ImageOps.solarize(pil_img, 256 - level)
def shear_x(pil_img, level):
level = float_parameter(sample_level(level), 0.3)
if np.random.uniform() > 0.5:
level = -level
return pil_img.transform(pil_img.size,
Image.AFFINE, (1, level, 0, 0, 1, 0),
resample=Image.BILINEAR)
def shear_y(pil_img, level):
level = float_parameter(sample_level(level), 0.3)
if np.random.uniform() > 0.5:
level = -level
return pil_img.transform(pil_img.size,
Image.AFFINE, (1, 0, 0, level, 1, 0),
resample=Image.BILINEAR)
def translate_x(pil_img, level):
level = int_parameter(sample_level(level), pil_img.size[0] / 3)
if np.random.random() > 0.5:
level = -level
return pil_img.transform(pil_img.size,
Image.AFFINE, (1, 0, level, 0, 1, 0),
resample=Image.BILINEAR)
def translate_y(pil_img, level):
level = int_parameter(sample_level(level), pil_img.size[1] / 3)
if np.random.random() > 0.5:
level = -level
return pil_img.transform(pil_img.size,
Image.AFFINE, (1, 0, 0, 0, 1, level),
resample=Image.BILINEAR)
# operation that overlaps with ImageNet-C's test set
def color(pil_img, level, *args):
level = float_parameter(sample_level(level), 1.8) + 0.1
return ImageEnhance.Color(pil_img).enhance(level)
# operation that overlaps with ImageNet-C's test set
def contrast(pil_img, level, *args):
level = float_parameter(sample_level(level), 1.8) + 0.1
return ImageEnhance.Contrast(pil_img).enhance(level)
# operation that overlaps with ImageNet-C's test set
def brightness(pil_img, level, *args):
level = float_parameter(sample_level(level), 1.8) + 0.1
return ImageEnhance.Brightness(pil_img).enhance(level)
# operation that overlaps with ImageNet-C's test set
def sharpness(pil_img, level, *args):
level = float_parameter(sample_level(level), 1.8) + 0.1
return ImageEnhance.Sharpness(pil_img).enhance(level)
augmentations = [
autocontrast, equalize, posterize, rotate, solarize, shear_x, shear_y,
translate_x, translate_y
]
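# Usage sketch (illustrative): every op above takes a PIL image and a severity
# level and returns a transformed PIL image, so a random op can be applied as:
#
#   import numpy as np
#   op = np.random.choice(augmentations)
#   out = op(pil_img, 3)  # severity level 3 out of 10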
# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
__all__ = ['ToTensor', 'RandomPatch', 'AugMix', ]
import math
import random
from collections import deque
import numpy as np
import torch
from .functional import to_tensor, augmentations
class ToTensor(object):
"""Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
Converts a PIL Image or numpy.ndarray (H x W x C) in the range
[0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 255.0]
if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
or if the numpy.ndarray has dtype = np.uint8
In the other cases, tensors are returned without scaling.
"""
def __call__(self, pic):
"""
Args:
pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
Returns:
Tensor: Converted image.
"""
return to_tensor(pic)
def __repr__(self):
return self.__class__.__name__ + '()'
class RandomPatch(object):
"""Random patch data augmentation.
There is a patch pool that stores randomly extracted patches from person images.
For each input image, RandomPatch
1) extracts a random patch and stores the patch in the patch pool;
2) randomly selects a patch from the patch pool and pastes it on the
input (at random position) to simulate occlusion.
Reference:
- Zhou et al. Omni-Scale Feature Learning for Person Re-Identification. ICCV, 2019.
- Zhou et al. Learning Generalisable Omni-Scale Representations
for Person Re-Identification. arXiv preprint, 2019.
"""
def __init__(self, prob_happen=0.5, pool_capacity=50000, min_sample_size=100,
patch_min_area=0.01, patch_max_area=0.5, patch_min_ratio=0.1, prob_flip_leftright=0.5,
):
self.prob_happen = prob_happen
self.patch_min_area = patch_min_area
self.patch_max_area = patch_max_area
self.patch_min_ratio = patch_min_ratio
self.prob_flip_leftright = prob_flip_leftright
self.patchpool = deque(maxlen=pool_capacity)
self.min_sample_size = min_sample_size
def generate_wh(self, W, H):
area = W * H
for attempt in range(100):
target_area = random.uniform(self.patch_min_area, self.patch_max_area) * area
aspect_ratio = random.uniform(self.patch_min_ratio, 1. / self.patch_min_ratio)
h = int(round(math.sqrt(target_area * aspect_ratio)))
w = int(round(math.sqrt(target_area / aspect_ratio)))
if w < W and h < H:
return w, h
return None, None
def transform_patch(self, patch):
if random.uniform(0, 1) > self.prob_flip_leftright:
patch = torch.flip(patch, dims=[2])
return patch
def __call__(self, img):
_, H, W = img.size() # original image size
# collect new patch
w, h = self.generate_wh(W, H)
if w is not None and h is not None:
x1 = random.randint(0, W - w)
y1 = random.randint(0, H - h)
new_patch = img[..., y1:y1 + h, x1:x1 + w]
self.patchpool.append(new_patch)
if len(self.patchpool) < self.min_sample_size:
return img
if random.uniform(0, 1) > self.prob_happen:
return img
# paste a randomly selected patch on a random position
patch = random.sample(self.patchpool, 1)[0]
_, patchH, patchW = patch.size()
x1 = random.randint(0, W - patchW)
y1 = random.randint(0, H - patchH)
patch = self.transform_patch(patch)
img[..., y1:y1 + patchH, x1:x1 + patchW] = patch
return img
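# Usage sketch (illustrative): RandomPatch operates on (C, H, W) tensors, so it
# is applied after ToTensor (see build_transforms above). The pool must first
# collect `min_sample_size` patches before any pasting happens:
#
#   rp = RandomPatch(prob_happen=0.5)
#   for img in tensor_images:  # each img is a (C, H, W) tensor
#       img = rp(img)          # may paste a previously collected patch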
class AugMix(object):
""" Perform AugMix augmentation and compute mixture.
"""
def __init__(self, prob=0.5, aug_prob_coeff=0.1, mixture_width=3, mixture_depth=1, aug_severity=1):
"""
Args:
prob: Probability of applying AugMix.
aug_prob_coeff: Probability distribution coefficients.
mixture_width: Number of augmentation chains to mix per augmented example.
mixture_depth: Depth of augmentation chains. -1 denotes stochastic depth in [1, 3].
aug_severity: Severity of underlying augmentation operators (between 1 and 10).
"""
# fmt: off
self.prob = prob
self.aug_prob_coeff = aug_prob_coeff
self.mixture_width = mixture_width
self.mixture_depth = mixture_depth
self.aug_severity = aug_severity
self.augmentations = augmentations
# fmt: on
def __call__(self, image):
"""Perform AugMix augmentations and compute mixture.
Returns:
mixed: Augmented and mixed image.
"""
if random.random() > self.prob:
# Avoid the warning: the given NumPy array is not writeable
return np.asarray(image).copy()
ws = np.float32(
np.random.dirichlet([self.aug_prob_coeff] * self.mixture_width))
m = np.float32(np.random.beta(self.aug_prob_coeff, self.aug_prob_coeff))
mix = np.zeros([image.size[1], image.size[0], 3])
for i in range(self.mixture_width):
image_aug = image.copy()
depth = self.mixture_depth if self.mixture_depth > 0 else np.random.randint(1, 4)
for _ in range(depth):
op = np.random.choice(self.augmentations)
image_aug = op(image_aug, self.aug_severity)
mix += ws[i] * np.asarray(image_aug)
mixed = (1 - m) * image + m * mix
return mixed.astype(np.uint8)
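# Usage sketch (illustrative): AugMix takes a PIL image and returns a uint8
# numpy array, which is why build_transforms places it before ToTensor:
#
#   augmix = AugMix(prob=0.5, mixture_width=3, mixture_depth=-1)
#   mixed = augmix(pil_img)  # np.uint8 array of shape (H, W, 3)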
# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
from .train_loop import *
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__)
# but still make them available here
from .hooks import *
from .defaults import *
from .launch import *
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
This file contains components with some default boilerplate logic user may need
in training / testing. They will not work for everyone, but many users may find them useful.
The behavior of functions/classes in this file is subject to change,
since they are meant to represent the "common default behavior" people need in their projects.
"""
import argparse
import logging
import os
import sys
from collections import OrderedDict
import torch
from torch.nn.parallel import DistributedDataParallel
from fastreid.data import build_reid_test_loader, build_reid_train_loader
from fastreid.evaluation import (ReidEvaluator,
inference_on_dataset, print_csv_format)
from fastreid.modeling.meta_arch import build_model
from fastreid.solver import build_lr_scheduler, build_optimizer
from fastreid.utils import comm
from fastreid.utils.checkpoint import Checkpointer
from fastreid.utils.collect_env import collect_env_info
from fastreid.utils.env import seed_all_rng
from fastreid.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
from fastreid.utils.file_io import PathManager
from fastreid.utils.logger import setup_logger
from . import hooks
from .train_loop import TrainerBase, AMPTrainer, SimpleTrainer
__all__ = ["default_argument_parser", "default_setup", "DefaultPredictor", "DefaultTrainer"]
def default_argument_parser():
"""
Create a parser with some common arguments used by fastreid users.
Returns:
argparse.ArgumentParser:
"""
parser = argparse.ArgumentParser(description="fastreid Training")
parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
parser.add_argument(
"--resume",
action="store_true",
help="whether to attempt to resume from the checkpoint directory",
)
parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
parser.add_argument("--num-machines", type=int, default=1, help="total number of machines")
parser.add_argument(
"--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
)
# PyTorch still may leave orphan processes in multi-gpu training.
# Therefore we use a deterministic way to obtain port,
# so that users are aware of orphan processes by seeing the port occupied.
port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14
parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port))
parser.add_argument(
"opts",
help="Modify config options using the command-line",
default=None,
nargs=argparse.REMAINDER,
)
return parser
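# Usage sketch (illustrative): a typical tools/train_net.py entry point parses
# these arguments and forwards them to `launch` (defined in launch.py below),
# where `main(args)` is your training entry function:
#
#   args = default_argument_parser().parse_args()
#   launch(main, args.num_gpus, num_machines=args.num_machines,
#          machine_rank=args.machine_rank, dist_url=args.dist_url, args=(args,))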
def default_setup(cfg, args):
"""
Perform some basic common setups at the beginning of a job, including:
1. Set up the fastreid logger
2. Log basic information about environment, cmdline arguments, and config
3. Backup the config to the output directory
Args:
cfg (CfgNode): the full config to be used
args (argparse.Namespace): the command line arguments to be logged
"""
output_dir = cfg.OUTPUT_DIR
if comm.is_main_process() and output_dir:
PathManager.mkdirs(output_dir)
rank = comm.get_rank()
# setup_logger(output_dir, distributed_rank=rank, name="fvcore")
logger = setup_logger(output_dir, distributed_rank=rank)
logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
logger.info("Environment info:\n" + collect_env_info())
logger.info("Command line arguments: " + str(args))
if hasattr(args, "config_file") and args.config_file != "":
logger.info(
"Contents of args.config_file={}:\n{}".format(
args.config_file, PathManager.open(args.config_file, "r").read()
)
)
logger.info("Running with full config:\n{}".format(cfg))
if comm.is_main_process() and output_dir:
# Note: some of our scripts may expect the existence of
# config.yaml in output directory
path = os.path.join(output_dir, "config.yaml")
with PathManager.open(path, "w") as f:
f.write(cfg.dump())
logger.info("Full config saved to {}".format(os.path.abspath(path)))
# make sure each worker has a different, yet deterministic seed if specified
seed_all_rng()
# cudnn benchmark has large overhead. It shouldn't be used considering the small size of
# typical validation set.
if not (hasattr(args, "eval_only") and args.eval_only):
torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
class DefaultPredictor:
"""
Create a simple end-to-end predictor with the given config.
The predictor takes a batched image tensor of shape (B, C, H, W), runs the
model, and produces the output features.
This predictor takes care of model building and weight loading for you.
If you'd like to do anything more fancy, please refer to its source code
as examples to build and use the model manually.
Attributes:
Examples:
.. code-block:: python
pred = DefaultPredictor(cfg)
inputs = cv2.imread("input.jpg")
outputs = pred(inputs)
"""
def __init__(self, cfg):
self.cfg = cfg.clone() # cfg can be modified by model
self.cfg.defrost()
self.cfg.MODEL.BACKBONE.PRETRAIN = False
self.model = build_model(self.cfg)
self.model.eval()
Checkpointer(self.model).load(cfg.MODEL.WEIGHTS)
def __call__(self, image):
"""
Args:
image (torch.tensor): an image tensor of shape (B, C, H, W).
Returns:
predictions (torch.tensor): the output features of the model
"""
inputs = {"images": image.to(self.model.device)}
with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258
predictions = self.model(inputs)
return predictions.cpu()
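# Usage sketch (illustrative): __call__ expects a batched (B, C, H, W) float
# tensor, so an OpenCV image must be preprocessed first (HWC uint8 -> CHW
# float, plus a batch dimension):
#
#   import cv2, torch
#   pred = DefaultPredictor(cfg)
#   img = cv2.imread("input.jpg")
#   batch = torch.as_tensor(img.transpose(2, 0, 1)[None]).float()
#   feats = pred(batch)  # feature tensor on CPU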
class DefaultTrainer(TrainerBase):
"""
A trainer with default training logic. Compared to `SimpleTrainer`, it
contains the following logic in addition:
1. Create model, optimizer, scheduler, dataloader from the given config.
2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if it exists.
3. Register a few common hooks.
It is created to simplify the **standard model training workflow** and reduce code boilerplate
for users who only need the standard training workflow, with standard features.
It means this class makes *many assumptions* about your training logic that
may easily become invalid in new research. In fact, any assumptions beyond those made in the
:class:`SimpleTrainer` are too much for research.
The code of this class has been annotated with the restrictive assumptions it makes.
When they do not work for you, you're encouraged to:
1. Overwrite methods of this class, OR:
2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
nothing else. You can then add your own hooks if needed. OR:
3. Write your own training loop similar to `tools/plain_train_net.py`.
Also note that the behavior of this class, like other functions/classes in
this file, is not stable, since it is meant to represent the "common default behavior".
It is only guaranteed to work well with the standard models and training workflow in fastreid.
To obtain more stable behavior, write your own training logic with other public APIs.
Attributes:
scheduler:
checkpointer:
cfg (CfgNode):
Examples:
.. code-block:: python
trainer = DefaultTrainer(cfg)
trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS
trainer.train()
"""
def __init__(self, cfg):
"""
Args:
cfg (CfgNode):
"""
super().__init__()
logger = logging.getLogger("fastreid")
if not logger.isEnabledFor(logging.INFO): # setup_logger is not called for fastreid
setup_logger()
# Assume these objects must be constructed in this order.
data_loader = self.build_train_loader(cfg)
cfg = self.auto_scale_hyperparams(cfg, data_loader.dataset.num_classes)
model = self.build_model(cfg)
optimizer, param_wrapper = self.build_optimizer(cfg, model)
# For training, wrap with DDP. But don't need this for inference.
if comm.get_world_size() > 1:
# See https://github.com/pytorch/pytorch/issues/22049 on setting `find_unused_parameters=True`
# when some of the parameters are not updated.
model = DistributedDataParallel(
model, device_ids=[comm.get_local_rank()], broadcast_buffers=False,
)
self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
model, data_loader, optimizer, param_wrapper
)
self.iters_per_epoch = len(data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH
self.scheduler = self.build_lr_scheduler(cfg, optimizer, self.iters_per_epoch)
# Assume no other objects need to be checkpointed.
# We can later make it checkpoint the stateful hooks
self.checkpointer = Checkpointer(
# Assume you want to save checkpoints together with logs/statistics
model,
cfg.OUTPUT_DIR,
save_to_disk=comm.is_main_process(),
optimizer=optimizer,
**self.scheduler,
)
self.start_epoch = 0
self.max_epoch = cfg.SOLVER.MAX_EPOCH
self.max_iter = self.max_epoch * self.iters_per_epoch
self.warmup_iters = cfg.SOLVER.WARMUP_ITERS
self.delay_epochs = cfg.SOLVER.DELAY_EPOCHS
self.cfg = cfg
self.register_hooks(self.build_hooks())
def resume_or_load(self, resume=True):
"""
If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
a `last_checkpoint` file), resume from that file. Resuming means loading all
available states (e.g. optimizer and scheduler) and updating the iteration counter
from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
Otherwise, this is considered an independent training run. The method will load model
weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
from iteration 0.
Args:
resume (bool): whether to do resume or not
"""
# The checkpoint stores the training iteration that just finished, thus we start
# at the next iteration (or iter zero if there's no checkpoint).
checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
if resume and self.checkpointer.has_checkpoint():
self.start_epoch = checkpoint.get("epoch", -1) + 1
def build_hooks(self):
"""
Build a list of default hooks, including timing, evaluation,
checkpointing, lr scheduling, precise BN, writing events.
Returns:
list[HookBase]:
"""
logger = logging.getLogger(__name__)
cfg = self.cfg.clone()
cfg.defrost()
cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN
cfg.DATASETS.NAMES = tuple([cfg.TEST.PRECISE_BN.DATASET]) # set dataset name for PreciseBN
ret = [
hooks.IterationTimer(),
hooks.LRScheduler(self.optimizer, self.scheduler),
]
if cfg.TEST.PRECISE_BN.ENABLED and hooks.get_bn_modules(self.model):
logger.info("Prepare precise BN dataset")
ret.append(hooks.PreciseBN(
# Run at the same freq as (but before) evaluation.
self.model,
# Build a new data loader to not affect training
self.build_train_loader(cfg),
cfg.TEST.PRECISE_BN.NUM_ITER,
))
if len(cfg.MODEL.FREEZE_LAYERS) > 0 and cfg.SOLVER.FREEZE_ITERS > 0:
ret.append(hooks.LayerFreeze(
self.model,
cfg.MODEL.FREEZE_LAYERS,
cfg.SOLVER.FREEZE_ITERS,
))
# Do PreciseBN before checkpointer, because it updates the model and needs to
# be saved by the checkpointer.
# This is not always the best: if checkpointing has a different frequency,
# some checkpoints may have more precise statistics than others.
def test_and_save_results():
self._last_eval_results = self.test(self.cfg, self.model)
return self._last_eval_results
# Do evaluation before checkpointer, because then if it fails,
# we can use the saved checkpoint to debug.
ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
if comm.is_main_process():
ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
# run writers in the end, so that evaluation metrics are written
ret.append(hooks.PeriodicWriter(self.build_writers(), 200))
return ret
def build_writers(self):
"""
Build a list of writers to be used. By default it contains
writers that write metrics to the screen,
a json file, and a tensorboard event file respectively.
If you'd like a different list of writers, you can overwrite it in
your trainer.
Returns:
list[EventWriter]: a list of :class:`EventWriter` objects.
It is now implemented by:
.. code-block:: python
return [
CommonMetricPrinter(self.max_iter),
JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
TensorboardXWriter(self.cfg.OUTPUT_DIR),
]
"""
# Assume the default print/log frequency.
return [
# It may not always print what you want to see, since it prints "common" metrics only.
CommonMetricPrinter(self.max_iter),
JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
TensorboardXWriter(self.cfg.OUTPUT_DIR),
]
def train(self):
"""
Run training.
Returns:
OrderedDict of results, if evaluation is enabled. Otherwise None.
"""
super().train(self.start_epoch, self.max_epoch, self.iters_per_epoch)
if comm.is_main_process():
assert hasattr(
self, "_last_eval_results"
), "No evaluation results obtained during training!"
return self._last_eval_results
def run_step(self):
self._trainer.iter = self.iter
self._trainer.run_step()
@classmethod
def build_model(cls, cfg):
"""
Returns:
torch.nn.Module:
It now calls :func:`fastreid.modeling.build_model`.
Overwrite it if you'd like a different model.
"""
model = build_model(cfg)
logger = logging.getLogger(__name__)
logger.info("Model:\n{}".format(model))
return model
@classmethod
def build_optimizer(cls, cfg, model):
"""
Returns:
torch.optim.Optimizer:
It now calls :func:`fastreid.solver.build_optimizer`.
Overwrite it if you'd like a different optimizer.
"""
return build_optimizer(cfg, model)
@classmethod
def build_lr_scheduler(cls, cfg, optimizer, iters_per_epoch):
"""
It now calls :func:`fastreid.solver.build_lr_scheduler`.
Overwrite it if you'd like a different scheduler.
"""
return build_lr_scheduler(cfg, optimizer, iters_per_epoch)
@classmethod
def build_train_loader(cls, cfg):
"""
Returns:
iterable
It now calls :func:`fastreid.data.build_reid_train_loader`.
Overwrite it if you'd like a different data loader.
"""
logger = logging.getLogger(__name__)
logger.info("Prepare training set")
return build_reid_train_loader(cfg, combineall=cfg.DATASETS.COMBINEALL)
@classmethod
def build_test_loader(cls, cfg, dataset_name):
"""
Returns:
iterable
It now calls :func:`fastreid.data.build_reid_test_loader`.
Overwrite it if you'd like a different data loader.
"""
return build_reid_test_loader(cfg, dataset_name=dataset_name)
@classmethod
def build_evaluator(cls, cfg, dataset_name, output_dir=None):
data_loader, num_query = cls.build_test_loader(cfg, dataset_name)
return data_loader, ReidEvaluator(cfg, num_query, output_dir)
@classmethod
def test(cls, cfg, model):
"""
Args:
cfg (CfgNode):
model (nn.Module):
Returns:
dict: a dict of result metrics
"""
logger = logging.getLogger(__name__)
results = OrderedDict()
for idx, dataset_name in enumerate(cfg.DATASETS.TESTS):
logger.info("Prepare testing set")
try:
data_loader, evaluator = cls.build_evaluator(cfg, dataset_name)
except NotImplementedError:
logger.warning(
"No evaluator found. Implement the `build_evaluator` method to enable evaluation."
)
results[dataset_name] = {}
continue
results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP.ENABLED)
results[dataset_name] = results_i
if comm.is_main_process():
assert isinstance(
results, dict
), "Evaluator must return a dict on the main process. Got {} instead.".format(
results
)
logger.info("Evaluation results for {} in csv format:".format(dataset_name))
results_i['dataset'] = dataset_name
print_csv_format(results_i)
if len(results) == 1:
results = list(results.values())[0]
return results
@staticmethod
def auto_scale_hyperparams(cfg, num_classes):
r"""
This is used to automatically compute the actual number of training iterations,
because some hyper-parameters, such as MAX_ITER, are specified in epochs rather than iterations,
so we need to convert those hyper-parameters to training iterations.
"""
cfg = cfg.clone()
frozen = cfg.is_frozen()
cfg.defrost()
# If you don't hard-code the number of classes, it will compute the number automatically
if cfg.MODEL.HEADS.NUM_CLASSES == 0:
output_dir = cfg.OUTPUT_DIR
cfg.MODEL.HEADS.NUM_CLASSES = num_classes
logger = logging.getLogger(__name__)
logger.info(f"Auto-scaling the num_classes={cfg.MODEL.HEADS.NUM_CLASSES}")
# Update the saved config file to make the number of classes valid
if comm.is_main_process() and output_dir:
# Note: some of our scripts may expect the existence of
# config.yaml in output directory
path = os.path.join(output_dir, "config.yaml")
with PathManager.open(path, "w") as f:
f.write(cfg.dump())
if frozen: cfg.freeze()
return cfg
# Access basic attributes from the underlying trainer
for _attr in ["model", "data_loader", "optimizer", "grad_scaler"]:
setattr(DefaultTrainer, _attr, property(lambda self, x=_attr: getattr(self._trainer, x, None)))
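# Usage sketch (illustrative): the standard training workflow built on the
# classes above, as in a typical tools/train_net.py (`setup` is a hypothetical
# helper that builds and freezes the config):
#
#   cfg = setup(args)
#   if args.eval_only:
#       model = DefaultTrainer.build_model(cfg)
#       Checkpointer(model).load(cfg.MODEL.WEIGHTS)
#       res = DefaultTrainer.test(cfg, model)
#   else:
#       trainer = DefaultTrainer(cfg)
#       trainer.resume_or_load(resume=args.resume)
#       trainer.train()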
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import datetime
import itertools
import logging
import os
import tempfile
import time
from collections import Counter
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from fastreid.evaluation.testing import flatten_results_dict
from fastreid.solver import optim
from fastreid.utils import comm
from fastreid.utils.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
from fastreid.utils.events import EventStorage, EventWriter, get_event_storage
from fastreid.utils.file_io import PathManager
from fastreid.utils.precision_bn import update_bn_stats, get_bn_modules
from fastreid.utils.timer import Timer
from .train_loop import HookBase
__all__ = [
"CallbackHook",
"IterationTimer",
"PeriodicWriter",
"PeriodicCheckpointer",
"LRScheduler",
"AutogradProfiler",
"EvalHook",
"PreciseBN",
"LayerFreeze",
]
"""
Implement some common hooks.
"""
class CallbackHook(HookBase):
"""
Create a hook using callback functions provided by the user.
"""
def __init__(self, *, before_train=None, after_train=None, before_epoch=None, after_epoch=None,
before_step=None, after_step=None):
"""
Each argument is a function that takes one argument: the trainer.
"""
self._before_train = before_train
self._before_epoch = before_epoch
self._before_step = before_step
self._after_step = after_step
self._after_epoch = after_epoch
self._after_train = after_train
def before_train(self):
if self._before_train:
self._before_train(self.trainer)
def after_train(self):
if self._after_train:
self._after_train(self.trainer)
# The functions may be closures that hold a reference to the trainer.
# Therefore, delete them to avoid a circular reference.
del self._before_train, self._after_train
del self._before_step, self._after_step
def before_epoch(self):
if self._before_epoch:
self._before_epoch(self.trainer)
def after_epoch(self):
if self._after_epoch:
self._after_epoch(self.trainer)
def before_step(self):
if self._before_step:
self._before_step(self.trainer)
def after_step(self):
if self._after_step:
self._after_step(self.trainer)
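# Usage sketch (illustrative): CallbackHook turns plain functions into hooks,
# e.g. to log the learning rate at the end of every epoch:
#
#   hook = CallbackHook(
#       after_epoch=lambda trainer: print(
#           "epoch", trainer.epoch, "lr", trainer.optimizer.param_groups[0]["lr"])
#   )
#   trainer.register_hooks([hook])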
class IterationTimer(HookBase):
"""
Track the time spent for each iteration (each run_step call in the trainer).
Print a summary at the end of training.
This hook uses the time between the call to its :meth:`before_step`
and :meth:`after_step` methods.
Under the convention that :meth:`before_step` of all hooks should only
take a negligible amount of time, the :class:`IterationTimer` hook should be
placed at the beginning of the list of hooks to obtain accurate timing.
"""
def __init__(self, warmup_iter=3):
"""
Args:
warmup_iter (int): the number of iterations at the beginning to exclude
from timing.
"""
self._warmup_iter = warmup_iter
self._step_timer = Timer()
def before_train(self):
self._start_time = time.perf_counter()
self._total_timer = Timer()
self._total_timer.pause()
def after_train(self):
logger = logging.getLogger(__name__)
total_time = time.perf_counter() - self._start_time
total_time_minus_hooks = self._total_timer.seconds()
hook_time = total_time - total_time_minus_hooks
num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
if num_iter > 0 and total_time_minus_hooks > 0:
# Speed is meaningful only after warmup
# NOTE this format is parsed by grep in some scripts
logger.info(
"Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
num_iter,
str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
total_time_minus_hooks / num_iter,
)
)
logger.info(
"Total training time: {} ({} on hooks)".format(
str(datetime.timedelta(seconds=int(total_time))),
str(datetime.timedelta(seconds=int(hook_time))),
)
)
def before_step(self):
self._step_timer.reset()
self._total_timer.resume()
def after_step(self):
# +1 because we're in after_step
iter_done = self.trainer.iter - self.trainer.start_iter + 1
if iter_done >= self._warmup_iter:
sec = self._step_timer.seconds()
self.trainer.storage.put_scalars(time=sec)
else:
self._start_time = time.perf_counter()
self._total_timer.reset()
self._total_timer.pause()
class PeriodicWriter(HookBase):
"""
Write events to EventStorage periodically.
It is executed every ``period`` iterations and after the last iteration.
"""
def __init__(self, writers, period=20):
"""
Args:
writers (list[EventWriter]): a list of EventWriter objects
period (int):
"""
self._writers = writers
for w in writers:
assert isinstance(w, EventWriter), w
self._period = period
def after_step(self):
if (self.trainer.iter + 1) % self._period == 0 or (
self.trainer.iter == self.trainer.max_iter - 1
):
for writer in self._writers:
writer.write()
def after_epoch(self):
for writer in self._writers:
writer.write()
def after_train(self):
for writer in self._writers:
writer.close()
class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
"""
Same as :class:`fastreid.utils.checkpoint.PeriodicCheckpointer`, but as a hook.
Note that when used as a hook,
it is unable to save additional data other than what's defined
by the given `checkpointer`.
It is executed every ``period`` iterations and after the last iteration.
"""
def before_train(self):
self.max_epoch = self.trainer.max_epoch
if len(self.trainer.cfg.DATASETS.TESTS) == 1:
self.metric_name = "metric"
else:
self.metric_name = self.trainer.cfg.DATASETS.TESTS[0] + "/metric"
def after_epoch(self):
# No way to use **kwargs
storage = get_event_storage()
metric_dict = dict(
metric=storage.latest()[self.metric_name][0] if self.metric_name in storage.latest() else -1
)
self.step(self.trainer.epoch, **metric_dict)
class LRScheduler(HookBase):
"""
A hook which executes a torch builtin LR scheduler and summarizes the LR.
It is executed after every iteration.
"""
def __init__(self, optimizer, scheduler):
"""
Args:
optimizer (torch.optim.Optimizer):
scheduler (torch.optim._LRScheduler)
"""
self._optimizer = optimizer
self._scheduler = scheduler
self._scale = 0
# NOTE: some heuristics on what LR to summarize
# summarize the param group with most parameters
largest_group = max(len(g["params"]) for g in optimizer.param_groups)
if largest_group == 1:
# If all groups have one parameter,
# then find the most common initial LR, and use it for summary
lr_count = Counter([g["lr"] for g in optimizer.param_groups])
lr = lr_count.most_common()[0][0]
for i, g in enumerate(optimizer.param_groups):
if g["lr"] == lr:
self._best_param_group_id = i
break
else:
for i, g in enumerate(optimizer.param_groups):
if len(g["params"]) == largest_group:
self._best_param_group_id = i
break
def before_step(self):
if self.trainer.grad_scaler is not None:
self._scale = self.trainer.grad_scaler.get_scale()
def after_step(self):
lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
next_iter = self.trainer.iter + 1
if next_iter <= self.trainer.warmup_iters:
if self.trainer.grad_scaler is None or self._scale == self.trainer.grad_scaler.get_scale():
self._scheduler["warmup_sched"].step()
def after_epoch(self):
next_iter = self.trainer.iter + 1
next_epoch = self.trainer.epoch + 1
if next_iter > self.trainer.warmup_iters and next_epoch > self.trainer.delay_epochs:
self._scheduler["lr_sched"].step()
class AutogradProfiler(HookBase):
"""
A hook which runs `torch.autograd.profiler.profile`.
Examples:
.. code-block:: python
hooks.AutogradProfiler(
lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR
)
The above example will run the profiler for iterations 10-20 and dump
results to ``OUTPUT_DIR``. We did not profile the first few iterations
because they are typically slower than the rest.
The result files can be loaded in the ``chrome://tracing`` page in the Chrome browser.
Note:
When used together with NCCL on older versions of GPUs,
autograd profiler may cause deadlock because it unnecessarily allocates
memory on every device it sees. The memory management calls, if
interleaved with NCCL calls, lead to deadlock on GPUs that do not
support `cudaLaunchCooperativeKernelMultiDevice`.
"""
def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
"""
Args:
enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
and returns whether to enable the profiler.
It will be called once every step, and can be used to select which steps to profile.
output_dir (str): the output directory to dump tracing files.
use_cuda (bool): same as in `torch.autograd.profiler.profile`.
"""
self._enable_predicate = enable_predicate
self._use_cuda = use_cuda
self._output_dir = output_dir
def before_step(self):
if self._enable_predicate(self.trainer):
self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
self._profiler.__enter__()
else:
self._profiler = None
def after_step(self):
if self._profiler is None:
return
self._profiler.__exit__(None, None, None)
out_file = os.path.join(
self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter)
)
if "://" not in out_file:
self._profiler.export_chrome_trace(out_file)
else:
# Support non-posix filesystems
with tempfile.TemporaryDirectory(prefix="fastreid_profiler") as d:
tmp_file = os.path.join(d, "tmp.json")
self._profiler.export_chrome_trace(tmp_file)
with open(tmp_file) as f:
content = f.read()
with PathManager.open(out_file, "w") as f:
f.write(content)
class EvalHook(HookBase):
"""
Run an evaluation function periodically, and at the end of training.
It is executed every ``eval_period`` iterations and after the last iteration.
"""
def __init__(self, eval_period, eval_function):
"""
Args:
eval_period (int): the period to run `eval_function`.
eval_function (callable): a function which takes no arguments, and
returns a nested dict of evaluation metrics.
Note:
This hook must be enabled in either all workers or none.
If you would like only certain workers to perform evaluation,
give other workers a no-op function (`eval_function=lambda: None`).
"""
self._period = eval_period
self._func = eval_function
def _do_eval(self):
results = self._func()
if results:
assert isinstance(
results, dict
), "Eval function must return a dict. Got {} instead.".format(results)
flattened_results = flatten_results_dict(results)
for k, v in flattened_results.items():
try:
v = float(v)
except Exception:
raise ValueError(
"[EvalHook] eval_function should return a nested dict of float. "
"Got '{}: {}' instead.".format(k, v)
)
self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
torch.cuda.empty_cache()
# Evaluation may take a different amount of time on each worker.
# A barrier makes them start the next iteration together.
comm.synchronize()
def after_epoch(self):
next_epoch = self.trainer.epoch + 1
if self._period > 0 and next_epoch % self._period == 0:
self._do_eval()
def after_train(self):
next_epoch = self.trainer.epoch + 1
# This condition is to prevent the eval from running after a failed training
if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
self._do_eval()
# func is likely a closure that holds a reference to the trainer,
# therefore we delete it at the end to avoid a circular reference
del self._func
class PreciseBN(HookBase):
"""
The standard implementation of BatchNorm uses EMA in inference, which is
sometimes suboptimal.
This class computes the true average of statistics rather than the moving average,
and puts the true averages into every BN layer in the given model.
It is executed after the last iteration.
"""
def __init__(self, model, data_loader, num_iter):
"""
Args:
model (nn.Module): a module whose BN layers in training mode will be
updated by precise BN.
Note that the user is responsible for ensuring that the BN layers to be
updated are in training mode when this hook is triggered.
data_loader (iterable): it will produce data to be run by `model(data)`.
num_iter (int): number of iterations used to compute the precise
statistics.
"""
self._logger = logging.getLogger(__name__)
if len(get_bn_modules(model)) == 0:
self._logger.info(
"PreciseBN is disabled because model does not contain BN layers in training mode."
)
self._disabled = True
return
self._model = model
self._data_loader = data_loader
self._num_iter = num_iter
self._disabled = False
self._data_iter = None
def after_epoch(self):
next_epoch = self.trainer.epoch + 1
is_final = next_epoch == self.trainer.max_epoch
if is_final:
self.update_stats()
def update_stats(self):
"""
Update the model with precise statistics. Users can manually call this method.
"""
if self._disabled:
return
if self._data_iter is None:
self._data_iter = iter(self._data_loader)
def data_loader():
for num_iter in itertools.count(1):
if num_iter % 100 == 0:
self._logger.info(
"Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
)
# This way we can reuse the same iterator
yield next(self._data_iter)
with EventStorage(): # capture events in a new storage to discard them
self._logger.info(
"Running precise-BN for {} iterations... ".format(self._num_iter)
+ "Note that this could produce different statistics every time."
)
update_bn_stats(self._model, data_loader(), self._num_iter)
class LayerFreeze(HookBase):
def __init__(self, model, freeze_layers, freeze_iters):
self._logger = logging.getLogger(__name__)
if isinstance(model, DistributedDataParallel):
model = model.module
self.model = model
self.freeze_layers = freeze_layers
self.freeze_iters = freeze_iters
self.is_frozen = False
def before_step(self):
# Freeze specific layers
if self.trainer.iter < self.freeze_iters and not self.is_frozen:
self.freeze_specific_layer()
# Recover original layers status
if self.trainer.iter >= self.freeze_iters and self.is_frozen:
self.open_all_layer()
def freeze_specific_layer(self):
for layer in self.freeze_layers:
if not hasattr(self.model, layer):
self._logger.info(f'{layer} is not an attribute of the model, will skip this layer')
for name, module in self.model.named_children():
if name in self.freeze_layers:
# Change BN in freeze layers to eval mode
module.eval()
self.is_frozen = True
freeze_layers = ", ".join(self.freeze_layers)
self._logger.info(f'Freeze layer group "{freeze_layers}" training for {self.freeze_iters:d} iterations')
def open_all_layer(self):
for name, module in self.model.named_children():
if name in self.freeze_layers:
module.train()
self.is_frozen = False
freeze_layers = ", ".join(self.freeze_layers)
self._logger.info(f'Open layer group "{freeze_layers}" training')
class SWA(HookBase):
def __init__(self, swa_start: int, swa_freq: int, swa_lr_factor: float, eta_min: float, lr_sched=False, ):
self.swa_start = swa_start
self.swa_freq = swa_freq
self.swa_lr_factor = swa_lr_factor
self.eta_min = eta_min
self.lr_sched = lr_sched
def before_step(self):
is_swa = self.trainer.iter == self.swa_start
if is_swa:
# Wrap the optimizer with SWA
self.trainer.optimizer = optim.SWA(self.trainer.optimizer, self.swa_freq, self.swa_lr_factor)
self.trainer.optimizer.reset_lr_to_swa()
if self.lr_sched:
self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
optimizer=self.trainer.optimizer,
T_0=self.swa_freq,
eta_min=self.eta_min,
)
def after_step(self):
next_iter = self.trainer.iter + 1
# Use Cyclic learning rate scheduler
if next_iter > self.swa_start and self.lr_sched:
self.scheduler.step()
is_final = next_iter == self.trainer.max_iter
if is_final:
self.trainer.optimizer.swap_swa_param()
# encoding: utf-8
"""
@author: xingyu liao
@contact: sherlockliao01@gmail.com
"""
# based on:
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py
import logging
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from fastreid.utils import comm
__all__ = ["launch"]
def _find_free_port():
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Binding to port 0 will cause the OS to find an available port for us
sock.bind(("", 0))
port = sock.getsockname()[1]
sock.close()
# NOTE: there is still a chance the port could be taken by other processes.
return port
def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()):
"""
Launch multi-gpu or distributed training.
This function must be called on all machines involved in the training.
It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
Args:
main_func: a function that will be called by `main_func(*args)`
num_gpus_per_machine (int): number of GPUs per machine
num_machines (int): the total number of machines
machine_rank (int): the rank of this machine
dist_url (str): url to connect to for distributed jobs, including protocol
e.g. "tcp://127.0.0.1:8686".
Can be set to "auto" to automatically select a free port on localhost
args (tuple): arguments passed to main_func
"""
world_size = num_machines * num_gpus_per_machine
if world_size > 1:
# https://github.com/pytorch/pytorch/pull/14391
# TODO prctl in spawned processes
if dist_url == "auto":
assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
port = _find_free_port()
dist_url = f"tcp://127.0.0.1:{port}"
if num_machines > 1 and dist_url.startswith("file://"):
logger = logging.getLogger(__name__)
logger.warning(
"file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
)
mp.spawn(
_distributed_worker,
nprocs=num_gpus_per_machine,
args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
daemon=False,
)
else:
main_func(*args)
def _distributed_worker(
local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
global_rank = machine_rank * num_gpus_per_machine + local_rank
try:
dist.init_process_group(
backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
)
except Exception as e:
logger = logging.getLogger(__name__)
logger.error("Process group URL: {}".format(dist_url))
raise e
# synchronize is needed here to prevent a possible timeout after calling init_process_group
# See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
comm.synchronize()
assert num_gpus_per_machine <= torch.cuda.device_count()
torch.cuda.set_device(local_rank)
# Setup the local process group (which contains ranks within the same machine)
assert comm._LOCAL_PROCESS_GROUP is None
num_machines = world_size // num_gpus_per_machine
for i in range(num_machines):
ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
pg = dist.new_group(ranks_on_i)
if i == machine_rank:
comm._LOCAL_PROCESS_GROUP = pg
main_func(*args)
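# Usage sketch (illustrative): launching 4-GPU single-machine training; with
# dist_url="auto" a free local port is picked by _find_free_port above:
#
#   def main(args):
#       ...  # build cfg, create a trainer, train
#
#   launch(main, num_gpus_per_machine=4, num_machines=1, machine_rank=0,
#          dist_url="auto", args=(args,))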
# encoding: utf-8
"""
credit:
https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/train_loop.py
"""
import logging
import time
import weakref
from typing import Dict
import numpy as np
import torch
from torch.nn.parallel import DataParallel, DistributedDataParallel
import fastreid.utils.comm as comm
from fastreid.utils.events import EventStorage, get_event_storage
from fastreid.utils.params import ContiguousParams
__all__ = ["HookBase", "TrainerBase", "SimpleTrainer"]
logger = logging.getLogger(__name__)
class HookBase:
"""
Base class for hooks that can be registered with :class:`TrainerBase`.
Each hook can implement 6 methods. The way they are called is demonstrated
in the following snippet:
.. code-block:: python
hook.before_train()
for _ in range(start_epoch, max_epoch):
hook.before_epoch()
for iter in range(start_iter, max_iter):
hook.before_step()
trainer.run_step()
hook.after_step()
hook.after_epoch()
hook.after_train()
Notes:
1. In the hook method, users can access `self.trainer` to access more
properties about the context (e.g., current iteration).
2. A hook that does something in :meth:`before_step` can often be
implemented equivalently in :meth:`after_step`.
If the hook takes non-trivial time, it is strongly recommended to
implement the hook in :meth:`after_step` instead of :meth:`before_step`.
The convention is that :meth:`before_step` should only take negligible time.
Following this convention will allow hooks that do care about the difference
between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
function properly.
Attributes:
trainer: A weak reference to the trainer object. Set by the trainer when the hook is
registered.
"""
def before_train(self):
"""
Called before the first iteration.
"""
pass
def after_train(self):
"""
Called after the last iteration.
"""
pass
def before_epoch(self):
"""
Called before each epoch.
"""
pass
def after_epoch(self):
"""
Called after each epoch.
"""
pass
def before_step(self):
"""
Called before each iteration.
"""
pass
def after_step(self):
"""
Called after each iteration.
"""
pass
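# Minimal custom hook (illustrative): subclass HookBase and override only the
# methods you need; `self.trainer` is the weak reference set by register_hooks:
#
#   class LossPrinter(HookBase):
#       def after_epoch(self):
#           latest = self.trainer.storage.latest()
#           if "total_loss" in latest:
#               print(f"epoch {self.trainer.epoch}: loss {latest['total_loss'][0]:.4f}")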
class TrainerBase:
"""
Base class for iterative trainer with hooks.
The only assumption we made here is: the training runs in a loop.
A subclass can implement what the loop is.
We made no assumptions about the existence of dataloader, optimizer, model, etc.
Attributes:
iter(int): the current iteration.
epoch(int): the current epoch.
start_iter(int): The iteration to start with.
By convention the minimum possible value is 0.
max_epoch (int): The epoch to end training.
storage(EventStorage): An EventStorage that's opened during the course of training.
"""
def __init__(self):
self._hooks = []
def register_hooks(self, hooks):
"""
Register hooks to the trainer. The hooks are executed in the order
they are registered.
Args:
hooks (list[Optional[HookBase]]): list of hooks
"""
hooks = [h for h in hooks if h is not None]
for h in hooks:
assert isinstance(h, HookBase)
# To avoid circular reference, hooks and trainer cannot own each other.
# This normally does not matter, but will cause memory leak if the
# involved objects contain __del__:
# See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
h.trainer = weakref.proxy(self)
self._hooks.extend(hooks)
def train(self, start_epoch: int, max_epoch: int, iters_per_epoch: int):
"""
Args:
start_epoch, max_epoch (int): See docs above
"""
logger = logging.getLogger(__name__)
logger.info("Starting training from epoch {}".format(start_epoch))
self.iter = self.start_iter = start_epoch * iters_per_epoch
with EventStorage(self.start_iter) as self.storage:
try:
self.before_train()
for self.epoch in range(start_epoch, max_epoch):
self.before_epoch()
for _ in range(iters_per_epoch):
self.before_step()
self.run_step()
self.after_step()
self.iter += 1
self.after_epoch()
except Exception:
logger.exception("Exception during training:")
raise
finally:
self.after_train()
def before_train(self):
for h in self._hooks:
h.before_train()
def after_train(self):
self.storage.iter = self.iter
for h in self._hooks:
h.after_train()
def before_epoch(self):
self.storage.epoch = self.epoch
for h in self._hooks:
h.before_epoch()
def before_step(self):
self.storage.iter = self.iter
for h in self._hooks:
h.before_step()
def after_step(self):
for h in self._hooks:
h.after_step()
def after_epoch(self):
for h in self._hooks:
h.after_epoch()
def run_step(self):
raise NotImplementedError
class SimpleTrainer(TrainerBase):
"""
A simple trainer for the most common type of task:
single-cost single-optimizer single-data-source iterative optimization.
It assumes that every step, you:
1. Compute the loss with data from the data_loader.
2. Compute the gradients with the above loss.
3. Update the model with the optimizer.
If you want to do anything fancier than this,
either subclass TrainerBase and implement your own `run_step`,
or write your own training loop.
"""
def __init__(self, model, data_loader, optimizer, param_wrapper):
"""
Args:
model: a torch Module. Takes data from the data_loader and returns a
dict of losses.
data_loader: an iterable. Contains data to be used to call model.
optimizer: a torch optimizer.
"""
super().__init__()
"""
We set the model to training mode in the trainer.
However it's valid to train a model that's in eval mode.
If you want your model (or a submodule of it) to behave
like evaluation during training, you can overwrite its train() method.
"""
model.train()
self.model = model
self.data_loader = data_loader
self._data_loader_iter = iter(data_loader)
self.optimizer = optimizer
self.param_wrapper = param_wrapper
def run_step(self):
"""
Implement the standard training logic described above.
"""
assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
start = time.perf_counter()
"""
If you want to do something with the data, you can wrap the dataloader.
"""
data = next(self._data_loader_iter)
data_time = time.perf_counter() - start
"""
If you want to do something with the losses, you can wrap the model.
"""
loss_dict = self.model(data)
losses = sum(loss_dict.values())
"""
If you need to accumulate gradients or something similar, you can
wrap the optimizer with your custom `zero_grad()` method.
"""
self.optimizer.zero_grad()
losses.backward()
self._write_metrics(loss_dict, data_time)
"""
If you need gradient clipping/scaling or other processing, you can
wrap the optimizer with your custom `step()` method.
"""
self.optimizer.step()
if isinstance(self.param_wrapper, ContiguousParams):
self.param_wrapper.assert_buffer_is_valid()
def _write_metrics(self, loss_dict: Dict[str, torch.Tensor], data_time: float):
"""
Args:
loss_dict (dict): dict of scalar losses
data_time (float): time taken by the dataloader iteration
"""
device = next(iter(loss_dict.values())).device
# Use a new stream so these ops don't wait for DDP or backward
with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None):
metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
metrics_dict["data_time"] = data_time
# Gather metrics among all workers for logging
# This assumes we do DDP-style training, which is currently the only
# supported method in fastreid.
all_metrics_dict = comm.gather(metrics_dict)
if comm.is_main_process():
storage = get_event_storage()
# data_time among workers can have high variance. The actual latency
# caused by data_time is the maximum among workers.
data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
storage.put_scalar("data_time", data_time)
# average the rest metrics
metrics_dict = {
k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
}
total_losses_reduced = sum(metrics_dict.values())
if not np.isfinite(total_losses_reduced):
raise FloatingPointError(
f"Loss became infinite or NaN at iteration={self.iter}!\n"
f"loss_dict = {metrics_dict}"
)
storage.put_scalar("total_loss", total_losses_reduced)
if len(metrics_dict) > 1:
storage.put_scalars(**metrics_dict)
class AMPTrainer(SimpleTrainer):
"""
Like :class:`SimpleTrainer`, but uses automatic mixed precision
in the training loop.
"""
def __init__(self, model, data_loader, optimizer, param_wrapper, grad_scaler=None):
"""
Args:
model, data_loader, optimizer: same as in :class:`SimpleTrainer`.
grad_scaler: torch GradScaler to automatically scale gradients.
"""
unsupported = "AMPTrainer does not support single-process multi-device training!"
if isinstance(model, DistributedDataParallel):
assert not (model.device_ids and len(model.device_ids) > 1), unsupported
assert not isinstance(model, DataParallel), unsupported
super().__init__(model, data_loader, optimizer, param_wrapper)
if grad_scaler is None:
from torch.cuda.amp import GradScaler
grad_scaler = GradScaler()
self.grad_scaler = grad_scaler
def run_step(self):
"""
Implement the AMP training logic.
"""
assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!"
from torch.cuda.amp import autocast
start = time.perf_counter()
data = next(self._data_loader_iter)
data_time = time.perf_counter() - start
with autocast():
loss_dict = self.model(data)
losses = sum(loss_dict.values())
self.optimizer.zero_grad()
self.grad_scaler.scale(losses).backward()
self._write_metrics(loss_dict, data_time)
self.grad_scaler.step(self.optimizer)
self.grad_scaler.update()
if isinstance(self.param_wrapper, ContiguousParams):
self.param_wrapper.assert_buffer_is_valid()
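# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of building an AMPTrainer with a custom GradScaler;
# `model`, `data_loader`, `optimizer` and `param_wrapper` are assumed to be
# constructed elsewhere, and the scaler values shown are hypothetical.
def build_amp_trainer(model, data_loader, optimizer, param_wrapper):
    from torch.cuda.amp import GradScaler
    # A smaller initial scale and a longer growth interval can help when the
    # first iterations overflow frequently; otherwise the defaults are fine.
    scaler = GradScaler(init_scale=2.0 ** 14, growth_interval=4000)
    return AMPTrainer(model, data_loader, optimizer, param_wrapper,
                      grad_scaler=scaler)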
from .evaluator import DatasetEvaluator, inference_context, inference_on_dataset
from .reid_evaluation import ReidEvaluator
from .clas_evaluator import ClasEvaluator
from .testing import print_csv_format, verify_results
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# encoding: utf-8
"""
@author: xingyu liao
@contact: sherlockliao01@gmail.com
"""
import copy
import itertools
import logging
from collections import OrderedDict
import torch
from fastreid.utils import comm
from .evaluator import DatasetEvaluator
logger = logging.getLogger(__name__)
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
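# --- Illustrative usage (not part of the original file) ---
# A quick sanity check of accuracy() on random logits; the shapes and values
# below are made up for illustration.
def _accuracy_example():
    logits = torch.randn(8, 10)           # batch of 8 samples, 10 classes
    targets = torch.randint(0, 10, (8,))  # ground-truth class indices
    top1, top5 = accuracy(logits, targets, topk=(1, 5))
    # each result is a 1-element tensor holding a percentage in [0, 100]
    print(top1.item(), top5.item())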
class ClasEvaluator(DatasetEvaluator):
def __init__(self, cfg, output_dir=None):
self.cfg = cfg
self._output_dir = output_dir
self._cpu_device = torch.device('cpu')
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
pred_logits = outputs.to(self._cpu_device, torch.float32)
labels = inputs["targets"].to(self._cpu_device)
# measure accuracy
acc1, = accuracy(pred_logits, labels, topk=(1,))
num_correct_acc1 = acc1 * labels.size(0) / 100
self._predictions.append({"num_correct": num_correct_acc1, "num_samples": labels.size(0)})
def evaluate(self):
if comm.get_world_size() > 1:
comm.synchronize()
predictions = comm.gather(self._predictions, dst=0)
predictions = list(itertools.chain(*predictions))
if not comm.is_main_process(): return {}
else:
predictions = self._predictions
total_correct_num = 0
total_samples = 0
for prediction in predictions:
total_correct_num += prediction["num_correct"]
total_samples += prediction["num_samples"]
acc1 = total_correct_num / total_samples * 100
self._results = OrderedDict()
self._results["Acc@1"] = acc1
self._results["metric"] = acc1
return copy.deepcopy(self._results)
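# --- Illustrative sketch (not part of the original file) ---
# A minimal sketch of driving ClasEvaluator by hand in a single process;
# `cfg` is assumed to be a fastreid config node, and the logits/labels
# below are made up.
def _clas_evaluator_example(cfg):
    evaluator = ClasEvaluator(cfg)
    evaluator.reset()
    logits = torch.randn(8, 10)
    inputs = {"targets": torch.randint(0, 10, (8,))}
    evaluator.process(inputs, logits)
    return evaluator.evaluate()  # OrderedDict with 'Acc@1' and 'metric'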
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import datetime
import logging
import time
from contextlib import contextmanager
import torch
from fastreid.utils import comm
from fastreid.utils.logger import log_every_n_seconds
class DatasetEvaluator:
"""
Base class for a dataset evaluator.
The function :func:`inference_on_dataset` runs the model over
all samples in the dataset, and uses a DatasetEvaluator to process the inputs/outputs.
This class will accumulate information about the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def reset(self):
"""
Preparation for a new round of evaluation.
Should be called before starting a round of evaluation.
"""
pass
def preprocess_inputs(self, inputs):
pass
def process(self, inputs, outputs):
"""
Process an input/output pair.
Args:
inputs: the inputs used to call the model.
outputs: the return value of `model(input)`
"""
pass
def evaluate(self):
"""
Evaluate/summarize the performance, after processing all input/output pairs.
Returns:
dict:
A new evaluator class can return a dict of arbitrary format
as long as the user can process the results.
In our train_net.py, we expect the following format:
* key: the name of the task (e.g., bbox)
* value: a dict of {metric name: score}, e.g.: {"AP50": 80}
"""
pass
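# --- Illustrative sketch (not part of the original file) ---
# A minimal evaluator following the contract above: accumulate per-batch
# statistics in process(), summarize them in evaluate(). The metric here
# (mean feature norm) is a hypothetical placeholder, and `outputs` is
# assumed to be an (N, D) feature tensor.
class _MeanNormEvaluator(DatasetEvaluator):
    def reset(self):
        self._norms = []

    def process(self, inputs, outputs):
        self._norms.extend(outputs.norm(dim=1).tolist())

    def evaluate(self):
        if not self._norms:
            return {}
        return {"mean_feature_norm": sum(self._norms) / len(self._norms)}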
# class DatasetEvaluators(DatasetEvaluator):
# def __init__(self, evaluators):
# assert len(evaluators)
# super().__init__()
# self._evaluators = evaluators
#
# def reset(self):
# for evaluator in self._evaluators:
# evaluator.reset()
#
# def process(self, input, output):
# for evaluator in self._evaluators:
# evaluator.process(input, output)
#
# def evaluate(self):
# results = OrderedDict()
# for evaluator in self._evaluators:
# result = evaluator.evaluate()
# if is_main_process() and result is not None:
# for k, v in result.items():
# assert (
# k not in results
# ), "Different evaluators produce results with the same key {}".format(k)
# results[k] = v
# return results
def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
"""
Run model on the data_loader and evaluate the metrics with evaluator.
The model will be used in eval mode.
Args:
model (nn.Module): a module which accepts an object from
`data_loader` and returns some outputs. It will be temporarily set to `eval` mode.
If you wish to evaluate a model in `training` mode instead, you can
wrap the given model and override its behavior of `.eval()` and `.train()`.
data_loader: an iterable object with a length.
The elements it generates will be the inputs to the model.
evaluator (DatasetEvaluator): the evaluator to run. Use
:class:`DatasetEvaluators([])` if you only want to benchmark, but
don't want to do any evaluation.
flip_test (bool): whether to additionally run the model on horizontally
flipped images and average the two outputs.
Returns:
The return value of `evaluator.evaluate()`
"""
num_devices = comm.get_world_size()
logger = logging.getLogger(__name__)
logger.info("Start inference on {} images".format(len(data_loader.dataset)))
total = len(data_loader) # inference data loader must have a fixed length
evaluator.reset()
num_warmup = min(5, total - 1)
start_time = time.perf_counter()
total_compute_time = 0
with inference_context(model), torch.no_grad():
for idx, inputs in enumerate(data_loader):
if idx == num_warmup:
start_time = time.perf_counter()
total_compute_time = 0
start_compute_time = time.perf_counter()
outputs = model(inputs)
# Flip test
if flip_test:
inputs["images"] = inputs["images"].flip(dims=[3])
flip_outputs = model(inputs)
outputs = (outputs + flip_outputs) / 2
if torch.cuda.is_available():
torch.cuda.synchronize()
total_compute_time += time.perf_counter() - start_compute_time
evaluator.process(inputs, outputs)
iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
seconds_per_batch = total_compute_time / iters_after_start
if idx >= num_warmup * 2 or seconds_per_batch > 30:
total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
log_every_n_seconds(
logging.INFO,
"Inference done {}/{}. {:.4f} s / batch. ETA={}".format(
idx + 1, total, seconds_per_batch, str(eta)
),
n=30,
)
# Measure the time only for this worker (before the synchronization barrier)
total_time = time.perf_counter() - start_time
total_time_str = str(datetime.timedelta(seconds=total_time))
# NOTE this format is parsed by grep
logger.info(
"Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
total_time_str, total_time / (total - num_warmup), num_devices
)
)
total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
logger.info(
"Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
)
)
results = evaluator.evaluate()
# An evaluator may return None when not in main process.
# Replace it by an empty dict instead to make it easier for downstream code to handle
if results is None:
results = {}
return results
@contextmanager
def inference_context(model):
"""
A context where the model is temporarily changed to eval mode,
and restored to previous mode afterwards.
Args:
model: a torch Module
"""
training_mode = model.training
model.eval()
yield
model.train(training_mode)
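# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of inference_context(): the model runs in eval mode
# inside the block and is restored to its previous mode on exit.
# `model` is any torch Module and `batch` any input it accepts.
def _inference_context_example(model, batch):
    with inference_context(model), torch.no_grad():
        outputs = model(batch)  # model.training is False here
    # here model.training is back to whatever it was before the block
    return outputs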
# encoding: utf-8
"""
@author: xingyu liao
@contact: sherlockliao01@gmail.com
"""
# based on
# https://github.com/PyRetri/PyRetri/blob/master/pyretri/index/re_ranker/re_ranker_impl/query_expansion.py
import numpy as np
import torch
import torch.nn.functional as F
def aqe(query_feat: torch.Tensor, gallery_feat: torch.Tensor,
        qe_times: int = 1, qe_k: int = 10, alpha: float = 3.0):
"""
Combine the retrieved top-k nearest neighbors with the original query and do another retrieval.
cf. https://www.robots.ox.ac.uk/~vgg/publications/papers/chum07b.pdf
Args:
query_feat (torch.Tensor): query features, shape (num_query, dim).
gallery_feat (torch.Tensor): gallery features, shape (num_gallery, dim).
qe_times (int): number of query expansion times.
qe_k (int): number of the neighbors to be combined.
alpha (float): exponent applied to the similarity weights.
"""
num_query = query_feat.shape[0]
all_feat = torch.cat((query_feat, gallery_feat), dim=0)
norm_feat = F.normalize(all_feat, p=2, dim=1)
all_feat = all_feat.numpy()
for i in range(qe_times):
all_feat_list = []
sims = torch.mm(norm_feat, norm_feat.t())
sims = sims.data.cpu().numpy()
for sim in sims:
init_rank = np.argpartition(-sim, range(1, qe_k + 1))
weights = sim[init_rank[:qe_k]].reshape((-1, 1))
weights = np.power(weights, alpha)
all_feat_list.append(np.mean(all_feat[init_rank[:qe_k], :] * weights, axis=0))
all_feat = np.stack(all_feat_list, axis=0)
norm_feat = F.normalize(torch.from_numpy(all_feat), p=2, dim=1)
query_feat = torch.from_numpy(all_feat[:num_query])
gallery_feat = torch.from_numpy(all_feat[num_query:])
return query_feat, gallery_feat
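# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of aqe() on random CPU tensors; the sizes are made up.
# Inputs must live on the CPU because aqe() converts them to numpy.
def _aqe_example():
    query = torch.randn(16, 128)
    gallery = torch.randn(100, 128)
    new_query, new_gallery = aqe(query, gallery, qe_times=1, qe_k=10, alpha=3.0)
    # shapes are preserved: (16, 128) and (100, 128)
    return new_query, new_gallery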
# credits: https://github.com/KaiyangZhou/deep-person-reid/blob/master/torchreid/metrics/rank.py
import warnings
from collections import defaultdict
import numpy as np
try:
from .rank_cylib.rank_cy import evaluate_cy
IS_CYTHON_AVAI = True
except ImportError:
IS_CYTHON_AVAI = False
warnings.warn(
'Cython rank evaluation (very fast, so highly recommended) is '
'unavailable; falling back to Python evaluation.'
)
def eval_cuhk03(distmat, q_pids, g_pids, q_camids, g_camids, max_rank):
"""Evaluation with cuhk03 metric
Key: one image for each gallery identity is randomly sampled for each query identity.
Random sampling is performed num_repeats times.
"""
num_repeats = 10
num_q, num_g = distmat.shape
indices = np.argsort(distmat, axis=1)
if num_g < max_rank:
max_rank = num_g
print('Note: number of gallery samples is quite small, got {}'.format(num_g))
matches = (g_pids[indices] == q_pids[:, np.newaxis]).astype(np.int32)
# compute cmc curve for each query
all_cmc = []
all_AP = []
num_valid_q = 0. # number of valid query
for q_idx in range(num_q):
# get query pid and camid
q_pid = q_pids[q_idx]
q_camid = q_camids[q_idx]
# remove gallery samples that have the same pid and camid with query
order = indices[q_idx]
remove = (g_pids[order] == q_pid) & (g_camids[order] == q_camid)
keep = np.invert(remove)
# compute cmc curve
raw_cmc = matches[q_idx][keep]  # binary vector, positions with value 1 are correct matches
if not np.any(raw_cmc):
# this condition is true when query identity does not appear in gallery
continue
kept_g_pids = g_pids[order][keep]
g_pids_dict = defaultdict(list)
for idx, pid in enumerate(kept_g_pids):
g_pids_dict[pid].append(idx)
cmc = 0.
for repeat_idx in range(num_repeats):
mask = np.zeros(len(raw_cmc), dtype=bool)  # np.bool was removed in modern NumPy
for _, idxs in g_pids_dict.items():
# randomly sample one image for each gallery person
rnd_idx = np.random.choice(idxs)
mask[rnd_idx] = True
masked_raw_cmc = raw_cmc[mask]
_cmc = masked_raw_cmc.cumsum()
_cmc[_cmc > 1] = 1
cmc += _cmc[:max_rank].astype(np.float32)
cmc /= num_repeats
all_cmc.append(cmc)
# compute AP
num_rel = raw_cmc.sum()
tmp_cmc = raw_cmc.cumsum()
tmp_cmc = [x / (i + 1.) for i, x in enumerate(tmp_cmc)]
tmp_cmc = np.asarray(tmp_cmc) * raw_cmc
AP = tmp_cmc.sum() / num_rel
all_AP.append(AP)
num_valid_q += 1.
assert num_valid_q > 0, 'Error: all query identities do not appear in gallery'
all_cmc = np.asarray(all_cmc).astype(np.float32)
all_cmc = all_cmc.sum(0) / num_valid_q
mAP = np.mean(all_AP)
return all_cmc, mAP
def eval_market1501(distmat, q_pids, g_pids, q_camids, g_camids, max_rank):
"""Evaluation with market1501 metric
Key: for each query identity, its gallery images from the same camera view are discarded.
"""
num_q, num_g = distmat.shape
if num_g < max_rank:
max_rank = num_g
print('Note: number of gallery samples is quite small, got {}'.format(num_g))
indices = np.argsort(distmat, axis=1)
# compute cmc curve for each query
all_cmc = []
all_AP = []
all_INP = []
num_valid_q = 0. # number of valid query
for q_idx in range(num_q):
# get query pid and camid
q_pid = q_pids[q_idx]
q_camid = q_camids[q_idx]
# remove gallery samples that have the same pid and camid with query
order = indices[q_idx]
remove = (g_pids[order] == q_pid) & (g_camids[order] == q_camid)
keep = np.invert(remove)
# compute cmc curve
matches = (g_pids[order] == q_pid).astype(np.int32)
raw_cmc = matches[keep] # binary vector, positions with value 1 are correct matches
if not np.any(raw_cmc):
# this condition is true when query identity does not appear in gallery
continue
cmc = raw_cmc.cumsum()
pos_idx = np.where(raw_cmc == 1)
max_pos_idx = np.max(pos_idx)
inp = cmc[max_pos_idx] / (max_pos_idx + 1.0)
all_INP.append(inp)
cmc[cmc > 1] = 1
all_cmc.append(cmc[:max_rank])
num_valid_q += 1.
# compute average precision
# reference: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision
num_rel = raw_cmc.sum()
tmp_cmc = raw_cmc.cumsum()
tmp_cmc = [x / (i + 1.) for i, x in enumerate(tmp_cmc)]
tmp_cmc = np.asarray(tmp_cmc) * raw_cmc
AP = tmp_cmc.sum() / num_rel
all_AP.append(AP)
assert num_valid_q > 0, 'Error: all query identities do not appear in gallery'
all_cmc = np.asarray(all_cmc).astype(np.float32)
all_cmc = all_cmc.sum(0) / num_valid_q
return all_cmc, all_AP, all_INP
def evaluate_py(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_metric_cuhk03):
if use_metric_cuhk03:
return eval_cuhk03(distmat, q_pids, g_pids, q_camids, g_camids, max_rank)
else:
return eval_market1501(distmat, q_pids, g_pids, q_camids, g_camids, max_rank)
def evaluate_rank(
distmat,
q_pids,
g_pids,
q_camids,
g_camids,
max_rank=50,
use_metric_cuhk03=False,
use_cython=True,
):
"""Evaluates CMC rank.
Args:
distmat (numpy.ndarray): distance matrix of shape (num_query, num_gallery).
q_pids (numpy.ndarray): 1-D array containing person identities
of each query instance.
g_pids (numpy.ndarray): 1-D array containing person identities
of each gallery instance.
q_camids (numpy.ndarray): 1-D array containing camera views under
which each query instance is captured.
g_camids (numpy.ndarray): 1-D array containing camera views under
which each gallery instance is captured.
max_rank (int, optional): maximum CMC rank to be computed. Default is 50.
use_metric_cuhk03 (bool, optional): use single-gallery-shot setting for cuhk03.
Default is False. This should be enabled when using cuhk03 classic split.
use_cython (bool, optional): use cython code for evaluation. Default is True.
This is highly recommended as the cython code can speed up the cmc computation
by more than 10x. This requires Cython to be installed.
"""
if use_cython and IS_CYTHON_AVAI:
return evaluate_cy(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_metric_cuhk03)
else:
return evaluate_py(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_metric_cuhk03)
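# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of evaluate_rank() on synthetic inputs; in real use the
# distance matrix comes from extracted query/gallery features. With random
# pids the 'all query identities do not appear in gallery' assertion can
# occasionally trigger; just rerun in that case.
def _evaluate_rank_example():
    num_q, num_g = 10, 50
    distmat = np.random.rand(num_q, num_g)
    q_pids = np.random.randint(0, 5, size=num_q)
    g_pids = np.random.randint(0, 5, size=num_g)
    q_camids = np.random.randint(0, 2, size=num_q)
    g_camids = np.random.randint(0, 2, size=num_g)
    cmc, all_AP, all_INP = evaluate_rank(
        distmat, q_pids, g_pids, q_camids, g_camids,
        max_rank=5, use_cython=False)
    print('Rank-1: {:.1%}  mAP: {:.1%}'.format(cmc[0], np.mean(all_AP)))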
all:
python3 setup.py build_ext --inplace
rm -rf build
clean:
rm -rf build
rm -f rank_cy.c *.so
# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
def compile_helper():
"""Compile helper function at runtime. Make sure this
is invoked on a single process."""
import os
import subprocess
path = os.path.abspath(os.path.dirname(__file__))
ret = subprocess.run(["make", "-C", path])
if ret.returncode != 0:
print("Making cython reid evaluation module failed, exiting.")
import sys
sys.exit(1)
# cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True
# credits: https://github.com/KaiyangZhou/deep-person-reid/blob/master/torchreid/metrics/rank_cylib/rank_cy.pyx
import cython
import numpy as np
cimport numpy as np
from collections import defaultdict
"""
Compiler directives:
https://github.com/cython/cython/wiki/enhancements-compilerdirectives
Cython tutorial:
https://cython.readthedocs.io/en/latest/src/userguide/numpy_tutorial.html
Credit to https://github.com/luzai
"""
# Main interface
cpdef evaluate_cy(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_metric_cuhk03=False):
distmat = np.asarray(distmat, dtype=np.float32)
q_pids = np.asarray(q_pids, dtype=np.int64)
g_pids = np.asarray(g_pids, dtype=np.int64)
q_camids = np.asarray(q_camids, dtype=np.int64)
g_camids = np.asarray(g_camids, dtype=np.int64)
if use_metric_cuhk03:
return eval_cuhk03_cy(distmat, q_pids, g_pids, q_camids, g_camids, max_rank)
return eval_market1501_cy(distmat, q_pids, g_pids, q_camids, g_camids, max_rank)
cpdef eval_cuhk03_cy(float[:,:] distmat, long[:] q_pids, long[:]g_pids,
long[:]q_camids, long[:]g_camids, long max_rank):
cdef long num_q = distmat.shape[0]
cdef long num_g = distmat.shape[1]
if num_g < max_rank:
max_rank = num_g
print('Note: number of gallery samples is quite small, got {}'.format(num_g))
cdef:
long num_repeats = 10
long[:,:] indices = np.argsort(distmat, axis=1)
long[:,:] matches = (np.asarray(g_pids)[np.asarray(indices)] == np.asarray(q_pids)[:, np.newaxis]).astype(np.int64)
float[:,:] all_cmc = np.zeros((num_q, max_rank), dtype=np.float32)
float[:] all_AP = np.zeros(num_q, dtype=np.float32)
float num_valid_q = 0. # number of valid query
long q_idx, q_pid, q_camid, g_idx
long[:] order = np.zeros(num_g, dtype=np.int64)
long keep
float[:] raw_cmc = np.zeros(num_g, dtype=np.float32) # binary vector, positions with value 1 are correct matches
float[:] masked_raw_cmc = np.zeros(num_g, dtype=np.float32)
float[:] cmc, masked_cmc
long num_g_real, num_g_real_masked, rank_idx, rnd_idx
unsigned long meet_condition
float AP
long[:] kept_g_pids, mask
float num_rel
float[:] tmp_cmc = np.zeros(num_g, dtype=np.float32)
float tmp_cmc_sum
for q_idx in range(num_q):
# get query pid and camid
q_pid = q_pids[q_idx]
q_camid = q_camids[q_idx]
# remove gallery samples that have the same pid and camid with query
for g_idx in range(num_g):
order[g_idx] = indices[q_idx, g_idx]
num_g_real = 0
meet_condition = 0
kept_g_pids = np.zeros(num_g, dtype=np.int64)
for g_idx in range(num_g):
if (g_pids[order[g_idx]] != q_pid) or (g_camids[order[g_idx]] != q_camid):
raw_cmc[num_g_real] = matches[q_idx][g_idx]
kept_g_pids[num_g_real] = g_pids[order[g_idx]]
num_g_real += 1
if matches[q_idx][g_idx] > 1e-31:
meet_condition = 1
if not meet_condition:
# this condition is true when query identity does not appear in gallery
continue
# cuhk03-specific setting
g_pids_dict = defaultdict(list) # overhead!
for g_idx in range(num_g_real):
g_pids_dict[kept_g_pids[g_idx]].append(g_idx)
cmc = np.zeros(max_rank, dtype=np.float32)
for _ in range(num_repeats):
mask = np.zeros(num_g_real, dtype=np.int64)
for _, idxs in g_pids_dict.items():
# randomly sample one image for each gallery person
rnd_idx = np.random.choice(idxs)
#rnd_idx = idxs[0] # use deterministic for debugging
mask[rnd_idx] = 1
num_g_real_masked = 0
for g_idx in range(num_g_real):
if mask[g_idx] == 1:
masked_raw_cmc[num_g_real_masked] = raw_cmc[g_idx]
num_g_real_masked += 1
masked_cmc = np.zeros(num_g, dtype=np.float32)
function_cumsum(masked_raw_cmc, masked_cmc, num_g_real_masked)
for g_idx in range(num_g_real_masked):
if masked_cmc[g_idx] > 1:
masked_cmc[g_idx] = 1
for rank_idx in range(max_rank):
cmc[rank_idx] += masked_cmc[rank_idx] / num_repeats
for rank_idx in range(max_rank):
all_cmc[q_idx, rank_idx] = cmc[rank_idx]
# compute average precision
# reference: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision
function_cumsum(raw_cmc, tmp_cmc, num_g_real)
num_rel = 0
tmp_cmc_sum = 0
for g_idx in range(num_g_real):
tmp_cmc_sum += (tmp_cmc[g_idx] / (g_idx + 1.)) * raw_cmc[g_idx]
num_rel += raw_cmc[g_idx]
all_AP[q_idx] = tmp_cmc_sum / num_rel
num_valid_q += 1.
assert num_valid_q > 0, 'Error: all query identities do not appear in gallery'
# compute averaged cmc
cdef float[:] avg_cmc = np.zeros(max_rank, dtype=np.float32)
for rank_idx in range(max_rank):
for q_idx in range(num_q):
avg_cmc[rank_idx] += all_cmc[q_idx, rank_idx]
avg_cmc[rank_idx] /= num_valid_q
cdef float mAP = 0
for q_idx in range(num_q):
mAP += all_AP[q_idx]
mAP /= num_valid_q
return np.asarray(avg_cmc).astype(np.float32), mAP
cpdef eval_market1501_cy(float[:,:] distmat, long[:] q_pids, long[:]g_pids,
long[:]q_camids, long[:]g_camids, long max_rank):
cdef long num_q = distmat.shape[0]
cdef long num_g = distmat.shape[1]
if num_g < max_rank:
max_rank = num_g
print('Note: number of gallery samples is quite small, got {}'.format(num_g))
cdef:
long[:,:] indices = np.argsort(distmat, axis=1)
long[:] matches
float[:,:] all_cmc = np.zeros((num_q, max_rank), dtype=np.float32)
float[:] all_AP = np.zeros(num_q, dtype=np.float32)
float[:] all_INP = np.zeros(num_q, dtype=np.float32)
float num_valid_q = 0. # number of valid query
long valid_index = 0
long q_idx, q_pid, q_camid, g_idx
long[:] order = np.zeros(num_g, dtype=np.int64)
long keep
float[:] raw_cmc = np.zeros(num_g, dtype=np.float32) # binary vector, positions with value 1 are correct matches
float[:] cmc = np.zeros(num_g, dtype=np.float32)
long max_pos_idx = 0
float inp
long num_g_real, rank_idx
unsigned long meet_condition
float num_rel
float[:] tmp_cmc = np.zeros(num_g, dtype=np.float32)
float tmp_cmc_sum
for q_idx in range(num_q):
# get query pid and camid
q_pid = q_pids[q_idx]
q_camid = q_camids[q_idx]
for g_idx in range(num_g):
order[g_idx] = indices[q_idx, g_idx]
num_g_real = 0
meet_condition = 0
matches = (np.asarray(g_pids)[np.asarray(order)] == q_pid).astype(np.int64)
# remove gallery samples that have the same pid and camid with query
for g_idx in range(num_g):
if (g_pids[order[g_idx]] != q_pid) or (g_camids[order[g_idx]] != q_camid):
raw_cmc[num_g_real] = matches[g_idx]
num_g_real += 1
# this condition is true if the query appears in the gallery
if matches[g_idx] > 1e-31:
meet_condition = 1
if not meet_condition:
# this condition is true when query identity does not appear in gallery
continue
# compute cmc
function_cumsum(raw_cmc, cmc, num_g_real)
# compute mean inverse negative penalty
# reference : https://github.com/mangye16/ReID-Survey/blob/master/utils/reid_metric.py
max_pos_idx = 0
for g_idx in range(num_g_real):
if (raw_cmc[g_idx] == 1) and (g_idx > max_pos_idx):
max_pos_idx = g_idx
inp = cmc[max_pos_idx] / (max_pos_idx + 1.0)
all_INP[valid_index] = inp
for g_idx in range(num_g_real):
if cmc[g_idx] > 1:
cmc[g_idx] = 1
for rank_idx in range(max_rank):
all_cmc[q_idx, rank_idx] = cmc[rank_idx]
num_valid_q += 1.
# compute average precision
# reference: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision
function_cumsum(raw_cmc, tmp_cmc, num_g_real)
num_rel = 0
tmp_cmc_sum = 0
for g_idx in range(num_g_real):
tmp_cmc_sum += (tmp_cmc[g_idx] / (g_idx + 1.)) * raw_cmc[g_idx]
num_rel += raw_cmc[g_idx]
all_AP[valid_index] = tmp_cmc_sum / num_rel
valid_index += 1
assert num_valid_q > 0, 'Error: all query identities do not appear in gallery'
# compute averaged cmc
cdef float[:] avg_cmc = np.zeros(max_rank, dtype=np.float32)
for rank_idx in range(max_rank):
for q_idx in range(num_q):
avg_cmc[rank_idx] += all_cmc[q_idx, rank_idx]
avg_cmc[rank_idx] /= num_valid_q
return np.asarray(avg_cmc).astype(np.float32), np.asarray(all_AP[:valid_index]), np.asarray(all_INP[:valid_index])
# Compute the cumulative sum
cdef void function_cumsum(cython.numeric[:] src, cython.numeric[:] dst, long n):
cdef long i
dst[0] = src[0]
for i in range(1, n):
dst[i] = src[i] + dst[i - 1]
# cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True
# credits: https://github.com/KaiyangZhou/deep-person-reid/blob/master/torchreid/metrics/rank_cylib/rank_cy.pyx
import cython
import faiss
import numpy as np
cimport numpy as np
"""
Compiler directives:
https://github.com/cython/cython/wiki/enhancements-compilerdirectives
Cython tutorial:
https://cython.readthedocs.io/en/latest/src/userguide/numpy_tutorial.html
Credit to https://github.com/luzai
"""
# Main interface
cpdef evaluate_roc_cy(float[:,:] distmat, long[:] q_pids, long[:]g_pids,
long[:]q_camids, long[:]g_camids):
distmat = np.asarray(distmat, dtype=np.float32)
q_pids = np.asarray(q_pids, dtype=np.int64)
g_pids = np.asarray(g_pids, dtype=np.int64)
q_camids = np.asarray(q_camids, dtype=np.int64)
g_camids = np.asarray(g_camids, dtype=np.int64)
cdef long num_q = distmat.shape[0]
cdef long num_g = distmat.shape[1]
cdef:
long[:,:] indices = np.argsort(distmat, axis=1)
long[:,:] matches = (np.asarray(g_pids)[np.asarray(indices)] == np.asarray(q_pids)[:, np.newaxis]).astype(np.int64)
float[:] pos = np.zeros(num_q*num_g, dtype=np.float32)
float[:] neg = np.zeros(num_q*num_g, dtype=np.float32)
long valid_pos = 0
long valid_neg = 0
long ind
long q_idx, q_pid, q_camid, g_idx
long[:] order = np.zeros(num_g, dtype=np.int64)
float[:] raw_cmc = np.zeros(num_g, dtype=np.float32) # binary vector, positions with value 1 are correct matches
long[:] sort_idx = np.zeros(num_g, dtype=np.int64)
long idx
for q_idx in range(num_q):
# get query pid and camid
q_pid = q_pids[q_idx]
q_camid = q_camids[q_idx]
for g_idx in range(num_g):
order[g_idx] = indices[q_idx, g_idx]
num_g_real = 0
# remove gallery samples that have the same pid and camid with query
for g_idx in range(num_g):
if (g_pids[order[g_idx]] != q_pid) or (g_camids[order[g_idx]] != q_camid):
raw_cmc[num_g_real] = matches[q_idx][g_idx]
sort_idx[num_g_real] = order[g_idx]
num_g_real += 1
q_dist = distmat[q_idx]
for valid_idx in range(num_g_real):
if raw_cmc[valid_idx] == 1:
pos[valid_pos] = q_dist[sort_idx[valid_idx]]
valid_pos += 1
elif raw_cmc[valid_idx] == 0:
neg[valid_neg] = q_dist[sort_idx[valid_idx]]
valid_neg += 1
cdef float[:] scores = np.hstack((pos[:valid_pos], neg[:valid_neg]))
cdef float[:] labels = np.hstack((np.zeros(valid_pos, dtype=np.float32),
np.ones(valid_neg, dtype=np.float32)))
return np.asarray(scores), np.asarray(labels)
# Compute the cumulative sum
cdef void function_cumsum(cython.numeric[:] src, cython.numeric[:] dst, long n):
cdef long i
dst[0] = src[0]
for i in range(1, n):
dst[i] = src[i] + dst[i - 1]
from distutils.core import setup
from distutils.extension import Extension
import numpy as np
from Cython.Build import cythonize
def numpy_include():
try:
numpy_include = np.get_include()
except AttributeError:
numpy_include = np.get_numpy_include()
return numpy_include
ext_modules = [
Extension(
'rank_cy',
['rank_cy.pyx'],
include_dirs=[numpy_include()],
),
Extension(
'roc_cy',
['roc_cy.pyx'],
include_dirs=[numpy_include()],
)
]
setup(
name='Cython-based reid evaluation code',
ext_modules=cythonize(ext_modules)
)
import sys
import timeit
import numpy as np
import os.path as osp
sys.path.insert(0, osp.dirname(osp.abspath(__file__)) + '/../../..')
from fastreid.evaluation.rank import evaluate_rank
from fastreid.evaluation.roc import evaluate_roc
"""
Test the speed of the Cython-based evaluation code. The speed improvement
can be much bigger on real reid data, which contains far more query and
gallery images.
Note: you might encounter the following error:
'AssertionError: Error: all query identities do not appear in gallery'.
This is normal because the inputs are random numbers. Just try again.
"""
print('*** Compare running time ***')
setup = '''
import sys
import os.path as osp
import numpy as np
sys.path.insert(0, osp.dirname(osp.abspath(__file__)) + '/../../..')
from fastreid.evaluation.rank import evaluate_rank
from fastreid.evaluation.roc import evaluate_roc
num_q = 30
num_g = 300
dim = 512
max_rank = 5
q_feats = np.random.rand(num_q, dim).astype(np.float32) * 20
q_feats = q_feats / np.linalg.norm(q_feats, ord=2, axis=1, keepdims=True)
g_feats = np.random.rand(num_g, dim).astype(np.float32) * 20
g_feats = g_feats / np.linalg.norm(g_feats, ord=2, axis=1, keepdims=True)
distmat = 1 - np.dot(q_feats, g_feats.transpose())
q_pids = np.random.randint(0, num_q, size=num_q)
g_pids = np.random.randint(0, num_g, size=num_g)
q_camids = np.random.randint(0, 5, size=num_q)
g_camids = np.random.randint(0, 5, size=num_g)
'''
print('=> Using CMC metric')
pytime = timeit.timeit(
'evaluate_rank(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_cython=False)',
setup=setup,
number=20
)
cytime = timeit.timeit(
'evaluate_rank(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_cython=True)',
setup=setup,
number=20
)
print('Python time: {} s'.format(pytime))
print('Cython time: {} s'.format(cytime))
print('CMC Cython is {} times faster than python\n'.format(pytime / cytime))
print('=> Using ROC metric')
pytime = timeit.timeit(
'evaluate_roc(distmat, q_pids, g_pids, q_camids, g_camids, use_cython=False)',
setup=setup,
number=20
)
cytime = timeit.timeit(
'evaluate_roc(distmat, q_pids, g_pids, q_camids, g_camids, use_cython=True)',
setup=setup,
number=20
)
print('Python time: {} s'.format(pytime))
print('Cython time: {} s'.format(cytime))
print('ROC Cython is {} times faster than python\n'.format(pytime / cytime))
print("=> Check precision")
num_q = 30
num_g = 300
dim = 512
max_rank = 5
q_feats = np.random.rand(num_q, dim).astype(np.float32) * 20
q_feats = q_feats / np.linalg.norm(q_feats, ord=2, axis=1, keepdims=True)
g_feats = np.random.rand(num_g, dim).astype(np.float32) * 20
g_feats = g_feats / np.linalg.norm(g_feats, ord=2, axis=1, keepdims=True)
distmat = 1 - np.dot(q_feats, g_feats.transpose())
q_pids = np.random.randint(0, num_q, size=num_q)
g_pids = np.random.randint(0, num_g, size=num_g)
q_camids = np.random.randint(0, 5, size=num_q)
g_camids = np.random.randint(0, 5, size=num_g)
cmc_py, mAP_py, mINP_py = evaluate_rank(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_cython=False)
cmc_cy, mAP_cy, mINP_cy = evaluate_rank(distmat, q_pids, g_pids, q_camids, g_camids, max_rank, use_cython=True)
np.testing.assert_allclose(cmc_py, cmc_cy, rtol=1e-3, atol=1e-6)
np.testing.assert_allclose(mAP_py, mAP_cy, rtol=1e-3, atol=1e-6)
np.testing.assert_allclose(mINP_py, mINP_cy, rtol=1e-3, atol=1e-6)
print('Rank results between python and cython are the same!')
scores_cy, labels_cy = evaluate_roc(distmat, q_pids, g_pids, q_camids, g_camids, use_cython=True)
scores_py, labels_py = evaluate_roc(distmat, q_pids, g_pids, q_camids, g_camids, use_cython=False)
np.testing.assert_allclose(scores_cy, scores_py, rtol=1e-3, atol=1e-6)
np.testing.assert_allclose(labels_cy, labels_py, rtol=1e-3, atol=1e-6)
print('ROC results between python and cython are the same!\n')
print("=> Check exact values")
print("mAP = {} \ncmc = {}\nmINP = {}\nScores = {}".format(np.array(mAP_cy), cmc_cy, np.array(mINP_cy), scores_cy))
# encoding: utf-8
"""
@author: liaoxingyu
@contact: sherlockliao01@gmail.com
"""
import copy
import logging
import time
import itertools
from collections import OrderedDict
import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics
from fastreid.utils import comm
from fastreid.utils.compute_dist import build_dist
from .evaluator import DatasetEvaluator
from .query_expansion import aqe
from .rank_cylib import compile_helper
logger = logging.getLogger(__name__)
class ReidEvaluator(DatasetEvaluator):
def __init__(self, cfg, num_query, output_dir=None):
self.cfg = cfg
self._num_query = num_query
self._output_dir = output_dir
self._cpu_device = torch.device('cpu')
self._predictions = []
self._compile_dependencies()
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
prediction = {
'feats': outputs.to(self._cpu_device, torch.float32),
'pids': inputs['targets'].to(self._cpu_device),
'camids': inputs['camids'].to(self._cpu_device)
}
self._predictions.append(prediction)
def evaluate(self):
if comm.get_world_size() > 1:
comm.synchronize()
predictions = comm.gather(self._predictions, dst=0)
predictions = list(itertools.chain(*predictions))
if not comm.is_main_process():
return {}
else:
predictions = self._predictions
features = []
pids = []
camids = []
for prediction in predictions:
features.append(prediction['feats'])
pids.append(prediction['pids'])
camids.append(prediction['camids'])
features = torch.cat(features, dim=0)
pids = torch.cat(pids, dim=0).numpy()
camids = torch.cat(camids, dim=0).numpy()
# query feature, person ids and camera ids
query_features = features[:self._num_query]
query_pids = pids[:self._num_query]
query_camids = camids[:self._num_query]
# gallery features, person ids and camera ids
gallery_features = features[self._num_query:]
gallery_pids = pids[self._num_query:]
gallery_camids = camids[self._num_query:]
self._results = OrderedDict()
if self.cfg.TEST.AQE.ENABLED:
logger.info("Test with AQE setting")
qe_time = self.cfg.TEST.AQE.QE_TIME
qe_k = self.cfg.TEST.AQE.QE_K
alpha = self.cfg.TEST.AQE.ALPHA
query_features, gallery_features = aqe(query_features, gallery_features, qe_time, qe_k, alpha)
dist = build_dist(query_features, gallery_features, self.cfg.TEST.METRIC)
if self.cfg.TEST.RERANK.ENABLED:
logger.info("Test with rerank setting")
k1 = self.cfg.TEST.RERANK.K1
k2 = self.cfg.TEST.RERANK.K2
lambda_value = self.cfg.TEST.RERANK.LAMBDA
if self.cfg.TEST.METRIC == "cosine":
query_features = F.normalize(query_features, dim=1)
gallery_features = F.normalize(gallery_features, dim=1)
rerank_dist = build_dist(query_features, gallery_features, metric="jaccard", k1=k1, k2=k2)
dist = rerank_dist * (1 - lambda_value) + dist * lambda_value
from .rank import evaluate_rank
cmc, all_AP, all_INP = evaluate_rank(dist, query_pids, gallery_pids, query_camids, gallery_camids)
mAP = np.mean(all_AP)
mINP = np.mean(all_INP)
for r in [1, 5, 10]:
self._results['Rank-{}'.format(r)] = cmc[r - 1] * 100
self._results['mAP'] = mAP * 100
self._results['mINP'] = mINP * 100
self._results["metric"] = (mAP + cmc[0]) / 2 * 100
if self.cfg.TEST.ROC.ENABLED:
from .roc import evaluate_roc
scores, labels = evaluate_roc(dist, query_pids, gallery_pids, query_camids, gallery_camids)
fprs, tprs, thres = metrics.roc_curve(labels, scores)
for fpr in [1e-4, 1e-3, 1e-2]:
ind = np.argmin(np.abs(fprs - fpr))
self._results["TPR@FPR={:.0e}".format(fpr)] = tprs[ind]
return copy.deepcopy(self._results)
def _compile_dependencies(self):
# Since we only evaluate results on rank 0, we only need to compile
# the Cython evaluation tool on rank 0
if comm.is_main_process():
try:
from .rank_cylib.rank_cy import evaluate_cy
except ImportError:
start_time = time.time()
logger.info("> compiling reid evaluation cython tool")
compile_helper()
logger.info(
">>> done with reid evaluation cython tool. Compilation time: {:.3f} "
"seconds".format(time.time() - start_time))
comm.synchronize()
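# --- Illustrative sketch (not part of the original file) ---
# A minimal sketch of wiring ReidEvaluator into inference_on_dataset();
# `cfg`, `model`, `test_loader` and `num_query` (the number of query images
# that precede the gallery images in the test loader) are assumed to come
# from the usual fastreid setup.
def _run_reid_eval(cfg, model, test_loader, num_query):
    from .evaluator import inference_on_dataset
    evaluator = ReidEvaluator(cfg, num_query)
    results = inference_on_dataset(model, test_loader, evaluator)
    # e.g. OrderedDict([('Rank-1', ...), ('mAP', ...), ('metric', ...)])
    return results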