init

20e33356 · luopl · 20e33356 · 20e33356 · 20e33356 · 20e33356
Commit 20e33356 authored Aug 22, 2024 by luopl
8 changed files
--- a/mmdet/datasets/samplers/track_img_sampler.py
+++ b/mmdet/datasets/samplers/track_img_sampler.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import random
+from typing import Iterator, Optional, Sized
+
+import numpy as np
+from mmengine.dataset import ClassBalancedDataset, ConcatDataset
+from mmengine.dist import get_dist_info, sync_random_seed
+from torch.utils.data import Sampler
+
+from mmdet.registry import DATA_SAMPLERS
+from ..base_video_dataset import BaseVideoDataset
+
+
+@DATA_SAMPLERS.register_module()
+class TrackImgSampler(Sampler):
+    """Sampler that provides image-level sampling outputs for video datasets
+    in tracking tasks. It can be used in both distributed and
+    non-distributed environments.
+
+    If using the default sampler in pytorch, the subsequent data receiver will
+    get one video, which is not desired in some cases:
+    (Take a non-distributed environment as an example)
+    1. In test mode, we want only one image to be fed into the data pipeline.
+    This is in consideration of memory usage since feeding the whole video
+    commonly requires a large amount of memory (>=20G on MOTChallenge17
+    dataset), which is not available on some machines.
+    2. In training mode, we may want to make sure all the images in one video
+    are randomly sampled once in one epoch and this can not be guaranteed in
+    the default sampler in pytorch.
+
+    Every index produced by the sampler is a ``(video_ind, frame_ind)`` tuple.
+    In test mode the videos are split into ``world_size`` chunks and each rank
+    iterates over the frames of its own chunk in order. In training mode all
+    ``(video_ind, frame_ind)`` pairs are shuffled with a generator seeded by
+    ``epoch + seed``, padded so that every rank gets the same number of
+    samples, and then strided across ranks.
+
+    Args:
+        dataset (Sized): Dataset used for sampling.
+        seed (int, optional): random seed used to shuffle the sampler. This
+            number should be identical across all processes in the distributed
+            group. Defaults to None.
+
+    Raises:
+        AssertionError: If ``dataset`` is neither a ``BaseVideoDataset`` nor a
+            ``ConcatDataset`` / ``ClassBalancedDataset`` wrapping one, or if a
+            wrapped dataset is in test mode.
+        ValueError: If, in test mode, fewer videos than processes are loaded.
+    """
+
+    def __init__(
+        self,
+        dataset: Sized,
+        seed: Optional[int] = None,
+    ) -> None:
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+        self.epoch = 0
+        if seed is None:
+            self.seed = sync_random_seed()
+        else:
+            self.seed = seed
+
+        self.dataset = dataset
+        self.indices = []
+        # Hard code here to handle different dataset wrapper
+        if isinstance(self.dataset, ConcatDataset):
+            cat_datasets = self.dataset.datasets
+            assert isinstance(
+                cat_datasets[0], BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(cat_datasets[0])}'
+            self.test_mode = cat_datasets[0].test_mode
+            # The message is parenthesized so that the whole text belongs to
+            # the assert instead of becoming a separate no-op statement.
+            assert not self.test_mode, (
+                "'ConcatDataset' should not exist in test mode")
+            # NOTE(review): ``video_ind`` restarts from 0 for every
+            # sub-dataset; whether the consumer expects an index that is
+            # global to the concatenated dataset is not visible here.
+            # TODO confirm.
+            for sub_dataset in cat_datasets:
+                for video_ind in range(len(sub_dataset)):
+                    self.indices.extend(
+                        self._frame_indices(sub_dataset, video_ind))
+        elif isinstance(self.dataset, ClassBalancedDataset):
+            ori_dataset = self.dataset.dataset
+            assert isinstance(
+                ori_dataset, BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(ori_dataset)}'
+            self.test_mode = ori_dataset.test_mode
+            assert not self.test_mode, (
+                "'ClassBalancedDataset' should not exist in test mode")
+            # ``repeat_indices`` may list a video several times; each
+            # occurrence contributes all frames of that video again.
+            for index in self.dataset.repeat_indices:
+                self.indices.extend(self._frame_indices(ori_dataset, index))
+        else:
+            assert isinstance(self.dataset, BaseVideoDataset), (
+                'TrackImgSampler is only supported in BaseVideoDataset or '
+                'dataset wrapper: ClassBalancedDataset and ConcatDataset, but '
+                f'got {type(self.dataset)}')
+            self.test_mode = self.dataset.test_mode
+            num_videos = len(self.dataset)
+
+            if self.test_mode:
+                # in test mode, the images belong to the same video must be put
+                # on the same device.
+                if num_videos < self.world_size:
+                    raise ValueError(
+                        f'only {num_videos} videos loaded, '
+                        f'but {self.world_size} gpus were given.')
+                chunks = np.array_split(
+                    list(range(num_videos)), self.world_size)
+                # ``self.indices`` becomes a list of per-rank lists here.
+                for videos_inds in chunks:
+                    indices_chunk = []
+                    for video_ind in videos_inds:
+                        indices_chunk.extend(
+                            self._frame_indices(self.dataset, video_ind))
+                    self.indices.append(indices_chunk)
+            else:
+                for video_ind in range(num_videos):
+                    self.indices.extend(
+                        self._frame_indices(self.dataset, video_ind))
+
+        if self.test_mode:
+            self.num_samples = len(self.indices[self.rank])
+            self.total_size = sum(
+                [len(index_list) for index_list in self.indices])
+        else:
+            self.num_samples = int(
+                math.ceil(len(self.indices) * 1.0 / self.world_size))
+            self.total_size = self.num_samples * self.world_size
+
+    @staticmethod
+    def _frame_indices(dataset, video_ind) -> list:
+        """Return ``(video_ind, frame_ind)`` for every frame of one video."""
+        return [(video_ind, frame_ind)
+                for frame_ind in range(dataset.get_len_per_video(video_ind))]
+
+    def __iter__(self) -> Iterator:
+        """Iterate over the ``(video_ind, frame_ind)`` tuples of this rank."""
+        if self.test_mode:
+            # in test mode, the order of frames can not be shuffled.
+            indices = self.indices[self.rank]
+        else:
+            # deterministically shuffle based on epoch
+            rng = random.Random(self.epoch + self.seed)
+            indices = rng.sample(self.indices, len(self.indices))
+
+            # add extra samples to make it evenly divisible; the shuffled
+            # list is repeated when it is shorter than the shortfall (fewer
+            # samples than processes), otherwise a single slice is appended.
+            shortfall = self.total_size - len(indices)
+            if shortfall > 0:
+                repeats = math.ceil(shortfall / len(indices))
+                indices += (indices * repeats)[:shortfall]
+            assert len(indices) == self.total_size
+
+            # subsample
+            indices = indices[self.rank:self.total_size:self.world_size]
+            assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self) -> int:
+        """Return the number of samples drawn by the current rank."""
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        """Set the epoch that is added to the seed when shuffling."""
+        self.epoch = epoch
--- a/mmdet/datasets/transforms/__init__.py
+++ b/mmdet/datasets/transforms/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from .augment_wrappers import AutoAugment, RandAugment
+from .colorspace import (AutoContrast, Brightness, Color, ColorTransform,
+                         Contrast, Equalize, Invert, Posterize, Sharpness,
+                         Solarize, SolarizeAdd)
+from .formatting import (ImageToTensor, PackDetInputs, PackReIDInputs,
+                         PackTrackInputs, ToTensor, Transpose)
+from .frame_sampling import BaseFrameSample, UniformRefFrameSample
+from .geometric import (GeomTransform, Rotate, ShearX, ShearY, TranslateX,
+                        TranslateY)
+from .instaboost import InstaBoost
+from .loading import (FilterAnnotations, InferencerLoader, LoadAnnotations,
+                      LoadEmptyAnnotations, LoadImageFromNDArray,
+                      LoadMultiChannelImageFromFiles, LoadPanopticAnnotations,
+                      LoadProposals, LoadTrackAnnotations)
+from .text_transformers import LoadTextAnnotations, RandomSamplingNegPos
+from .transformers_glip import GTBoxSubOne_GLIP, RandomFlip_GLIP
+from .transforms import (Albu, CachedMixUp, CachedMosaic, CopyPaste, CutOut,
+                         Expand, FixScaleResize, FixShapeResize,
+                         MinIoURandomCrop, MixUp, Mosaic, Pad,
+                         PhotoMetricDistortion, RandomAffine,
+                         RandomCenterCropPad, RandomCrop, RandomErasing,
+                         RandomFlip, RandomShift, Resize, ResizeShortestEdge,
+                         SegRescale, YOLOXHSVRandomAug)
+from .wrappers import MultiBranch, ProposalBroadcaster, RandomOrder
+
+# Public names of this package: every entry is one of the transform classes
+# imported from the submodules above.
+__all__ = [
+    'PackDetInputs', 'ToTensor', 'ImageToTensor', 'Transpose',
+    'LoadImageFromNDArray', 'LoadAnnotations', 'LoadPanopticAnnotations',
+    'LoadMultiChannelImageFromFiles', 'LoadProposals', 'Resize', 'RandomFlip',
+    'RandomCrop', 'SegRescale', 'MinIoURandomCrop', 'Expand',
+    'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad',
+    'AutoAugment', 'CutOut', 'ShearX', 'ShearY', 'Rotate', 'Color', 'Equalize',
+    'Brightness', 'Contrast', 'TranslateX', 'TranslateY', 'RandomShift',
+    'Mosaic', 'MixUp', 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste',
+    'FilterAnnotations', 'Pad', 'GeomTransform', 'ColorTransform',
+    'RandAugment', 'Sharpness', 'Solarize', 'SolarizeAdd', 'Posterize',
+    'AutoContrast', 'Invert', 'MultiBranch', 'RandomErasing',
+    'LoadEmptyAnnotations', 'RandomOrder', 'CachedMosaic', 'CachedMixUp',
+    'FixShapeResize', 'ProposalBroadcaster', 'InferencerLoader',
+    'LoadTrackAnnotations', 'BaseFrameSample', 'UniformRefFrameSample',
+    'PackTrackInputs', 'PackReIDInputs', 'FixScaleResize',
+    'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP',
+    'RandomSamplingNegPos', 'LoadTextAnnotations'
+]
--- a/mmdet/datasets/transforms/augment_wrappers.py
+++ b/mmdet/datasets/transforms/augment_wrappers.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import numpy as np
+from mmcv.transforms import RandomChoice
+from mmcv.transforms.utils import cache_randomness
+from mmengine.config import ConfigDict
+
+from mmdet.registry import TRANSFORMS
+
+# AutoAugment uses reinforcement learning to search for
+# some widely useful data augmentation strategies,
+# here we provide AUTOAUG_POLICIES_V0.
+# For AUTOAUG_POLICIES_V0, each tuple is an augmentation
+# operation of the form (operation, probability, magnitude).
+# Each element in policies is a policy that will be applied
+# sequentially on the image.
+
+# RandAugment defines a data augmentation search space, RANDAUG_SPACE,
+# sampling 1~3 data augmentations each time, and
+# setting the magnitude of each data augmentation randomly,
+# which will be applied sequentially on the image.
+
+# Upper bound of the integer ``level`` scale. ``level_to_mag`` divides a level
+# by this value before interpolating between ``min_mag`` and ``max_mag``.
+_MAX_LEVEL = 10
+
+# Each inner list is one policy; each tuple is (operation, probability,
+# level) and is converted into a transform config by ``policies_v0``.
+AUTOAUG_POLICIES_V0 = [
+    [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
+    [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
+    [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
+    [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
+    [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
+    [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
+    [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
+    [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
+    [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
+    [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
+    [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
+    [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
+    [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
+    [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
+    [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
+    [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
+    [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
+    [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
+    [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
+    [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
+    [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
+    [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
+    [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
+    [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
+    [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
+]
+
+
+def policies_v0():
+    """Build the AutoAugment policies used in the AutoAugment paper.
+
+    Returns:
+        list[list[dict]]: One list per policy in ``AUTOAUG_POLICIES_V0``,
+        where every ``(operation, probability, level)`` tuple has been turned
+        into ``dict(type=operation, prob=probability, level=level)``.
+    """
+    return [[
+        dict(type=op_name, prob=op_prob, level=op_level)
+        for op_name, op_prob, op_level in sub_policy
+    ] for sub_policy in AUTOAUG_POLICIES_V0]
+
+
+# Search space of RandAugment: one single-transform pipeline per operation,
+# each configured only by its ``type`` name.
+RANDAUG_SPACE = [[dict(type=aug_name)] for aug_name in (
+    'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
+    'SolarizeAdd', 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX',
+    'ShearY', 'TranslateX', 'TranslateY')]
+
+
+def level_to_mag(level: Optional[int], min_mag: float,
+                 max_mag: float) -> float:
+    """Map a level to a magnitude in ``[min_mag, max_mag]``.
+
+    Args:
+        level (int, optional): Position on the ``[0, _MAX_LEVEL]`` scale.
+            If None, a uniformly random position is drawn instead.
+        min_mag (float): Magnitude produced at the lowest position.
+        max_mag (float): Magnitude produced at the highest position.
+
+    Returns:
+        float: The interpolated magnitude rounded to one decimal place.
+    """
+    # Fraction of the way from ``min_mag`` to ``max_mag``.
+    if level is None:
+        fraction = np.random.rand()
+    else:
+        fraction = level / _MAX_LEVEL
+    return round(fraction * (max_mag - min_mag) + min_mag, 1)
+
+
+@TRANSFORMS.register_module()
+class AutoAugment(RandomChoice):
+    """Auto augmentation.
+
+    This data augmentation is proposed in `AutoAugment: Learning
+    Augmentation Policies from Data <https://arxiv.org/abs/1805.09501>`_
+    and in `Learning Data Augmentation Strategies for Object Detection
+    <https://arxiv.org/pdf/1906.11172>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        policies (List[List[Union[dict, ConfigDict]]]):
+            The policies of auto augmentation. Each policy in ``policies``
+            is a specific augmentation policy, and is composed of several
+            augmentations. When AutoAugment is called, a random policy in
+            ``policies`` will be selected to augment images.
+            Defaults to policies_v0().
+        prob (list[float], optional): The probabilities associated
+            with each policy. The length should be equal to the policy
+            number and the sum should be 1. If not given, a uniform
+            distribution will be assumed. Defaults to None.
+
+    Examples:
+        >>> policies = [
+        >>>     [
+        >>>         dict(type='Sharpness', prob=0.0, level=8),
+        >>>         dict(type='ShearX', prob=0.4, level=0,)
+        >>>     ],
+        >>>     [
+        >>>         dict(type='Rotate', prob=0.6, level=10),
+        >>>         dict(type='Color', prob=1.0, level=6)
+        >>>     ]
+        >>> ]
+        >>> augmentation = AutoAugment(policies)
+        >>> img = np.ones((100, 100, 3))
+        >>> gt_bboxes = np.ones((10, 4))
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 policies: List[List[Union[dict, ConfigDict]]] = policies_v0(),
+                 prob: Optional[List[float]] = None) -> None:
+        # Validate the nested structure before handing it to RandomChoice.
+        assert isinstance(policies, list) and len(policies) > 0, (
+            'Policies must be a non-empty list.')
+        for sub_policy in policies:
+            assert isinstance(sub_policy, list) and len(sub_policy) > 0, (
+                'Each policy in policies must be a non-empty list.')
+            for aug_cfg in sub_policy:
+                assert isinstance(aug_cfg, dict) and 'type' in aug_cfg, (
+                    'Each specific augmentation must be a dict with key'
+                    ' "type".')
+        super().__init__(transforms=policies, prob=prob)
+        # Keep the raw configs so that ``__repr__`` can print them.
+        self.policies = policies
+
+    def __repr__(self) -> str:
+        cls_name = self.__class__.__name__
+        return f'{cls_name}(policies={self.policies}, prob={self.prob})'
+
+
+@TRANSFORMS.register_module()
+class RandAugment(RandomChoice):
+    """Rand augmentation.
+
+    This data augmentation is proposed in `RandAugment:
+    Practical automated data augmentation with a reduced
+    search space <https://arxiv.org/abs/1909.13719>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        aug_space (List[List[Union[dict, ConfigDict]]]): The augmentation space
+            of rand augmentation. Each entry of ``aug_space`` is a list that
+            holds exactly one transform config. When RandAugment is called,
+            ``aug_num`` distinct entries of ``aug_space`` are randomly
+            selected and applied to the images one after another.
+            Defaults to RANDAUG_SPACE.
+        aug_num (int): Number of augmentations to apply sequentially. It
+            can not exceed the size of ``aug_space`` because the
+            augmentations are drawn without replacement. Defaults to 2.
+        prob (list[float], optional): The probabilities associated with
+            each augmentation. The length should be equal to the
+            augmentation space and the sum should be 1. If not given,
+            a uniform distribution will be assumed. Defaults to None.
+
+    Examples:
+        >>> aug_space = [
+        >>>     [dict(type='Sharpness')],
+        >>>     [dict(type='ShearX')],
+        >>>     [dict(type='Color')],
+        >>> ]
+        >>> augmentation = RandAugment(aug_space)
+        >>> img = np.ones((100, 100, 3))
+        >>> gt_bboxes = np.ones((10, 4))
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 aug_space: List[List[Union[dict,
+                                            ConfigDict]]] = RANDAUG_SPACE,
+                 aug_num: int = 2,
+                 prob: Optional[List[float]] = None) -> None:
+        assert isinstance(aug_space, list) and len(aug_space) > 0, \
+            'Augmentation space must be a non-empty list.'
+        for aug in aug_space:
+            assert isinstance(aug, list) and len(aug) == 1, \
+                'Each augmentation in aug_space must be a list ' \
+                'containing exactly one transform.'
+            for transform in aug:
+                assert isinstance(transform, dict) and 'type' in transform, \
+                    'Each specific transform must be a dict with key' \
+                    ' "type".'
+        # ``random_pipeline_index`` samples without replacement, so asking
+        # for more augmentations than available would only fail later, at
+        # the first call of ``transform``.
+        assert 0 <= aug_num <= len(aug_space), \
+            f'aug_num should be in range [0, {len(aug_space)}] because ' \
+            f'augmentations are drawn without replacement, got {aug_num}.'
+        super().__init__(transforms=aug_space, prob=prob)
+        self.aug_space = aug_space
+        self.aug_num = aug_num
+
+    @cache_randomness
+    def random_pipeline_index(self):
+        """Draw ``aug_num`` distinct indices into ``self.transforms``."""
+        indices = np.arange(len(self.transforms))
+        return np.random.choice(
+            indices, self.aug_num, p=self.prob, replace=False)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to use RandAugment.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with RandAugment.
+        """
+        # Apply the selected pipelines one after another.
+        for idx in self.random_pipeline_index():
+            results = self.transforms[idx](results)
+        return results
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(' \
+               f'aug_space={self.aug_space}, '\
+               f'aug_num={self.aug_num}, ' \
+               f'prob={self.prob})'
--- a/mmdet/datasets/transforms/colorspace.py
+++ b/mmdet/datasets/transforms/colorspace.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional
+
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmcv.transforms.utils import cache_randomness
+
+from mmdet.registry import TRANSFORMS
+from .augment_wrappers import _MAX_LEVEL, level_to_mag
+
+
+@TRANSFORMS.register_module()
+class ColorTransform(BaseTransform):
+    """Base class for color transformations. All color transformations need to
+    inherit from this base class. ``ColorTransform`` unifies the class
+    attributes and class functions of color transformations (Color, Brightness,
+    Contrast, Sharpness, Solarize, SolarizeAdd, Equalize, AutoContrast, Invert,
+    and Posterize), and only distorts color channels, without impacting the
+    locations of the instances.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing the color
+            transformation and should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for color transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for color transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0 <= prob <= 1.0, (
+            f'The probability of the transformation '
+            f'should be in range [0,1], got {prob}.')
+        # ``level`` is only checked when it is given.
+        if level is not None:
+            assert isinstance(level, int), (
+                f'The level should be None or type int, got {type(level)}.')
+            assert 0 <= level <= _MAX_LEVEL, (
+                f'The level should be in range [0,{_MAX_LEVEL}], got {level}.')
+        assert isinstance(min_mag, float), (
+            f'min_mag should be type float, got {type(min_mag)}.')
+        assert isinstance(max_mag, float), (
+            f'max_mag should be type float, got {type(max_mag)}.')
+        assert min_mag <= max_mag, (
+            f'min_mag should smaller than max_mag, '
+            f'got min_mag={min_mag} and max_mag={max_mag}')
+        self.prob = prob
+        self.level = level
+        self.min_mag = min_mag
+        self.max_mag = max_mag
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Transform the image in ``results``; a no-op in the base class."""
+        pass
+
+    @cache_randomness
+    def _random_disable(self):
+        """Return True when this call should skip the transform."""
+        return np.random.rand() > self.prob
+
+    @cache_randomness
+    def _get_mag(self):
+        """Return the magnitude derived from ``level``."""
+        return level_to_mag(self.level, self.min_mag, self.max_mag)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function for images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Transformed results.
+        """
+        if not self._random_disable():
+            self._transform_img(results, self._get_mag())
+        return results
+
+    def __repr__(self) -> str:
+        return (f'{self.__class__.__name__}(prob={self.prob}, '
+                f'level={self.level}, '
+                f'min_mag={self.min_mag}, '
+                f'max_mag={self.max_mag})')
+
+
+@TRANSFORMS.register_module()
+class Color(ColorTransform):
+    """Adjust the color balance of the image, in a manner similar to the
+    controls on a colour TV set. A magnitude=0 gives a black & white image,
+    whereas magnitude=1 gives the original image. The bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Color transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Color transformation.
+            Should be in range [0, 2]. Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Color transformation.
+            Should be in range [0, 2]. Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 2.0, \
+                f'{bound_name} for Color should be in range [0,2], ' \
+                f'got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Apply Color transformation to image."""
+        # NOTE defaultly the image should be BGR format
+        src = results['img']
+        adjusted = mmcv.adjust_color(src, mag)
+        # Cast back so the image keeps its original dtype.
+        results['img'] = adjusted.astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class Brightness(ColorTransform):
+    """Adjust the brightness of the image. A magnitude=0 gives a black image,
+    whereas magnitude=1 gives the original image. The bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Brightness transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Brightness transformation.
+            Should be in range [0, 2]. Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Brightness transformation.
+            Should be in range [0, 2]. Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 2.0, \
+                f'{bound_name} for Brightness should be in range [0,2], ' \
+                f'got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the brightness of image."""
+        src = results['img']
+        adjusted = mmcv.adjust_brightness(src, mag)
+        # Cast back so the image keeps its original dtype.
+        results['img'] = adjusted.astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class Contrast(ColorTransform):
+    """Control the contrast of the image. A magnitude=0 gives a gray image,
+    whereas magnitude=1 gives the original image. The bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Contrast transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Contrast transformation.
+            Should be in range [0, 2]. Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Contrast transformation.
+            Should be in range [0, 2]. Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 2.0, \
+                f'{bound_name} for Contrast should be in range [0,2], ' \
+                f'got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the image contrast."""
+        src = results['img']
+        adjusted = mmcv.adjust_contrast(src, mag)
+        # Cast back so the image keeps its original dtype.
+        results['img'] = adjusted.astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class Sharpness(ColorTransform):
+    """Adjust images sharpness. The magnitude is forwarded to
+    ``mmcv.adjust_sharpness`` as its enhancement factor; per the mmcv
+    documentation a factor of 0 gives a blurred image, a factor of 1 gives
+    the original image and a larger factor gives a sharpened image. Negative
+    magnitudes are rejected by the constructor.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Sharpness transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Sharpness transformation.
+            Should be in range [0, 2]. Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Sharpness transformation.
+            Should be in range [0, 2]. Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 2.0, \
+                f'{bound_name} for Sharpness should be in range [0,2], ' \
+                f'got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the image sharpness."""
+        src = results['img']
+        adjusted = mmcv.adjust_sharpness(src, mag)
+        # Cast back so the image keeps its original dtype.
+        results['img'] = adjusted.astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class Solarize(ColorTransform):
+    """Solarize images (invert all pixels above a threshold value given by
+    the magnitude).
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Solarize transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Solarize transformation.
+            Should be in range [0, 256]. Defaults to 0.0.
+        max_mag (float): The maximum magnitude for Solarize transformation.
+            Should be in range [0, 256]. Defaults to 256.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 256.0) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 256.0, \
+                f'{bound_name} for Solarize should be ' \
+                f'in range [0, 256], got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Invert all pixel values above magnitude."""
+        src = results['img']
+        solarized = mmcv.solarize(src, mag)
+        # Cast back so the image keeps its original dtype.
+        results['img'] = solarized.astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class SolarizeAdd(ColorTransform):
+    """SolarizeAdd images. Every pixel value below 128 is increased by the
+    magnitude and capped at 255; other pixel values are left unchanged.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing SolarizeAdd
+            transformation. Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for SolarizeAdd transformation.
+            Should be in range [0, 110]. Defaults to 0.0.
+        max_mag (float): The maximum magnitude for SolarizeAdd transformation.
+            Should be in range [0, 110]. Defaults to 110.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 110.0) -> None:
+        # Both bounds share the same valid range.
+        for bound_name, bound in (('min_mag', min_mag), ('max_mag', max_mag)):
+            assert 0. <= bound <= 110.0, \
+                f'{bound_name} for SolarizeAdd should be ' \
+                f'in range [0, 110], got {bound}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """SolarizeAdd the image."""
+        src = results['img']
+        # Brightened candidate for every pixel, capped at the 8-bit maximum.
+        boosted = np.minimum(src + mag, 255)
+        # Only pixels darker than 128 take the brightened value.
+        results['img'] = np.where(src < 128, boosted, src).astype(src.dtype)
+
+
+@TRANSFORMS.register_module()
+class Posterize(ColorTransform):
+    """Posterize images (reduce the number of bits for each color channel).
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Posterize
+            transformation. Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Posterize transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for Posterize transformation.
+            Defaults to 4.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 4.0) -> None:
+        assert 0. <= min_mag <= 8.0, f'min_mag for Posterize should be ' \
+                                     f'in range [0, 8], got {min_mag}.'
+        assert 0. <= max_mag <= 8.0, f'max_mag for Posterize should be ' \
+                                     f'in range [0, 8], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Posterize the image."""
+        img = results['img']
+        results['img'] = mmcv.posterize(img, math.ceil(mag)).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Equalize(ColorTransform):
+    """Equalize the image histogram. The bboxes, masks and segmentations are
+    not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Equalize transformation.
+            Defaults to 1.0.
+        level (int, optional): No use for Equalize transformation.
+            Defaults to None.
+        min_mag (float): No use for Equalize transformation. Defaults to 0.1.
+        max_mag (float): No use for Equalize transformation. Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Equalizes the histogram of one image."""
+        img = results['img']
+        results['img'] = mmcv.imequalize(img).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class AutoContrast(ColorTransform):
+    """Auto adjust image contrast.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing AutoContrast should
+             be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): No use for AutoContrast transformation.
+            Defaults to None.
+        min_mag (float): No use for AutoContrast transformation.
+            Defaults to 0.1.
+        max_mag (float): No use for AutoContrast transformation.
+            Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Auto adjust image contrast."""
+        img = results['img']
+        results['img'] = mmcv.auto_contrast(img).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Invert(ColorTransform):
+    """Invert images.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing invert therefore should
+             be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): No use for Invert transformation.
+            Defaults to None.
+        min_mag (float): No use for Invert transformation. Defaults to 0.1.
+        max_mag (float): No use for Invert transformation. Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Invert the image."""
+        img = results['img']
+        results['img'] = mmcv.iminvert(img).astype(img.dtype)
--- a/mmdet/datasets/transforms/formatting.py
+++ b/mmdet/datasets/transforms/formatting.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence
+
+import numpy as np
+from mmcv.transforms import to_tensor
+from mmcv.transforms.base import BaseTransform
+from mmengine.structures import InstanceData, PixelData
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures import DetDataSample, ReIDDataSample, TrackDataSample
+from mmdet.structures.bbox import BaseBoxes
+
+
+@TRANSFORMS.register_module()
+class PackDetInputs(BaseTransform):
+    """Pack the inputs data for the detection / semantic segmentation /
+    panoptic segmentation.
+
+    The ``img_meta`` item is always populated.  The contents of the
+    ``img_meta`` dictionary depends on ``meta_keys``. By default this includes:
+
+        - ``img_id``: id of the image
+
+        - ``img_path``: path to the image file
+
+        - ``ori_shape``: original shape of the image as a tuple (h, w)
+
+        - ``img_shape``: shape of the image input to the network as a tuple \
+            (h, w).  Note that images may be zero padded on the \
+            bottom/right if the batch tensor is larger than this shape.
+
+        - ``scale_factor``: a float indicating the preprocessing scale
+
+        - ``flip``: a boolean indicating if image flip transform was used
+
+        - ``flip_direction``: the flipping direction
+
+    Args:
+        meta_keys (Sequence[str], optional): Meta keys to be converted to
+            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+            Default: ``('img_id', 'img_path', 'ori_shape', 'img_shape',
+            'scale_factor', 'flip', 'flip_direction')``
+    """
+    mapping_table = {
+        'gt_bboxes': 'bboxes',
+        'gt_bboxes_labels': 'labels',
+        'gt_masks': 'masks'
+    }
+
+    def __init__(self,
+                 meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                            'scale_factor', 'flip', 'flip_direction')):
+        self.meta_keys = meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+
+        Args:
+            results (dict): Result dict from the data pipeline.
+
+        Returns:
+            dict:
+
+            - 'inputs' (obj:`torch.Tensor`): The forward data of models.
+            - 'data_sample' (obj:`DetDataSample`): The annotation info of the
+                sample.
+        """
+        packed_results = dict()
+        if 'img' in results:
+            img = results['img']
+            if len(img.shape) < 3:
+                img = np.expand_dims(img, -1)
+            # To improve the computational speed by by 3-5 times, apply:
+            # If image is not contiguous, use
+            # `numpy.transpose()` followed by `numpy.ascontiguousarray()`
+            # If image is already contiguous, use
+            # `torch.permute()` followed by `torch.contiguous()`
+            # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
+            # for more details
+            if not img.flags.c_contiguous:
+                img = np.ascontiguousarray(img.transpose(2, 0, 1))
+                img = to_tensor(img)
+            else:
+                img = to_tensor(img).permute(2, 0, 1).contiguous()
+
+            packed_results['inputs'] = img
+
+        if 'gt_ignore_flags' in results:
+            valid_idx = np.where(results['gt_ignore_flags'] == 0)[0]
+            ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0]
+
+        data_sample = DetDataSample()
+        instance_data = InstanceData()
+        ignore_instance_data = InstanceData()
+
+        for key in self.mapping_table.keys():
+            if key not in results:
+                continue
+            if key == 'gt_masks' or isinstance(results[key], BaseBoxes):
+                if 'gt_ignore_flags' in results:
+                    instance_data[
+                        self.mapping_table[key]] = results[key][valid_idx]
+                    ignore_instance_data[
+                        self.mapping_table[key]] = results[key][ignore_idx]
+                else:
+                    instance_data[self.mapping_table[key]] = results[key]
+            else:
+                if 'gt_ignore_flags' in results:
+                    instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key][valid_idx])
+                    ignore_instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key][ignore_idx])
+                else:
+                    instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key])
+        data_sample.gt_instances = instance_data
+        data_sample.ignored_instances = ignore_instance_data
+
+        if 'proposals' in results:
+            proposals = InstanceData(
+                bboxes=to_tensor(results['proposals']),
+                scores=to_tensor(results['proposals_scores']))
+            data_sample.proposals = proposals
+
+        if 'gt_seg_map' in results:
+            gt_sem_seg_data = dict(
+                sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy()))
+            gt_sem_seg_data = PixelData(**gt_sem_seg_data)
+            if 'ignore_index' in results:
+                metainfo = dict(ignore_index=results['ignore_index'])
+                gt_sem_seg_data.set_metainfo(metainfo)
+            data_sample.gt_sem_seg = gt_sem_seg_data
+
+        img_meta = {}
+        for key in self.meta_keys:
+            if key in results:
+                img_meta[key] = results[key]
+        data_sample.set_metainfo(img_meta)
+        packed_results['data_samples'] = data_sample
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ToTensor:
+    """Convert some results to :obj:`torch.Tensor` by given keys.
+
+    Args:
+        keys (Sequence[str]): Keys that need to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Call function to convert data in results to :obj:`torch.Tensor`.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The result dict contains the data converted
+                to :obj:`torch.Tensor`.
+        """
+        for key in self.keys:
+            results[key] = to_tensor(results[key])
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@TRANSFORMS.register_module()
+class ImageToTensor:
+    """Convert image to :obj:`torch.Tensor` by given keys.
+
+    The dimension order of input image is (H, W, C). The pipeline will convert
+    it to (C, H, W). If only 2 dimension (H, W) is given, the output would be
+    (1, H, W).
+
+    Args:
+        keys (Sequence[str]): Key of images to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Call function to convert image in results to :obj:`torch.Tensor` and
+        transpose the channel order.
+
+        Args:
+            results (dict): Result dict contains the image data to convert.
+
+        Returns:
+            dict: The result dict contains the image converted
+                to :obj:`torch.Tensor` and permuted to (C, H, W) order.
+        """
+        for key in self.keys:
+            img = results[key]
+            if len(img.shape) < 3:
+                img = np.expand_dims(img, -1)
+            results[key] = to_tensor(img).permute(2, 0, 1).contiguous()
+
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@TRANSFORMS.register_module()
+class Transpose:
+    """Transpose some results by given keys.
+
+    Args:
+        keys (Sequence[str]): Keys of results to be transposed.
+        order (Sequence[int]): Order of transpose.
+    """
+
+    def __init__(self, keys, order):
+        self.keys = keys
+        self.order = order
+
+    def __call__(self, results):
+        """Call function to transpose the channel order of data in results.
+
+        Args:
+            results (dict): Result dict contains the data to transpose.
+
+        Returns:
+            dict: The result dict contains the data transposed to \
+                ``self.order``.
+        """
+        for key in self.keys:
+            results[key] = results[key].transpose(self.order)
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + \
+            f'(keys={self.keys}, order={self.order})'
+
+
+@TRANSFORMS.register_module()
+class WrapFieldsToLists:
+    """Wrap fields of the data dictionary into lists for evaluation.
+
+    This class can be used as a last step of a test or validation
+    pipeline for single image evaluation or inference.
+
+    Example:
+        >>> test_pipeline = [
+        >>>    dict(type='LoadImageFromFile'),
+        >>>    dict(type='Normalize',
+                    mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True),
+        >>>    dict(type='Pad', size_divisor=32),
+        >>>    dict(type='ImageToTensor', keys=['img']),
+        >>>    dict(type='Collect', keys=['img']),
+        >>>    dict(type='WrapFieldsToLists')
+        >>> ]
+    """
+
+    def __call__(self, results):
+        """Call function to wrap fields into lists.
+
+        Args:
+            results (dict): Result dict contains the data to wrap.
+
+        Returns:
+            dict: The result dict where value of ``self.keys`` are wrapped \
+                into list.
+        """
+
+        # Wrap dict fields into lists
+        for key, val in results.items():
+            results[key] = [val]
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}()'
+
+
+@TRANSFORMS.register_module()
+class PackTrackInputs(BaseTransform):
+    """Pack the inputs data for the multi object tracking and video instance
+    segmentation. All the information of images are packed to ``inputs``. All
+    the information except images are packed to ``data_samples``. In order to
+    get the original annotaiton and meta info, we add `instances` key into meta
+    keys.
+
+    Args:
+        meta_keys (Sequence[str]): Meta keys to be collected in
+            ``data_sample.metainfo``. Defaults to None.
+        default_meta_keys (tuple): Default meta keys. Defaults to ('img_id',
+            'img_path', 'ori_shape', 'img_shape', 'scale_factor',
+            'flip', 'flip_direction', 'frame_id', 'is_video_data',
+            'video_id', 'video_length', 'instances').
+    """
+    mapping_table = {
+        'gt_bboxes': 'bboxes',
+        'gt_bboxes_labels': 'labels',
+        'gt_masks': 'masks',
+        'gt_instances_ids': 'instances_ids'
+    }
+
+    def __init__(self,
+                 meta_keys: Optional[dict] = None,
+                 default_meta_keys: tuple = ('img_id', 'img_path', 'ori_shape',
+                                             'img_shape', 'scale_factor',
+                                             'flip', 'flip_direction',
+                                             'frame_id', 'video_id',
+                                             'video_length',
+                                             'ori_video_length', 'instances')):
+        self.meta_keys = default_meta_keys
+        if meta_keys is not None:
+            if isinstance(meta_keys, str):
+                meta_keys = (meta_keys, )
+            else:
+                assert isinstance(meta_keys, tuple), \
+                    'meta_keys must be str or tuple'
+            self.meta_keys += meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+        Args:
+            results (dict): Result dict from the data pipeline.
+        Returns:
+            dict:
+            - 'inputs' (dict[Tensor]): The forward data of models.
+            - 'data_samples' (obj:`TrackDataSample`): The annotation info of
+                the samples.
+        """
+        packed_results = dict()
+        packed_results['inputs'] = dict()
+
+        # 1. Pack images
+        if 'img' in results:
+            imgs = results['img']
+            imgs = np.stack(imgs, axis=0)
+            imgs = imgs.transpose(0, 3, 1, 2)
+            packed_results['inputs'] = to_tensor(imgs)
+
+        # 2. Pack InstanceData
+        if 'gt_ignore_flags' in results:
+            gt_ignore_flags_list = results['gt_ignore_flags']
+            valid_idx_list, ignore_idx_list = [], []
+            for gt_ignore_flags in gt_ignore_flags_list:
+                valid_idx = np.where(gt_ignore_flags == 0)[0]
+                ignore_idx = np.where(gt_ignore_flags == 1)[0]
+                valid_idx_list.append(valid_idx)
+                ignore_idx_list.append(ignore_idx)
+
+        assert 'img_id' in results, "'img_id' must contained in the results "
+        'for counting the number of images'
+
+        num_imgs = len(results['img_id'])
+        instance_data_list = [InstanceData() for _ in range(num_imgs)]
+        ignore_instance_data_list = [InstanceData() for _ in range(num_imgs)]
+
+        for key in self.mapping_table.keys():
+            if key not in results:
+                continue
+            if key == 'gt_masks':
+                mapped_key = self.mapping_table[key]
+                gt_masks_list = results[key]
+                if 'gt_ignore_flags' in results:
+                    for i, gt_mask in enumerate(gt_masks_list):
+                        valid_idx, ignore_idx = valid_idx_list[
+                            i], ignore_idx_list[i]
+                        instance_data_list[i][mapped_key] = gt_mask[valid_idx]
+                        ignore_instance_data_list[i][mapped_key] = gt_mask[
+                            ignore_idx]
+
+                else:
+                    for i, gt_mask in enumerate(gt_masks_list):
+                        instance_data_list[i][mapped_key] = gt_mask
+
+            else:
+                anns_list = results[key]
+                if 'gt_ignore_flags' in results:
+                    for i, ann in enumerate(anns_list):
+                        valid_idx, ignore_idx = valid_idx_list[
+                            i], ignore_idx_list[i]
+                        instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(
+                                ann[valid_idx])
+                        ignore_instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(
+                                ann[ignore_idx])
+                else:
+                    for i, ann in enumerate(anns_list):
+                        instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(ann)
+
+        det_data_samples_list = []
+        for i in range(num_imgs):
+            det_data_sample = DetDataSample()
+            det_data_sample.gt_instances = instance_data_list[i]
+            det_data_sample.ignored_instances = ignore_instance_data_list[i]
+            det_data_samples_list.append(det_data_sample)
+
+        # 3. Pack metainfo
+        for key in self.meta_keys:
+            if key not in results:
+                continue
+            img_metas_list = results[key]
+            for i, img_meta in enumerate(img_metas_list):
+                det_data_samples_list[i].set_metainfo({f'{key}': img_meta})
+
+        track_data_sample = TrackDataSample()
+        track_data_sample.video_data_samples = det_data_samples_list
+        if 'key_frame_flags' in results:
+            key_frame_flags = np.asarray(results['key_frame_flags'])
+            key_frames_inds = np.where(key_frame_flags)[0].tolist()
+            ref_frames_inds = np.where(~key_frame_flags)[0].tolist()
+            track_data_sample.set_metainfo(
+                dict(key_frames_inds=key_frames_inds))
+            track_data_sample.set_metainfo(
+                dict(ref_frames_inds=ref_frames_inds))
+
+        packed_results['data_samples'] = track_data_sample
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'meta_keys={self.meta_keys}, '
+        repr_str += f'default_meta_keys={self.default_meta_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PackReIDInputs(BaseTransform):
+    """Pack the inputs data for the ReID. The ``meta_info`` item is always
+    populated. The contents of the ``meta_info`` dictionary depends on
+    ``meta_keys``. By default this includes:
+
+        - ``img_path``: path to the image file.
+        - ``ori_shape``: original shape of the image as a tuple (H, W).
+        - ``img_shape``: shape of the image input to the network as a tuple
+            (H, W). Note that images may be zero padded on the bottom/right
+          if the batch tensor is larger than this shape.
+        - ``scale``: scale of the image as a tuple (W, H).
+        - ``scale_factor``: a float indicating the pre-processing scale.
+        -  ``flip``: a boolean indicating if image flip transform was used.
+        - ``flip_direction``: the flipping direction.
+    Args:
+        meta_keys (Sequence[str], optional): The meta keys to saved in the
+            ``metainfo`` of the packed ``data_sample``.
+    """
+    default_meta_keys = ('img_path', 'ori_shape', 'img_shape', 'scale',
+                         'scale_factor')
+
+    def __init__(self, meta_keys: Sequence[str] = ()) -> None:
+        self.meta_keys = self.default_meta_keys
+        if meta_keys is not None:
+            if isinstance(meta_keys, str):
+                meta_keys = (meta_keys, )
+            else:
+                assert isinstance(meta_keys, tuple), \
+                    'meta_keys must be str or tuple.'
+            self.meta_keys += meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+        Args:
+            results (dict): Result dict from the data pipeline.
+        Returns:
+            dict:
+            - 'inputs' (dict[Tensor]): The forward data of models.
+            - 'data_samples' (obj:`ReIDDataSample`): The meta info of the
+                sample.
+        """
+        packed_results = dict(inputs=dict(), data_samples=None)
+        assert 'img' in results, 'Missing the key ``img``.'
+        _type = type(results['img'])
+        label = results['gt_label']
+
+        if _type == list:
+            img = results['img']
+            label = np.stack(label, axis=0)  # (N,)
+            assert all([type(v) == _type for v in results.values()]), \
+                'All items in the results must have the same type.'
+        else:
+            img = [results['img']]
+
+        img = np.stack(img, axis=3)  # (H, W, C, N)
+        img = img.transpose(3, 2, 0, 1)  # (N, C, H, W)
+        img = np.ascontiguousarray(img)
+
+        packed_results['inputs'] = to_tensor(img)
+
+        data_sample = ReIDDataSample()
+        data_sample.set_gt_label(label)
+
+        meta_info = dict()
+        for key in self.meta_keys:
+            meta_info[key] = results[key]
+        data_sample.set_metainfo(meta_info)
+        packed_results['data_samples'] = data_sample
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
--- a/mmdet/datasets/transforms/frame_sampling.py
+++ b/mmdet/datasets/transforms/frame_sampling.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+from collections import defaultdict
+from typing import Dict, List, Optional, Union
+
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class BaseFrameSample(BaseTransform):
+    """Directly get the key frame, no reference frames.
+
+    Args:
+        collect_video_keys (list[str]): The keys of video info to be
+            collected.
+    """
+
+    def __init__(self,
+                 collect_video_keys: List[str] = ['video_id', 'video_length']):
+        self.collect_video_keys = collect_video_keys
+
+    def prepare_data(self, video_infos: dict,
+                     sampled_inds: List[int]) -> Dict[str, List]:
+        """Prepare data for the subsequent pipeline.
+
+        Args:
+            video_infos (dict): The whole video information.
+            sampled_inds (list[int]): The sampled frame indices.
+
+        Returns:
+            dict: The processed data information.
+        """
+        frames_anns = video_infos['images']
+        final_data_info = defaultdict(list)
+        # for data in frames_anns:
+        for index in sampled_inds:
+            data = frames_anns[index]
+            # copy the info in video-level into img-level
+            for key in self.collect_video_keys:
+                if key == 'video_length':
+                    data['ori_video_length'] = video_infos[key]
+                    data['video_length'] = len(sampled_inds)
+                else:
+                    data[key] = video_infos[key]
+            # Collate data_list (list of dict to dict of list)
+            for key, value in data.items():
+                final_data_info[key].append(value)
+
+        return final_data_info
+
+    def transform(self, video_infos: dict) -> Optional[Dict[str, List]]:
+        """Transform the video information.
+
+        Args:
+            video_infos (dict): The whole video information.
+
+        Returns:
+            dict: The data information of the key frames.
+        """
+        if 'key_frame_id' in video_infos:
+            key_frame_id = video_infos['key_frame_id']
+            assert isinstance(video_infos['key_frame_id'], int)
+        else:
+            key_frame_id = random.sample(
+                list(range(video_infos['video_length'])), 1)[0]
+        results = self.prepare_data(video_infos, [key_frame_id])
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(collect_video_keys={self.collect_video_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class UniformRefFrameSample(BaseFrameSample):
+    """Uniformly sample reference frames.
+
+    Args:
+        num_ref_imgs (int): Number of reference frames to be sampled.
+        frame_range (int | list[int]): Range of frames to be sampled around
+            key frame. If int, the range is [-frame_range, frame_range].
+            Defaults to 10.
+        filter_key_img (bool): Whether to filter the key frame when
+            sampling reference frames. Defaults to True.
+        collect_video_keys (list[str]): The keys of video info to be
+            collected.
+    """
+
+    def __init__(self,
+                 num_ref_imgs: int = 1,
+                 frame_range: Union[int, List[int]] = 10,
+                 filter_key_img: bool = True,
+                 collect_video_keys: List[str] = ['video_id', 'video_length']):
+        self.num_ref_imgs = num_ref_imgs
+        self.filter_key_img = filter_key_img
+        if isinstance(frame_range, int):
+            assert frame_range >= 0, 'frame_range can not be a negative value.'
+            frame_range = [-frame_range, frame_range]
+        elif isinstance(frame_range, list):
+            assert len(frame_range) == 2, 'The length must be 2.'
+            assert frame_range[0] <= 0 and frame_range[1] >= 0
+            for i in frame_range:
+                assert isinstance(i, int), 'Each element must be int.'
+        else:
+            raise TypeError('The type of frame_range must be int or list.')
+        self.frame_range = frame_range
+        super().__init__(collect_video_keys=collect_video_keys)
+
+    def sampling_frames(self, video_length: int, key_frame_id: int):
+        """Sampling frames.
+
+        Args:
+            video_length (int): The length of the video.
+            key_frame_id (int): The key frame id.
+
+        Returns:
+            list[int]: The sampled frame indices.
+        """
+        if video_length > 1:
+            left = max(0, key_frame_id + self.frame_range[0])
+            right = min(key_frame_id + self.frame_range[1], video_length - 1)
+            frame_ids = list(range(0, video_length))
+
+            valid_ids = frame_ids[left:right + 1]
+            if self.filter_key_img and key_frame_id in valid_ids:
+                valid_ids.remove(key_frame_id)
+            assert len(
+                valid_ids
+            ) > 0, 'After filtering key frame, there are no valid frames'
+            if len(valid_ids) < self.num_ref_imgs:
+                valid_ids = valid_ids * self.num_ref_imgs
+            ref_frame_ids = random.sample(valid_ids, self.num_ref_imgs)
+        else:
+            ref_frame_ids = [key_frame_id] * self.num_ref_imgs
+
+        sampled_frames_ids = [key_frame_id] + ref_frame_ids
+        sampled_frames_ids = sorted(sampled_frames_ids)
+
+        key_frames_ind = sampled_frames_ids.index(key_frame_id)
+        key_frame_flags = [False] * len(sampled_frames_ids)
+        key_frame_flags[key_frames_ind] = True
+        return sampled_frames_ids, key_frame_flags
+
+    def transform(self, video_infos: dict) -> Optional[Dict[str, List]]:
+        """Transform the video information.
+
+        Args:
+            video_infos (dict): The whole video information.
+
+        Returns:
+            dict: The data information of the sampled frames.
+        """
+        if 'key_frame_id' in video_infos:
+            key_frame_id = video_infos['key_frame_id']
+            assert isinstance(video_infos['key_frame_id'], int)
+        else:
+            key_frame_id = random.sample(
+                list(range(video_infos['video_length'])), 1)[0]
+
+        (sampled_frames_ids, key_frame_flags) = self.sampling_frames(
+            video_infos['video_length'], key_frame_id=key_frame_id)
+        results = self.prepare_data(video_infos, sampled_frames_ids)
+        results['key_frame_flags'] = key_frame_flags
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_ref_imgs={self.num_ref_imgs}, '
+        repr_str += f'frame_range={self.frame_range}, '
+        repr_str += f'filter_key_img={self.filter_key_img}, '
+        repr_str += f'collect_video_keys={self.collect_video_keys})'
+        return repr_str
--- a/mmdet/datasets/transforms/geometric.py
+++ b/mmdet/datasets/transforms/geometric.py
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Optional, Union
+
+import cv2
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmcv.transforms.utils import cache_randomness
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import autocast_box_type
+from .augment_wrappers import _MAX_LEVEL, level_to_mag
+
+
+@TRANSFORMS.register_module()
+class GeomTransform(BaseTransform):
+    """Base class for geometric transformations. All geometric transformations
+    need to inherit from this base class. ``GeomTransform`` unifies the class
+    attributes and class functions of geometric transformations (ShearX,
+    ShearY, Rotate, TranslateX, and TranslateY), and records the homography
+    matrix.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing the geometric
+            transformation and should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for geometric transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for geometric transformation.
+            Defaults to 1.0.
+        reversal_prob (float): The probability that reverses the geometric
+            transformation magnitude. Should be in range [0,1].
+            Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 1.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0 <= prob <= 1.0, f'The probability of the transformation ' \
+                                 f'should be in range [0,1], got {prob}.'
+        assert level is None or isinstance(level, int), \
+            f'The level should be None or type int, got {type(level)}.'
+        assert level is None or 0 <= level <= _MAX_LEVEL, \
+            f'The level should be in range [0,{_MAX_LEVEL}], got {level}.'
+        assert isinstance(min_mag, float), \
+            f'min_mag should be type float, got {type(min_mag)}.'
+        assert isinstance(max_mag, float), \
+            f'max_mag should be type float, got {type(max_mag)}.'
+        assert min_mag <= max_mag, \
+            f'min_mag should smaller than max_mag, ' \
+            f'got min_mag={min_mag} and max_mag={max_mag}'
+        assert isinstance(reversal_prob, float), \
+            f'reversal_prob should be type float, got {type(max_mag)}.'
+        assert 0 <= reversal_prob <= 1.0, \
+            f'The reversal probability of the transformation magnitude ' \
+            f'should be type float, got {type(reversal_prob)}.'
+        if isinstance(img_border_value, (float, int)):
+            img_border_value = tuple([float(img_border_value)] * 3)
+        elif isinstance(img_border_value, tuple):
+            assert len(img_border_value) == 3, \
+                f'img_border_value as tuple must have 3 elements, ' \
+                f'got {len(img_border_value)}.'
+            img_border_value = tuple([float(val) for val in img_border_value])
+        else:
+            raise ValueError(
+                'img_border_value must be float or tuple with 3 elements.')
+        assert np.all([0 <= val <= 255 for val in img_border_value]), 'all ' \
+            'elements of img_border_value should between range [0,255].' \
+            f'got {img_border_value}.'
+        self.prob = prob
+        self.level = level
+        self.min_mag = min_mag
+        self.max_mag = max_mag
+        self.reversal_prob = reversal_prob
+        self.img_border_value = img_border_value
+        self.mask_border_value = mask_border_value
+        self.seg_ignore_label = seg_ignore_label
+        self.interpolation = interpolation
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Transform the image."""
+        pass
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Transform the masks."""
+        pass
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Transform the segmentation map."""
+        pass
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for the geometric transformation."""
+        return np.eye(3, dtype=np.float32)
+
+    def _transform_bboxes(self, results: dict, mag: float) -> None:
+        """Transform the bboxes."""
+        results['gt_bboxes'].project_(self.homography_matrix)
+        results['gt_bboxes'].clip_(results['img_shape'])
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the geometric transformation."""
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = self.homography_matrix
+        else:
+            results['homography_matrix'] = self.homography_matrix @ results[
+                'homography_matrix']
+
+    @cache_randomness
+    def _random_disable(self):
+        """Randomly disable the transform."""
+        return np.random.rand() > self.prob
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function for images, bounding boxes, masks and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Transformed results.
+        """
+
+        if self._random_disable():
+            return results
+        mag = self._get_mag()
+        self.homography_matrix = self._get_homography_matrix(results, mag)
+        self._record_homography_matrix(results)
+        self._transform_img(results, mag)
+        if results.get('gt_bboxes', None) is not None:
+            self._transform_bboxes(results, mag)
+        if results.get('gt_masks', None) is not None:
+            self._transform_masks(results, mag)
+        if results.get('gt_seg_map', None) is not None:
+            self._transform_seg(results, mag)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'level={self.level}, '
+        repr_str += f'min_mag={self.min_mag}, '
+        repr_str += f'max_mag={self.max_mag}, '
+        repr_str += f'reversal_prob={self.reversal_prob}, '
+        repr_str += f'img_border_value={self.img_border_value}, '
+        repr_str += f'mask_border_value={self.mask_border_value}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ShearX(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map horizontally.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing Shear and should be in
+            range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for the horizontal shear.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for the horizontal shear.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the horizontal
+            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 90., \
+            f'min_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {min_mag}.'
+        assert 0. <= max_mag <= 90., \
+            f'max_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearX."""
+        return np.array([[1, mag, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image horizontally."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks horizontally."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map horizontally."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class ShearY(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map vertically.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing ShearY and should be in
+            range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for the vertical shear.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for the vertical shear.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the vertical
+            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 90., \
+            f'min_mag angle for ShearY should be ' \
+            f'in range [0, 90], got {min_mag}.'
+        assert 0. <= max_mag <= 90., \
+            f'max_mag angle for ShearY should be ' \
+            f'in range [0, 90], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearY."""
+        return np.array([[1, 0, 0], [mag, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image vertically."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks vertically."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map vertically."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class Rotate(GeomTransform):
+    """Rotate the images, bboxes, masks and segmentation map.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The maximum angle for rotation.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for rotation.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the rotation
+            magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 180., \
+            f'min_mag for Rotate should be in range [0,180], got {min_mag}.'
+        assert 0. <= max_mag <= 180., \
+            f'max_mag for Rotate should be in range [0,180], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for Rotate."""
+        img_shape = results['img_shape']
+        center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5)
+        cv2_rotation_matrix = cv2.getRotationMatrix2D(center, -mag, 1.0)
+        return np.concatenate(
+            [cv2_rotation_matrix,
+             np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Rotate the image."""
+        results['img'] = mmcv.imrotate(
+            results['img'],
+            mag,
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Rotate the masks."""
+        results['gt_masks'] = results['gt_masks'].rotate(
+            results['img_shape'],
+            mag,
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Rotate the segmentation map."""
+        results['gt_seg_map'] = mmcv.imrotate(
+            results['gt_seg_map'],
+            mag,
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class TranslateX(GeomTransform):
+    """Translate the images, bboxes, masks and segmentation map horizontally.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum pixel's offset ratio for horizontal
+            translation. Defaults to 0.0.
+        max_mag (float): The maximum pixel's offset ratio for horizontal
+            translation. Defaults to 0.1.
+        reversal_prob (float): The probability that reverses the horizontal
+            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateX."""
+        mag = int(results['img_shape'][1] * mag)
+        return np.array([[1, 0, mag], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate the masks horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate the segmentation map horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class TranslateY(GeomTransform):
+    """Translate the images, bboxes, masks and segmentation map vertically.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum pixel's offset ratio for vertical
+            translation. Defaults to 0.0.
+        max_mag (float): The maximum pixel's offset ratio for vertical
+            translation. Defaults to 0.1.
+        reversal_prob (float): The probability that reverses the vertical
+            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateY should be ' \
+            f'in range [0,1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateY should be ' \
+            f'in range [0,1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateY."""
+        mag = int(results['img_shape'][0] * mag)
+        return np.array([[1, 0, 0], [0, 1, mag], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate masks vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate segmentation map vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
--- a/mmdet/datasets/transforms/instaboost.py
+++ b/mmdet/datasets/transforms/instaboost.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class InstaBoost(BaseTransform):
+    r"""Data augmentation method in `InstaBoost: Boosting Instance
+    Segmentation Via Probability Map Guided Copy-Pasting
+    <https://arxiv.org/abs/1908.07801>`_.
+
+    Refer to https://github.com/GothicAi/Instaboost for implementation details.
+
+
+    Required Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Modified Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Args:
+        action_candidate (tuple): Action candidates. "normal", "horizontal", \
+            "vertical", "skip" are supported. Defaults to ('normal', \
+            'horizontal', 'skip').
+        action_prob (tuple): Corresponding action probabilities. Should be \
+            the same length as action_candidate. Defaults to (1, 0, 0).
+        scale (tuple): (min scale, max scale). Defaults to (0.8, 1.2).
+        dx (int): The maximum x-axis shift will be (instance width) / dx.
+            Defaults to 15.
+        dy (int): The maximum y-axis shift will be (instance height) / dy.
+            Defaults to 15.
+        theta (tuple): (min rotation degree, max rotation degree). \
+            Defaults to (-1, 1).
+        color_prob (float): Probability of images for color augmentation.
+            Defaults to 0.5.
+        hflag (bool): Whether to use heatmap guided. Defaults to False.
+        aug_ratio (float): Probability of applying this transformation. \
+            Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 action_candidate: tuple = ('normal', 'horizontal', 'skip'),
+                 action_prob: tuple = (1, 0, 0),
+                 scale: tuple = (0.8, 1.2),
+                 dx: int = 15,
+                 dy: int = 15,
+                 theta: tuple = (-1, 1),
+                 color_prob: float = 0.5,
+                 hflag: bool = False,
+                 aug_ratio: float = 0.5) -> None:
+
+        import matplotlib
+        import matplotlib.pyplot as plt
+        default_backend = plt.get_backend()
+
+        try:
+            import instaboostfast as instaboost
+        except ImportError:
+            raise ImportError(
+                'Please run "pip install instaboostfast" '
+                'to install instaboostfast first for instaboost augmentation.')
+
+        # instaboost will modify the default backend
+        # and cause visualization to fail.
+        matplotlib.use(default_backend)
+
+        self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob,
+                                               scale, dx, dy, theta,
+                                               color_prob, hflag)
+        self.aug_ratio = aug_ratio
+
+    def _load_anns(self, results: dict) -> Tuple[list, list]:
+        """Convert raw anns to instaboost expected input format."""
+        anns = []
+        ignore_anns = []
+        for instance in results['instances']:
+            label = instance['bbox_label']
+            bbox = instance['bbox']
+            mask = instance['mask']
+            x1, y1, x2, y2 = bbox
+            # assert (x2 - x1) >= 1 and (y2 - y1) >= 1
+            bbox = [x1, y1, x2 - x1, y2 - y1]
+
+            if instance['ignore_flag'] == 0:
+                anns.append({
+                    'category_id': label,
+                    'segmentation': mask,
+                    'bbox': bbox
+                })
+            else:
+                # Ignore instances without data augmentation
+                ignore_anns.append(instance)
+        return anns, ignore_anns
+
+    def _parse_anns(self, results: dict, anns: list, ignore_anns: list,
+                    img: np.ndarray) -> dict:
+        """Restore the result of instaboost processing to the original anns
+        format."""
+        instances = []
+        for ann in anns:
+            x1, y1, w, h = ann['bbox']
+            # TODO: more essential bug need to be fixed in instaboost
+            if w <= 0 or h <= 0:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            instances.append(
+                dict(
+                    bbox=bbox,
+                    bbox_label=ann['category_id'],
+                    mask=ann['segmentation'],
+                    ignore_flag=0))
+
+        instances.extend(ignore_anns)
+        results['img'] = img
+        results['instances'] = instances
+        return results
+
+    def transform(self, results) -> dict:
+        """The transform function."""
+        img = results['img']
+        ori_type = img.dtype
+        if 'instances' not in results or len(results['instances']) == 0:
+            return results
+
+        anns, ignore_anns = self._load_anns(results)
+        if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]):
+            try:
+                import instaboostfast as instaboost
+            except ImportError:
+                raise ImportError('Please run "pip install instaboostfast" '
+                                  'to install instaboostfast first.')
+            anns, img = instaboost.get_new_data(
+                anns, img.astype(np.uint8), self.cfg, background=None)
+
+        results = self._parse_anns(results, anns, ignore_anns,
+                                   img.astype(ori_type))
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(aug_ratio={self.aug_ratio})'
+        return repr_str