"docs/git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "2e4babdb0a19fd3bd5a863cba083d36c17506b40"
Commit 0d5233a3, authored by Kai Chen, committed via GitHub

Make data pre-processing pipeline customizable (#935)

* define data pipelines

* update two config files

* minor fix for config files

* allow img_scale to be optional and update config

* add some docstrings

* add extra aug to transform

* bug fix for mask resizing

* fix cropping

* add faster rcnn example

* fix imports

* fix robustness testing

* add img_norm_cfg to img_meta

* fix the inference api with the new data pipeline

* fix proposal loading

* delete args of DefaultFormatBundle

* add more configs

* update configs

* bug fix

* add a brief doc

* update gt_labels in RandomCrop

* fix key error for new apis

* bug fix for masks of crowd bboxes

* add argument data_root

* minor fix

* update new hrnet configs

* update docs

* rename MultiscaleFlipAug to MultiScaleFlipAug

* add __repr__ for all transforms

* move DATA_PIPELINE.md to docs/

* fix image url
Parent commit: 7bb38af4
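With this change, each dataset is configured with a `pipeline` list, and every entry is a config dict resolved through the `PIPELINES` registry defined below. A minimal sketch of a training pipeline in this style, mirroring the Faster R-CNN example this PR adds (the scale and normalization values here are illustrative placeholders, not copied from a shipped config):

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]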
# mmdet/datasets/pipelines/__init__.py
from .compose import Compose
from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor,
                        Transpose, to_tensor)
from .loading import LoadAnnotations, LoadImageFromFile, LoadProposals
from .test_aug import MultiScaleFlipAug
from .transforms import (Expand, MinIoURandomCrop, Normalize, Pad,
                         PhotoMetricDistortion, RandomCrop, RandomFlip, Resize,
                         SegResizeFlipPadRescale)

__all__ = [
    'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
    'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile',
    'LoadProposals', 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad',
    'RandomCrop', 'Normalize', 'SegResizeFlipPadRescale', 'MinIoURandomCrop',
    'Expand', 'PhotoMetricDistortion'
]
# mmdet/datasets/pipelines/compose.py
import collections

from mmdet.utils import build_from_cfg
from ..registry import PIPELINES


@PIPELINES.register_module
class Compose(object):

    def __init__(self, transforms):
        assert isinstance(transforms, collections.abc.Sequence)
        self.transforms = []
        for transform in transforms:
            if isinstance(transform, dict):
                transform = build_from_cfg(transform, PIPELINES)
                self.transforms.append(transform)
            elif callable(transform):
                self.transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict')

    def __call__(self, data):
        for t in self.transforms:
            data = t(data)
            if data is None:
                return None
        return data

    def __repr__(self):
        format_string = self.__class__.__name__ + '('
        for t in self.transforms:
            format_string += '\n'
            format_string += '    {0}'.format(t)
        format_string += '\n)'
        return format_string
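A quick usage sketch, assuming mmdet and its registry are importable: Compose accepts registered transform configs and plain callables interchangeably, and a transform that returns None aborts the whole pipeline for that sample.

# hypothetical example; 'demo.jpg' and the prefix are placeholders
pipeline = Compose([
    dict(type='LoadImageFromFile'),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    lambda results: results,  # any callable also works
])
results = pipeline(dict(
    img_prefix='data', img_info=dict(filename='demo.jpg'),
    bbox_fields=[], mask_fields=[]))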
# mmdet/datasets/pipelines/formating.py
from collections.abc import Sequence

import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC

from ..registry import PIPELINES


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.
    """
    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not mmcv.is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError('type {} cannot be converted to tensor.'.format(
            type(data)))


@PIPELINES.register_module
class ToTensor(object):

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        for key in self.keys:
            results[key] = to_tensor(results[key])
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(keys={})'.format(self.keys)


@PIPELINES.register_module
class ImageToTensor(object):

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        for key in self.keys:
            # convert (H, W, C) images to (C, H, W) tensors
            results[key] = to_tensor(results[key].transpose(2, 0, 1))
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(keys={})'.format(self.keys)


@PIPELINES.register_module
class Transpose(object):

    def __init__(self, keys, order):
        self.keys = keys
        self.order = order

    def __call__(self, results):
        for key in self.keys:
            results[key] = results[key].transpose(self.order)
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(keys={}, order={})'.format(
            self.keys, self.order)


@PIPELINES.register_module
class ToDataContainer(object):

    def __init__(self,
                 fields=(dict(key='img', stack=True), dict(key='gt_bboxes'),
                         dict(key='gt_labels'))):
        self.fields = fields

    def __call__(self, results):
        for field in self.fields:
            field = field.copy()
            key = field.pop('key')
            results[key] = DC(results[key], **field)
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(fields={})'.format(self.fields)


@PIPELINES.register_module
class DefaultFormatBundle(object):
    """Default formatting bundle.

    It simplifies the pipeline of formatting common fields, including "img",
    "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
    These fields are formatted as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - proposals: (1) to tensor, (2) to DataContainer
    - gt_bboxes: (1) to tensor, (2) to DataContainer
    - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer
    - gt_labels: (1) to tensor, (2) to DataContainer
    - gt_masks: (1) to DataContainer (cpu_only=True)
    - gt_semantic_seg: (1) unsqueeze dim-0, (2) to tensor,
      (3) to DataContainer (stack=True)
    """

    def __call__(self, results):
        if 'img' in results:
            img = np.ascontiguousarray(results['img'].transpose(2, 0, 1))
            results['img'] = DC(to_tensor(img), stack=True)
        for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels']:
            if key not in results:
                continue
            results[key] = DC(to_tensor(results[key]))
        if 'gt_masks' in results:
            results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
        if 'gt_semantic_seg' in results:
            results['gt_semantic_seg'] = DC(
                to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
        return results

    def __repr__(self):
        return self.__class__.__name__


@PIPELINES.register_module
class Collect(object):

    def __init__(self,
                 keys,
                 meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                            'scale_factor', 'flip', 'img_norm_cfg')):
        self.keys = keys
        self.meta_keys = meta_keys

    def __call__(self, results):
        data = {}
        img_meta = {}
        for key in self.meta_keys:
            img_meta[key] = results[key]
        # meta info is wrapped in a cpu-only DataContainer so that it is
        # never scattered to GPUs
        data['img_meta'] = DC(img_meta, cpu_only=True)
        for key in self.keys:
            data[key] = results[key]
        return data

    def __repr__(self):
        return self.__class__.__name__ + '(keys={}, meta_keys={})'.format(
            self.keys, self.meta_keys)
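To see what these formatting steps produce, here is a small self-contained sketch: dummy arrays, with meta values chosen only so that Collect's default meta_keys are all present.

import numpy as np
results = dict(
    img=np.zeros((4, 4, 3), dtype=np.float32),
    gt_bboxes=np.array([[0., 0., 2., 2.]], dtype=np.float32),
    gt_labels=np.array([1], dtype=np.int64),
    filename='demo.jpg', ori_shape=(4, 4, 3), img_shape=(4, 4, 3),
    pad_shape=(4, 4, 3), scale_factor=1.0, flip=False,
    img_norm_cfg=dict(mean=0, std=1, to_rgb=False))
data = Collect(keys=['img', 'gt_bboxes', 'gt_labels'])(
    DefaultFormatBundle()(results))
# data['img'] is now a stacked DataContainer holding a (3, 4, 4) tensor,
# and data['img_meta'] is a cpu_only DataContainer with the meta dict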
# mmdet/datasets/pipelines/loading.py
import os.path as osp
import warnings

import mmcv
import numpy as np
import pycocotools.mask as maskUtils

from ..registry import PIPELINES


@PIPELINES.register_module
class LoadImageFromFile(object):

    def __init__(self, to_float32=False):
        self.to_float32 = to_float32

    def __call__(self, results):
        filename = osp.join(results['img_prefix'],
                            results['img_info']['filename'])
        img = mmcv.imread(filename)
        if self.to_float32:
            img = img.astype(np.float32)
        results['filename'] = filename
        results['img'] = img
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(to_float32={})'.format(
            self.to_float32)


@PIPELINES.register_module
class LoadAnnotations(object):

    def __init__(self,
                 with_bbox=True,
                 with_label=True,
                 with_mask=False,
                 with_seg=False,
                 poly2mask=True,
                 skip_img_without_anno=True):
        self.with_bbox = with_bbox
        self.with_label = with_label
        self.with_mask = with_mask
        self.with_seg = with_seg
        self.poly2mask = poly2mask
        self.skip_img_without_anno = skip_img_without_anno

    def _load_bboxes(self, results):
        ann_info = results['ann_info']
        results['gt_bboxes'] = ann_info['bboxes']
        if len(results['gt_bboxes']) == 0 and self.skip_img_without_anno:
            file_path = osp.join(results['img_prefix'],
                                 results['img_info']['filename'])
            warnings.warn(
                'Skip the image "{}" that has no valid gt bbox'.format(
                    file_path))
            return None
        results['gt_bboxes_ignore'] = ann_info.get('bboxes_ignore', None)
        results['bbox_fields'].extend(['gt_bboxes', 'gt_bboxes_ignore'])
        return results

    def _load_labels(self, results):
        results['gt_labels'] = results['ann_info']['labels']
        return results

    def _poly2mask(self, mask_ann, img_h, img_w):
        if isinstance(mask_ann, list):
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
            rle = maskUtils.merge(rles)
        elif isinstance(mask_ann['counts'], list):
            # uncompressed RLE
            rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
        else:
            # already compressed RLE
            rle = mask_ann
        mask = maskUtils.decode(rle)
        return mask

    def _load_masks(self, results):
        h, w = results['img_info']['height'], results['img_info']['width']
        gt_masks = results['ann_info']['masks']
        if self.poly2mask:
            gt_masks = [self._poly2mask(mask, h, w) for mask in gt_masks]
        results['gt_masks'] = gt_masks
        results['mask_fields'].append('gt_masks')
        return results

    def _load_semantic_seg(self, results):
        results['gt_semantic_seg'] = mmcv.imread(
            osp.join(results['seg_prefix'], results['ann_info']['seg_map']),
            flag='unchanged').squeeze()
        return results

    def __call__(self, results):
        if self.with_bbox:
            results = self._load_bboxes(results)
            if results is None:
                return None
        if self.with_label:
            results = self._load_labels(results)
        if self.with_mask:
            results = self._load_masks(results)
        if self.with_seg:
            results = self._load_semantic_seg(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += ('(with_bbox={}, with_label={}, with_mask={},'
                     ' with_seg={})').format(self.with_bbox, self.with_label,
                                             self.with_mask, self.with_seg)
        return repr_str


@PIPELINES.register_module
class LoadProposals(object):

    def __init__(self, num_max_proposals=None):
        self.num_max_proposals = num_max_proposals

    def __call__(self, results):
        proposals = results['proposals']
        if proposals.shape[1] not in (4, 5):
            raise AssertionError(
                'proposals should have shapes (n, 4) or (n, 5), '
                'but found {}'.format(proposals.shape))
        proposals = proposals[:, :4]
        if self.num_max_proposals is not None:
            proposals = proposals[:self.num_max_proposals]
        if len(proposals) == 0:
            # keep a 2-D (1, 4) placeholder so downstream bbox transforms
            # that index with [:, ...] do not break
            proposals = np.array([[0, 0, 0, 0]], dtype=np.float32)
        results['proposals'] = proposals
        results['bbox_fields'].append('proposals')
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(num_max_proposals={})'.format(
            self.num_max_proposals)
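The loaders assume the dataset seeds `results` with `img_prefix`, `img_info` and `ann_info`, plus empty `bbox_fields`/`mask_fields` lists. A hypothetical seed dict, with a placeholder path and a single dummy box:

import numpy as np
results = dict(
    img_prefix='data/coco/train2017',  # placeholder path
    img_info=dict(filename='demo.jpg', height=480, width=640),
    ann_info=dict(
        bboxes=np.array([[10., 10., 50., 50.]], dtype=np.float32),
        labels=np.array([1], dtype=np.int64)),
    bbox_fields=[], mask_fields=[])
results = LoadAnnotations(with_bbox=True)(LoadImageFromFile()(results))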
# mmdet/datasets/pipelines/test_aug.py
import mmcv

from ..registry import PIPELINES
from .compose import Compose


@PIPELINES.register_module
class MultiScaleFlipAug(object):

    def __init__(self, transforms, img_scale, flip=False):
        self.transforms = Compose(transforms)
        self.img_scale = img_scale if isinstance(img_scale,
                                                 list) else [img_scale]
        assert mmcv.is_list_of(self.img_scale, tuple)
        self.flip = flip

    def __call__(self, results):
        aug_data = []
        flip_aug = [False, True] if self.flip else [False]
        for scale in self.img_scale:
            for flip in flip_aug:
                _results = results.copy()
                _results['scale'] = scale
                _results['flip'] = flip
                data = self.transforms(_results)
                aug_data.append(data)
        # list of dict to dict of list
        aug_data_dict = {key: [] for key in aug_data[0]}
        for data in aug_data:
            for key, val in data.items():
                aug_data_dict[key].append(val)
        return aug_data_dict

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(transforms={}, img_scale={}, flip={})'.format(
            self.transforms, self.img_scale, self.flip)
        return repr_str
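A test-time pipeline wraps the per-scale transforms inside MultiScaleFlipAug; the inner Resize and RandomFlip then pick up the injected 'scale' and 'flip' keys instead of sampling their own. A sketch (scale value illustrative; img_norm_cfg as defined earlier):

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]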
# mmdet/datasets/pipelines/transforms.py
import mmcv
import numpy as np
from imagecorruptions import corrupt
from numpy import random

from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from ..registry import PIPELINES


@PIPELINES.register_module
class Resize(object):
    """Resize images & bbox & mask.

    This transform resizes the input image to some scale. Bboxes and masks
    are then resized with the same scale factor. If the input dict contains
    the key "scale", then the scale in the input dict is used, otherwise the
    specified scale in the init method is used.

    `img_scale` can either be a tuple (single-scale) or a list of tuples
    (multi-scale). There are 3 multiscale modes:

    - `ratio_range` is not None: randomly sample a ratio from the ratio range
      and multiply it with the image scale.
    - `ratio_range` is None and `multiscale_mode` == "range": randomly sample
      a scale from a range.
    - `ratio_range` is None and `multiscale_mode` == "value": randomly sample
      a scale from multiple scales.

    Args:
        img_scale (tuple or list[tuple]): Image scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True):
        if img_scale is None:
            self.img_scale = None
        else:
            if isinstance(img_scale, list):
                self.img_scale = img_scale
            else:
                self.img_scale = [img_scale]
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is not None:
            # mode 1: given a scale and a range of image ratio
            assert len(self.img_scale) == 1
        else:
            # mode 2: given multiple scales or a range of scales
            assert multiscale_mode in ['value', 'range']

        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio

    @staticmethod
    def random_select(img_scales):
        assert mmcv.is_list_of(img_scales, tuple)
        scale_idx = np.random.randint(len(img_scales))
        img_scale = img_scales[scale_idx]
        return img_scale, scale_idx

    @staticmethod
    def random_sample(img_scales):
        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        img_scale_long = [max(s) for s in img_scales]
        img_scale_short = [min(s) for s in img_scales]
        long_edge = np.random.randint(
            min(img_scale_long),
            max(img_scale_long) + 1)
        short_edge = np.random.randint(
            min(img_scale_short),
            max(img_scale_short) + 1)
        img_scale = (long_edge, short_edge)
        return img_scale, None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        min_ratio, max_ratio = ratio_range
        assert min_ratio <= max_ratio
        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
        return scale, None

    def _random_scale(self, results):
        if self.ratio_range is not None:
            scale, scale_idx = self.random_sample_ratio(
                self.img_scale[0], self.ratio_range)
        elif len(self.img_scale) == 1:
            scale, scale_idx = self.img_scale[0], 0
        elif self.multiscale_mode == 'range':
            scale, scale_idx = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            scale, scale_idx = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'] = scale
        results['scale_idx'] = scale_idx

    def _resize_img(self, results):
        if self.keep_ratio:
            img, scale_factor = mmcv.imrescale(
                results['img'], results['scale'], return_scale=True)
        else:
            img, w_scale, h_scale = mmcv.imresize(
                results['img'], results['scale'], return_scale=True)
            scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                    dtype=np.float32)
        results['img'] = img
        results['img_shape'] = img.shape
        results['pad_shape'] = img.shape  # in case that there is no padding
        results['scale_factor'] = scale_factor
        results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results):
        img_shape = results['img_shape']
        for key in results.get('bbox_fields', []):
            bboxes = results[key] * results['scale_factor']
            bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1] - 1)
            bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0] - 1)
            results[key] = bboxes

    def _resize_masks(self, results):
        for key in results.get('mask_fields', []):
            if results[key] is None:
                continue
            if self.keep_ratio:
                masks = [
                    mmcv.imrescale(
                        mask, results['scale_factor'], interpolation='nearest')
                    for mask in results[key]
                ]
            else:
                mask_size = (results['img_shape'][1], results['img_shape'][0])
                masks = [
                    mmcv.imresize(mask, mask_size, interpolation='nearest')
                    for mask in results[key]
                ]
            results[key] = masks

    def __call__(self, results):
        if 'scale' not in results:
            self._random_scale(results)
        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_masks(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += ('(img_scale={}, multiscale_mode={}, ratio_range={}, '
                     'keep_ratio={})').format(self.img_scale,
                                              self.multiscale_mode,
                                              self.ratio_range,
                                              self.keep_ratio)
        return repr_str
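The three multiscale modes from the docstring map onto configs like these (all values illustrative):

# mode 1: fixed base scale jittered by a sampled ratio
resize_ratio = dict(
    type='Resize', img_scale=(1333, 800), ratio_range=(0.8, 1.2))
# mode 2: sample any scale between the two endpoints
resize_range = dict(
    type='Resize', multiscale_mode='range',
    img_scale=[(1333, 640), (1333, 800)])
# mode 3: pick one of the listed scales
resize_value = dict(
    type='Resize', multiscale_mode='value',
    img_scale=[(1333, 672), (1333, 736), (1333, 800)])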
@PIPELINES.register_module
class RandomFlip(object):
    """Flip the image & bbox & mask.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    Args:
        flip_ratio (float, optional): The flipping probability.
    """

    def __init__(self, flip_ratio=None):
        self.flip_ratio = flip_ratio
        if flip_ratio is not None:
            assert flip_ratio >= 0 and flip_ratio <= 1

    def bbox_flip(self, bboxes, img_shape):
        """Flip bboxes horizontally.

        Args:
            bboxes (ndarray): shape (..., 4*k)
            img_shape (tuple): (height, width)
        """
        assert bboxes.shape[-1] % 4 == 0
        w = img_shape[1]
        flipped = bboxes.copy()
        flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
        flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
        return flipped

    def __call__(self, results):
        if 'flip' not in results:
            # guard against flip_ratio=None, which cannot be compared
            # against a random float
            flip = (self.flip_ratio is not None
                    and np.random.rand() < self.flip_ratio)
            results['flip'] = flip
        if results['flip']:
            # flip image
            results['img'] = mmcv.imflip(results['img'])
            # flip bboxes
            for key in results.get('bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'])
            # flip masks
            for key in results.get('mask_fields', []):
                results[key] = [mask[:, ::-1] for mask in results[key]]
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(flip_ratio={})'.format(
            self.flip_ratio)
@PIPELINES.register_module
class Pad(object):
    """Pad the image & mask.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        if self.size is not None:
            # pass pad_val in the fixed-size branch as well
            padded_img = mmcv.impad(results['img'], self.size, self.pad_val)
        elif self.size_divisor is not None:
            padded_img = mmcv.impad_to_multiple(
                results['img'], self.size_divisor, pad_val=self.pad_val)
        results['img'] = padded_img
        results['pad_shape'] = padded_img.shape
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def _pad_masks(self, results):
        pad_shape = results['pad_shape'][:2]
        for key in results.get('mask_fields', []):
            padded_masks = [
                mmcv.impad(mask, pad_shape, pad_val=self.pad_val)
                for mask in results[key]
            ]
            results[key] = np.stack(padded_masks, axis=0)

    def __call__(self, results):
        self._pad_img(results)
        self._pad_masks(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(size={}, size_divisor={}, pad_val={})'.format(
            self.size, self.size_divisor, self.pad_val)
        return repr_str
@PIPELINES.register_module
class Normalize(object):
    """Normalize the image.

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        results['img'] = mmcv.imnormalize(results['img'], self.mean, self.std,
                                          self.to_rgb)
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(mean={}, std={}, to_rgb={})'.format(
            self.mean, self.std, self.to_rgb)
        return repr_str
@PIPELINES.register_module
class RandomCrop(object):
    """Random crop the image & bboxes.

    Args:
        crop_size (tuple): Expected size after cropping, (h, w).
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, results):
        img = results['img']
        margin_h = max(img.shape[0] - self.crop_size[0], 0)
        margin_w = max(img.shape[1] - self.crop_size[1], 0)
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0]
        crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1]

        # crop the image
        img = img[crop_y1:crop_y2, crop_x1:crop_x2, :]
        img_shape = img.shape
        results['img'] = img
        results['img_shape'] = img_shape

        # crop bboxes accordingly and clip to the image boundary
        for key in results.get('bbox_fields', []):
            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                                   dtype=np.float32)
            bboxes = results[key] - bbox_offset
            bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1] - 1)
            bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0] - 1)
            results[key] = bboxes

        # filter out the gt bboxes that are completely cropped
        if 'gt_bboxes' in results:
            gt_bboxes = results['gt_bboxes']
            valid_inds = (gt_bboxes[:, 2] > gt_bboxes[:, 0]) & (
                gt_bboxes[:, 3] > gt_bboxes[:, 1])
            # if no gt bbox remains after cropping, just skip this image
            if not np.any(valid_inds):
                return None
            results['gt_bboxes'] = gt_bboxes[valid_inds, :]
            if 'gt_labels' in results:
                results['gt_labels'] = results['gt_labels'][valid_inds]

            # filter and crop the masks; iterate over the *indices* of the
            # valid boxes, not the boolean mask itself
            if 'gt_masks' in results:
                valid_gt_masks = []
                for i in np.where(valid_inds)[0]:
                    gt_mask = results['gt_masks'][i][crop_y1:crop_y2,
                                                     crop_x1:crop_x2]
                    valid_gt_masks.append(gt_mask)
                results['gt_masks'] = valid_gt_masks

        return results

    def __repr__(self):
        return self.__class__.__name__ + '(crop_size={})'.format(
            self.crop_size)
@PIPELINES.register_module
class SegResizeFlipPadRescale(object):
    """A sequence of transforms for semantic segmentation maps.

    The same pipeline as the input image is applied to the semantic
    segmentation map, which is finally rescaled by some scale factor. The
    transforms include:

    1. resize
    2. flip
    3. pad
    4. rescale (so that the final size can be different from the image size)

    Args:
        scale_factor (float): The scale factor of the final output.
    """

    def __init__(self, scale_factor=1):
        self.scale_factor = scale_factor

    def __call__(self, results):
        if results['keep_ratio']:
            gt_seg = mmcv.imrescale(
                results['gt_semantic_seg'],
                results['scale'],
                interpolation='nearest')
        else:
            gt_seg = mmcv.imresize(
                results['gt_semantic_seg'],
                results['scale'],
                interpolation='nearest')
        if results['flip']:
            gt_seg = mmcv.imflip(gt_seg)
        # compare against the spatial part of pad_shape only; pad_shape
        # carries the channel dimension as well
        if gt_seg.shape != results['pad_shape'][:2]:
            gt_seg = mmcv.impad(gt_seg, results['pad_shape'][:2])
        if self.scale_factor != 1:
            gt_seg = mmcv.imrescale(
                gt_seg, self.scale_factor, interpolation='nearest')
        results['gt_semantic_seg'] = gt_seg
        return results

    def __repr__(self):
        return self.__class__.__name__ + '(scale_factor={})'.format(
            self.scale_factor)
@PIPELINES.register_module
class PhotoMetricDistortion(object):
    """Apply photometric distortion to an image sequentially; every
    transformation is applied with a probability of 0.5. Random contrast is
    applied either second or second to last.

    1. random brightness
    2. random contrast (mode 0)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 1)
    8. randomly swap channels

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def __call__(self, results):
        # the in-place arithmetic below assumes a float image
        # (use LoadImageFromFile(to_float32=True) upstream)
        img = results['img']
        # random brightness
        if random.randint(2):
            delta = random.uniform(-self.brightness_delta,
                                   self.brightness_delta)
            img += delta

        # mode == 0 --> do random contrast first
        # mode == 1 --> do random contrast last
        mode = random.randint(2)
        if mode == 1:
            if random.randint(2):
                alpha = random.uniform(self.contrast_lower,
                                       self.contrast_upper)
                img *= alpha

        # convert color from BGR to HSV
        img = mmcv.bgr2hsv(img)

        # random saturation
        if random.randint(2):
            img[..., 1] *= random.uniform(self.saturation_lower,
                                          self.saturation_upper)

        # random hue
        if random.randint(2):
            img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
            img[..., 0][img[..., 0] > 360] -= 360
            img[..., 0][img[..., 0] < 0] += 360

        # convert color from HSV to BGR
        img = mmcv.hsv2bgr(img)

        # random contrast
        if mode == 0:
            if random.randint(2):
                alpha = random.uniform(self.contrast_lower,
                                       self.contrast_upper)
                img *= alpha

        # randomly swap channels
        if random.randint(2):
            img = img[..., random.permutation(3)]

        results['img'] = img
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        # format from the stored lower/upper bounds; the ranges themselves
        # are not kept as attributes
        repr_str += ('(brightness_delta={}, contrast_range=({}, {}), '
                     'saturation_range=({}, {}), hue_delta={})').format(
                         self.brightness_delta, self.contrast_lower,
                         self.contrast_upper, self.saturation_lower,
                         self.saturation_upper, self.hue_delta)
        return repr_str
@PIPELINES.register_module
class Expand(object):
    """Randomly expand the image & bboxes.

    Randomly place the original image on a canvas of 'ratio' x original image
    size filled with mean values. The ratio is sampled from ratio_range.

    Args:
        mean (tuple): mean value of the dataset.
        to_rgb (bool): whether to reverse the order of mean to align with
            RGB images.
        ratio_range (tuple): range of the expand ratio.
    """

    def __init__(self, mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)):
        if to_rgb:
            self.mean = mean[::-1]
        else:
            self.mean = mean
        # keep the raw args around so that __repr__ can report them
        self.to_rgb = to_rgb
        self.ratio_range = ratio_range
        self.min_ratio, self.max_ratio = ratio_range

    def __call__(self, results):
        if random.randint(2):
            return results

        img, boxes = [results[k] for k in ('img', 'gt_bboxes')]
        h, w, c = img.shape
        ratio = random.uniform(self.min_ratio, self.max_ratio)
        expand_img = np.full((int(h * ratio), int(w * ratio), c),
                             self.mean).astype(img.dtype)
        left = int(random.uniform(0, w * ratio - w))
        top = int(random.uniform(0, h * ratio - h))
        expand_img[top:top + h, left:left + w] = img
        boxes += np.tile((left, top), 2)

        results['img'] = expand_img
        results['gt_bboxes'] = boxes
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(mean={}, to_rgb={}, ratio_range={})'.format(
            self.mean, self.to_rgb, self.ratio_range)
        return repr_str
@PIPELINES.register_module
class MinIoURandomCrop(object):
    """Random crop the image & bboxes; cropped patches must satisfy a minimum
    IoU requirement with the original bboxes. The IoU threshold is randomly
    selected from min_ious.

    Args:
        min_ious (tuple): minimum IoU thresholds to sample from.
        min_crop_size (float): minimum crop size as a fraction of the image
            (i.e. h, w := a * h, a * w, where a >= min_crop_size).
    """

    def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3):
        self.min_ious = min_ious
        # mode 1: return the original image untouched
        self.sample_mode = (1, *min_ious, 0)
        self.min_crop_size = min_crop_size

    def __call__(self, results):
        img, boxes, labels = [
            results[k] for k in ('img', 'gt_bboxes', 'gt_labels')
        ]
        h, w, c = img.shape
        while True:
            mode = random.choice(self.sample_mode)
            if mode == 1:
                return results

            min_iou = mode
            for i in range(50):
                new_w = random.uniform(self.min_crop_size * w, w)
                new_h = random.uniform(self.min_crop_size * h, h)

                # h / w must be in [0.5, 2]
                if new_h / new_w < 0.5 or new_h / new_w > 2:
                    continue

                left = random.uniform(w - new_w)
                top = random.uniform(h - new_h)

                patch = np.array(
                    (int(left), int(top), int(left + new_w), int(top + new_h)))
                overlaps = bbox_overlaps(
                    patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
                if overlaps.min() < min_iou:
                    continue

                # the centers of the boxes should be inside the cropped image
                center = (boxes[:, :2] + boxes[:, 2:]) / 2
                mask = (center[:, 0] > patch[0]) * (
                    center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (
                        center[:, 1] < patch[3])
                if not mask.any():
                    continue
                boxes = boxes[mask]
                labels = labels[mask]

                # adjust boxes
                img = img[patch[1]:patch[3], patch[0]:patch[2]]
                boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                boxes -= np.tile(patch[:2], 2)

                results['img'] = img
                results['gt_bboxes'] = boxes
                results['gt_labels'] = labels
                return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(min_ious={}, min_crop_size={})'.format(
            self.min_ious, self.min_crop_size)
        return repr_str
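PhotoMetricDistortion, Expand and MinIoURandomCrop together make up the SSD-style "extra aug" mentioned in the commit list. A hypothetical sketch of how they might be chained in a pipeline (values illustrative; img_norm_cfg as defined earlier):

ssd_augs = [
    dict(type='PhotoMetricDistortion',
         brightness_delta=32, contrast_range=(0.5, 1.5),
         saturation_range=(0.5, 1.5), hue_delta=18),
    dict(type='Expand', mean=img_norm_cfg['mean'],
         to_rgb=img_norm_cfg['to_rgb'], ratio_range=(1, 4)),
    dict(type='MinIoURandomCrop',
         min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3),
]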
@PIPELINES.register_module
class Corrupt(object):

    def __init__(self, corruption, severity=1):
        self.corruption = corruption
        self.severity = severity

    def __call__(self, results):
        results['img'] = corrupt(
            results['img'].astype(np.uint8),
            corruption_name=self.corruption,
            severity=self.severity)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(corruption={}, severity={})'.format(
            self.corruption, self.severity)
        return repr_str
# mmdet/datasets/registry.py
from mmdet.utils import Registry

DATASETS = Registry('dataset')
PIPELINES = Registry('pipeline')
# mmdet/datasets/utils.py
from collections.abc import Sequence

import matplotlib.pyplot as plt
import mmcv
import numpy as np
import torch


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.
    """
    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not mmcv.is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError('type {} cannot be converted to tensor.'.format(
            type(data)))


def random_scale(img_scales, mode='range'):
    """Randomly select a scale from a list of scales or scale ranges.

    Args:
        img_scales (list[tuple]): Image scale or scale range.
        mode (str): "range" or "value".

    Returns:
        tuple: Sampled image scale.
    """
    num_scales = len(img_scales)
    if num_scales == 1:  # fixed scale is specified
        img_scale = img_scales[0]
    elif num_scales == 2:  # randomly sample a scale
        if mode == 'range':
            img_scale_long = [max(s) for s in img_scales]
            img_scale_short = [min(s) for s in img_scales]
            long_edge = np.random.randint(
                min(img_scale_long),
                max(img_scale_long) + 1)
            short_edge = np.random.randint(
                min(img_scale_short),
                max(img_scale_short) + 1)
            img_scale = (long_edge, short_edge)
        elif mode == 'value':
            img_scale = img_scales[np.random.randint(num_scales)]
    else:
        if mode != 'value':
            raise ValueError(
                'Only "value" mode supports more than 2 image scales')
        img_scale = img_scales[np.random.randint(num_scales)]
    return img_scale


def show_ann(coco, img, ann_info):
    plt.imshow(mmcv.bgr2rgb(img))
    plt.axis('off')
    coco.showAnns(ann_info)
    plt.show()
The detector APIs are updated to match: show_result no longer takes an explicit img_norm_cfg argument and instead reads the normalization config that Collect now stores in each image's meta. In BaseDetector:

@@ -87,12 +87,7 @@ class BaseDetector(nn.Module):
         else:
             return self.forward_test(img, img_meta, **kwargs)

-    def show_result(self,
-                    data,
-                    result,
-                    img_norm_cfg,
-                    dataset=None,
-                    score_thr=0.3):
+    def show_result(self, data, result, dataset=None, score_thr=0.3):
         if isinstance(result, tuple):
             bbox_result, segm_result = result
         else:
@@ -100,7 +95,7 @@ class BaseDetector(nn.Module):
         img_tensor = data['img'][0]
         img_metas = data['img_meta'][0].data[0]
-        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
         assert len(imgs) == len(img_metas)
         if dataset is None:
@@ -402,7 +402,7 @@ class CascadeRCNN(BaseDetector, RPNTestMixin):
     def aug_test(self, img, img_meta, proposals=None, rescale=False):
         raise NotImplementedError

-    def show_result(self, data, result, img_norm_cfg, **kwargs):
+    def show_result(self, data, result, **kwargs):
         if self.with_mask:
             ms_bbox_result, ms_segm_result = result
             if isinstance(ms_bbox_result, dict):
@@ -411,5 +411,4 @@ class CascadeRCNN(BaseDetector, RPNTestMixin):
         else:
             if isinstance(result, dict):
                 result = result['ensemble']
-        super(CascadeRCNN, self).show_result(data, result, img_norm_cfg,
-                                             **kwargs)
+        super(CascadeRCNN, self).show_result(data, result, **kwargs)
@@ -81,7 +81,7 @@ class RPN(BaseDetector, RPNTestMixin):
         # TODO: remove this restriction
         return proposal_list[0].cpu().numpy()

-    def show_result(self, data, result, img_norm_cfg, dataset=None, top_k=20):
+    def show_result(self, data, result, dataset=None, top_k=20):
         """Show RPN proposals on the image.

         Although we assume batch size is 1, this method supports arbitrary
@@ -89,7 +89,7 @@ class RPN(BaseDetector, RPNTestMixin):
         """
         img_tensor = data['img'][0]
         img_metas = data['img_meta'][0].data[0]
-        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
         assert len(imgs) == len(img_metas)
         for img, img_meta in zip(imgs, img_metas):
             h, w, _ = img_meta['img_shape']
build_from_cfg is also fixed: the resolved class is kept in a separate obj_cls variable so that error messages still report the original type string, and plain classes (not just registered name strings) are now accepted.

@@ -61,14 +61,16 @@ def build_from_cfg(cfg, registry, default_args=None):
     args = cfg.copy()
     obj_type = args.pop('type')
     if mmcv.is_str(obj_type):
-        obj_type = registry.get(obj_type)
-        if obj_type is None:
+        obj_cls = registry.get(obj_type)
+        if obj_cls is None:
             raise KeyError('{} is not in the {} registry'.format(
                 obj_type, registry.name))
-    elif not inspect.isclass(obj_type):
+    elif inspect.isclass(obj_type):
+        obj_cls = obj_type
+    else:
         raise TypeError('type must be a str or valid type, but got {}'.format(
             type(obj_type)))
     if default_args is not None:
         for name, value in default_args.items():
             args.setdefault(name, value)
-    return obj_type(**args)
+    return obj_cls(**args)
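With that fix in place, a config dict can name a registered class by string or pass the class itself; both resolve to obj_cls. A minimal sketch (import paths follow the module layout shown above):

from mmdet.utils import build_from_cfg
from mmdet.datasets.registry import PIPELINES

resize = build_from_cfg(
    dict(type='Resize', img_scale=(1333, 800)), PIPELINES)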
@@ -27,7 +27,7 @@ def single_gpu_test(model, data_loader, show=False):
         results.append(result)

         if show:
-            model.module.show_result(data, result, dataset.img_norm_cfg)
+            model.module.show_result(data, result)

         batch_size = data['img'][0].size(0)
         for _ in range(batch_size):
Robustness testing now injects the new Corrupt transform into the test pipeline instead of setting corruption keys on the dataset config:

 import argparse
+import copy
 import os
 import os.path as osp
 import shutil
@@ -350,13 +351,15 @@ def main():
             continue

         # assign corruption and severity
-        if corruption_severity == 0:
-            # evaluate without corruptions for severity = 0
-            cfg.data.test['corruption'] = None
-            cfg.data.test['corruption_severity'] = 0
-        else:
-            cfg.data.test['corruption'] = corruption
-            cfg.data.test['corruption_severity'] = corruption_severity
+        if corruption_severity > 0:
+            test_data_cfg = copy.deepcopy(cfg.data.test)
+            corruption_trans = dict(
+                type='Corrupt',
+                corruption=corruption,
+                severity=corruption_severity)
+            # TODO: hard coded "1", we assume that the first step is
+            # loading images, which needs to be fixed in the future
+            test_data_cfg['pipeline'].insert(1, corruption_trans)

         # print info
         print('\nTesting {} at severity {}'.format(corruption,