Commit 108fc9e1 authored by Kai Chen

set up the codebase skeleton (WIP)

parent 6985ef31
import torch
from mmcv.ops import nms
import numpy as np
from ..bbox_ops import bbox_mapping_back
def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
"""Merge augmented proposals (multiscale, flip, etc.)
Args:
aug_proposals (list[Tensor]): proposals from different testing
schemes, shape (n, 5). Note that they are not rescaled to the
original image size.
img_metas (list[dict]): image info including "shape_scale" and "flip".
rpn_test_cfg (dict): rpn test config.
Returns:
        Tensor: shape (n, 5), merged proposals (boxes with scores)
            corresponding to the original image scale.
"""
recovered_proposals = []
for proposals, img_info in zip(aug_proposals, img_metas):
shape_scale = img_info['shape_scale'][0]
flip = img_info['flip'][0]
_proposals = proposals.clone()
_proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], shape_scale,
flip)
recovered_proposals.append(_proposals)
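    # merge all recovered proposals with NMS and keep at most
    # rpn_test_cfg.max_num of the highest-scoring survivors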
aug_proposals = torch.cat(recovered_proposals, dim=0)
    # mmcv.ops.nms takes boxes and scores separately and returns the kept
    # detections as a (k, 5) tensor of [x1, y1, x2, y2, score]
    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
                              aug_proposals[:, 4].contiguous(),
                              rpn_test_cfg.nms_thr)
scores = merged_proposals[:, 4]
_, order = scores.sort(0, descending=True)
num = min(rpn_test_cfg.max_num, merged_proposals.shape[0])
order = order[:num]
merged_proposals = merged_proposals[order, :]
return merged_proposals
def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
"""Merge augmented detection bboxes and scores.
Args:
aug_bboxes (list[Tensor]): shape (n, 4*#class)
aug_scores (list[Tensor] or None): shape (n, #class)
        img_metas (list[dict]): image info including "shape_scale" and "flip".
rcnn_test_cfg (dict): rcnn test config.
Returns:
tuple: (bboxes, scores)
"""
recovered_bboxes = []
for bboxes, img_info in zip(aug_bboxes, img_metas):
shape_scale = img_info['shape_scale'][0]
flip = img_info['flip'][0]
bboxes = bbox_mapping_back(bboxes, shape_scale, flip)
recovered_bboxes.append(bboxes)
bboxes = torch.stack(recovered_bboxes).mean(dim=0)
if aug_scores is None:
return bboxes
else:
scores = torch.stack(aug_scores).mean(dim=0)
return bboxes, scores
def merge_aug_scores(aug_scores):
"""Merge augmented bbox scores."""
if isinstance(aug_scores[0], torch.Tensor):
return torch.mean(torch.stack(aug_scores), dim=0)
else:
return np.mean(aug_scores, axis=0)
def merge_aug_masks(aug_masks, bboxes, img_metas, rcnn_test_cfg, weights=None):
"""Merge augmented mask prediction.
Args:
aug_masks (list[ndarray]): shape (n, #class, h, w)
        img_metas (list[dict]): image info including "flip".
        rcnn_test_cfg (dict): rcnn test config.
        weights (list, optional): per-prediction weights for averaging.
    Returns:
        ndarray: merged mask prediction.
"""
recovered_masks = [
mask if not img_info['flip'][0] else mask[..., ::-1]
for mask, img_info in zip(aug_masks, img_metas)
]
if weights is None:
merged_masks = np.mean(recovered_masks, axis=0)
else:
merged_masks = np.average(
np.array(recovered_masks), axis=0, weights=np.array(weights))
return merged_masks
from .anchor_target import anchor_target
from .bbox_target import bbox_target
from .mask_target import mask_target
__all__ = ['anchor_target', 'bbox_target', 'mask_target']
def anchor_target():
pass
from .coco import CocoDataset
from .collate import *
from .sampler import *
from .transforms import *
import os.path as osp
import mmcv
import numpy as np
from pycocotools.coco import COCO
from torch.utils.data import Dataset
from .transforms import (ImageTransform, BboxTransform, PolyMaskTransform,
Numpy2Tensor)
from .utils import show_ann, random_scale
from .utils import DataContainer as DC
def parse_ann_info(ann_info, cat2label, with_mask=True):
"""Parse bbox and mask annotation.
Args:
ann_info (list[dict]): Annotation info of an image.
cat2label (dict): The mapping from category ids to labels.
with_mask (bool): Whether to parse mask annotations.
Returns:
        dict: annotation info with keys "bboxes", "labels", "bboxes_ignore"
            and, if with_mask is True, "mask_polys" and "poly_lens".
"""
gt_bboxes = []
gt_labels = []
gt_bboxes_ignore = []
# each mask consists of one or several polys, each poly is a list of float.
if with_mask:
gt_mask_polys = []
gt_poly_lens = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
if ann['area'] <= 0 or w < 1 or h < 1:
continue
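        # convert COCO's [x, y, w, h] box to [x1, y1, x2, y2] with an
        # inclusive bottom-right corner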
bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
if ann['iscrowd']:
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(cat2label[ann['category_id']])
if with_mask:
# Note polys are not resized
mask_polys = [
p for p in ann['segmentation'] if len(p) >= 6
] # valid polygons have >= 3 points (6 coordinates)
poly_lens = [len(p) for p in mask_polys]
gt_mask_polys.append(mask_polys)
gt_poly_lens.extend(poly_lens)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
ann = dict(
bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
if with_mask:
ann['mask_polys'] = gt_mask_polys
ann['poly_lens'] = gt_poly_lens
return ann
class CocoDataset(Dataset):
def __init__(self,
ann_file,
img_prefix,
img_scale,
img_norm_cfg,
size_divisor=None,
proposal_file=None,
num_max_proposals=1000,
flip_ratio=0,
with_mask=True,
with_crowd=True,
with_label=True,
test_mode=False,
debug=False):
# path of the data file
self.coco = COCO(ann_file)
# filter images with no annotation during training
if not test_mode:
self.img_ids, self.img_infos = self._filter_imgs()
else:
self.img_ids = self.coco.getImgIds()
self.img_infos = [
self.coco.loadImgs(idx)[0] for idx in self.img_ids
]
assert len(self.img_ids) == len(self.img_infos)
# get the mapping from original category ids to labels
self.cat_ids = self.coco.getCatIds()
self.cat2label = {
cat_id: i + 1
for i, cat_id in enumerate(self.cat_ids)
}
# prefix of images path
self.img_prefix = img_prefix
# (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
self.img_scales = img_scale if isinstance(img_scale,
list) else [img_scale]
assert mmcv.is_list_of(self.img_scales, tuple)
# color channel order and normalize configs
self.img_norm_cfg = img_norm_cfg
# proposals
self.proposals = mmcv.load(
proposal_file) if proposal_file is not None else None
self.num_max_proposals = num_max_proposals
# flip ratio
self.flip_ratio = flip_ratio
assert flip_ratio >= 0 and flip_ratio <= 1
# padding border to ensure the image size can be divided by
# size_divisor (used for FPN)
self.size_divisor = size_divisor
# with crowd or not, False when using RetinaNet
self.with_crowd = with_crowd
# with mask or not
self.with_mask = with_mask
# with label is False for RPN
self.with_label = with_label
# in test mode or not
self.test_mode = test_mode
# debug mode or not
self.debug = debug
# set group flag for the sampler
self._set_group_flag()
# transforms
self.img_transform = ImageTransform(
size_divisor=self.size_divisor, **self.img_norm_cfg)
self.bbox_transform = BboxTransform()
self.mask_transform = PolyMaskTransform()
self.numpy2tensor = Numpy2Tensor()
def __len__(self):
return len(self.img_ids)
def _filter_imgs(self, min_size=32):
"""Filter images too small or without ground truths."""
img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()]))
valid_ids = []
img_infos = []
for i in img_ids:
info = self.coco.loadImgs(i)[0]
if min(info['width'], info['height']) >= min_size:
valid_ids.append(i)
img_infos.append(info)
return valid_ids, img_infos
def _load_ann_info(self, idx):
img_id = self.img_ids[idx]
ann_ids = self.coco.getAnnIds(imgIds=img_id)
ann_info = self.coco.loadAnns(ann_ids)
return ann_info
def _set_group_flag(self):
"""Set flag according to image aspect ratio.
Images with aspect ratio greater than 1 will be set as group 1,
otherwise group 0.
"""
self.flag = np.zeros(len(self.img_ids), dtype=np.uint8)
for i in range(len(self.img_ids)):
img_info = self.img_infos[i]
if img_info['width'] / img_info['height'] > 1:
self.flag[i] = 1
def _rand_another(self, idx):
pool = np.where(self.flag == self.flag[idx])[0]
return np.random.choice(pool)
def __getitem__(self, idx):
if self.test_mode:
return self.prepare_test_img(idx)
while True:
img_info = self.img_infos[idx]
ann_info = self._load_ann_info(idx)
# load image
img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
if self.debug:
show_ann(self.coco, img, ann_info)
# load proposals if necessary
if self.proposals is not None:
proposals = self.proposals[idx][:self.num_max_proposals, :4]
# TODO: Handle empty proposals properly. Currently images with
# no proposals are just ignored, but they can be used for
# training in concept.
if len(proposals) == 0:
idx = self._rand_another(idx)
continue
ann = parse_ann_info(ann_info, self.cat2label, self.with_mask)
gt_bboxes = ann['bboxes']
gt_labels = ann['labels']
gt_bboxes_ignore = ann['bboxes_ignore']
# skip the image if there is no valid gt bbox
if len(gt_bboxes) == 0:
idx = self._rand_another(idx)
continue
# apply transforms
            flip = np.random.rand() < self.flip_ratio
img_scale = random_scale(self.img_scales) # sample a scale
img, img_shape, scale_factor = self.img_transform(
img, img_scale, flip)
if self.proposals is not None:
proposals = self.bbox_transform(proposals, img_shape,
scale_factor, flip)
gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
flip)
gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
scale_factor, flip)
if self.with_mask:
gt_mask_polys, gt_poly_lens, num_polys_per_mask = \
self.mask_transform(
ann['mask_polys'], ann['poly_lens'],
img_info['height'], img_info['width'], flip)
ori_shape = (img_info['height'], img_info['width'])
img_meta = dict(
ori_shape=DC(ori_shape),
img_shape=DC(img_shape),
scale_factor=DC(scale_factor),
flip=DC(flip))
data = dict(
img=DC(img, stack=True),
img_meta=img_meta,
gt_bboxes=DC(gt_bboxes))
if self.proposals is not None:
data['proposals'] = DC(proposals)
if self.with_label:
data['gt_labels'] = DC(gt_labels)
if self.with_crowd:
data['gt_bboxes_ignore'] = DC(gt_bboxes_ignore)
if self.with_mask:
data['gt_mask_polys'] = DC(gt_mask_polys)
data['gt_poly_lens'] = DC(gt_poly_lens)
data['num_polys_per_mask'] = DC(num_polys_per_mask)
return data
    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping)"""
        img_info = self.img_infos[idx]
        img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
        proposal = (self.proposals[idx][:, :4]
                    if self.proposals is not None else None)
        def prepare_single(img, scale, flip, proposal=None):
            _img, img_shape, scale_factor = self.img_transform(
                img, scale, flip)
            # keep the (h, w, c, scale) convention expected by the aug-test
            # merging functions
            shape_scale = np.array(img_shape + (scale_factor, ),
                                   dtype=np.float32)
            _img, shape_scale = self.numpy2tensor(_img, shape_scale)
            img_meta = dict(shape_scale=shape_scale, flip=flip)
            if proposal is not None:
                proposal = self.bbox_transform(proposal, img_shape,
                                               scale_factor, flip)
                proposal = self.numpy2tensor(proposal)
            return _img, img_meta, proposal
        imgs = []
        img_metas = []
        proposals = []
        for scale in self.img_scales:
            _img, img_meta, prop = prepare_single(img, scale, False, proposal)
            imgs.append(_img)
            img_metas.append(img_meta)
            proposals.append(prop)
            if self.flip_ratio > 0:
                _img, img_meta, prop = prepare_single(img, scale, True,
                                                      proposal)
                imgs.append(_img)
                img_metas.append(img_meta)
                proposals.append(prop)
        if self.proposals is None:
            return imgs, img_metas
        else:
            return imgs, img_metas, proposals
import collections
import torch
import torch.nn.functional as F
from torch.utils.data.dataloader import default_collate
from .utils import DataContainer
# https://github.com/pytorch/pytorch/issues/973
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
__all__ = ['collate']
def collate(batch, samples_per_gpu=1):
if not isinstance(batch, collections.Sequence):
        raise TypeError("{} is not supported.".format(type(batch)))
if isinstance(batch[0], DataContainer):
assert len(batch) % samples_per_gpu == 0
stacked = []
if batch[0].stack:
for i in range(0, len(batch), samples_per_gpu):
assert isinstance(batch[i].data, torch.Tensor)
# TODO: handle tensors other than 3d
assert batch[i].dim() == 3
                c, h, w = batch[i].size()
for sample in batch[i:i + samples_per_gpu]:
assert c == sample.size(0)
h = max(h, sample.size(1))
w = max(w, sample.size(2))
padded_samples = [
F.pad(
sample.data,
(0, w - sample.size(2), 0, h - sample.size(1)),
value=sample.padding_value)
for sample in batch[i:i + samples_per_gpu]
]
stacked.append(default_collate(padded_samples))
else:
for i in range(0, len(batch), samples_per_gpu):
stacked.append(
[sample.data for sample in batch[i:i + samples_per_gpu]])
return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
elif isinstance(batch[0], collections.Sequence):
transposed = zip(*batch)
return [collate(samples, samples_per_gpu) for samples in transposed]
elif isinstance(batch[0], collections.Mapping):
return {
key: collate([d[key] for d in batch], samples_per_gpu)
for key in batch[0]
}
else:
return default_collate(batch)
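# A minimal usage sketch (shapes are illustrative, not part of this module):
# two images of different sizes wrapped in DataContainers with stack=True are
# padded to a common (h, w) and stacked into a single (2, c, h, w) tensor.
#
#   imgs = [DataContainer(torch.rand(3, 600, 800), stack=True),
#           DataContainer(torch.rand(3, 640, 768), stack=True)]
#   batch = collate(imgs, samples_per_gpu=2)
#   # batch.data[0] has shape (2, 3, 640, 800)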
from __future__ import division
import math
import torch
import numpy as np
from torch.distributed import get_world_size, get_rank
from torch.utils.data.sampler import Sampler
__all__ = ['GroupSampler', 'DistributedGroupSampler']
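# GroupSampler draws each mini-batch of `samples_per_gpu` indices from a
# single aspect-ratio group (the `flag` set by CocoDataset._set_group_flag),
# so that images batched together need little padding.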
class GroupSampler(Sampler):
def __init__(self, dataset, samples_per_gpu=1):
assert hasattr(dataset, 'flag')
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.flag = dataset.flag.astype(np.int64)
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, size in enumerate(self.group_sizes):
self.num_samples += int(np.ceil(
size / self.samples_per_gpu)) * self.samples_per_gpu
def __iter__(self):
indices = []
for i, size in enumerate(self.group_sizes):
if size == 0:
continue
indice = np.where(self.flag == i)[0]
assert len(indice) == size
np.random.shuffle(indice)
num_extra = int(np.ceil(size / self.samples_per_gpu)
) * self.samples_per_gpu - len(indice)
indice = np.concatenate([indice, indice[:num_extra]])
indices.append(indice)
indices = np.concatenate(indices)
indices = [
indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
for i in np.random.permutation(
range(len(indices) // self.samples_per_gpu))
]
indices = np.concatenate(indices)
indices = torch.from_numpy(indices).long()
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None):
if num_replicas is None:
num_replicas = get_world_size()
if rank is None:
rank = get_rank()
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
indice = indice[list(torch.randperm(int(size),
generator=g))].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
indice += indice[:extra]
indices += indice
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
import mmcv
import cvbase as cvb  # still used by the legacy ImageCrop/MaskTransform below
import numpy as np
import torch
from mmdet.core import segms
__all__ = [
'ImageTransform', 'BboxTransform', 'PolyMaskTransform', 'Numpy2Tensor'
]
class ImageTransform(object):
"""Preprocess an image
1. rescale the image to expected size
2. normalize the image
3. flip the image (if needed)
4. pad the image (if needed)
5. transpose to (c, h, w)
"""
def __init__(self,
mean=(0, 0, 0),
std=(1, 1, 1),
to_rgb=True,
size_divisor=None):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
self.size_divisor = size_divisor
def __call__(self, img, scale, flip=False):
img, scale_factor = mmcv.imrescale(img, scale, True)
img_shape = img.shape
        img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
if flip:
img = mmcv.imflip(img)
if self.size_divisor is not None:
img = mmcv.impad_to_multiple(img, self.size_divisor)
img = img.transpose(2, 0, 1)
return img, img_shape, scale_factor
# img, scale = cvb.resize_keep_ar(img_or_path, max_long_edge,
# max_short_edge, True)
# shape_scale = np.array(img.shape + (scale, ), dtype=np.float32)
# if flip:
# img = img[:, ::-1, :].copy()
# if self.color_order == 'RGB':
# img = cvb.bgr2rgb(img)
# img = img.astype(np.float32)
# img -= self.color_mean
# img /= self.color_std
# if self.size_divisor is None:
# padded_img = img
# else:
# pad_h = int(np.ceil(
# img.shape[0] / self.size_divisor)) * self.size_divisor
# pad_w = int(np.ceil(
# img.shape[1] / self.size_divisor)) * self.size_divisor
# padded_img = cvb.pad_img(img, (pad_h, pad_w), pad_val=0)
# padded_img = padded_img.transpose(2, 0, 1)
# return padded_img, shape_scale
class ImageCrop(object):
"""crop image patches and resize patches into fixed size
1. (read and) flip image (if needed)
2. crop image patches according to given bboxes
3. resize patches into fixed size (default 224x224)
4. normalize the image (if needed)
5. transpose to (c, h, w) (if needed)
"""
def __init__(self,
normalize=True,
transpose=True,
color_order='RGB',
color_mean=(0, 0, 0),
color_std=(1, 1, 1)):
self.normalize = normalize
self.transpose = transpose
assert color_order in ['RGB', 'BGR']
self.color_order = color_order
self.color_mean = np.array(color_mean, dtype=np.float32)
self.color_std = np.array(color_std, dtype=np.float32)
def __call__(self,
img_or_path,
bboxes,
crop_size,
scale_ratio=1.0,
flip=False):
img = cvb.read_img(img_or_path)
if flip:
img = img[:, ::-1, :].copy()
crop_imgs = cvb.crop_img(
img,
bboxes[:, :4],
scale_ratio=scale_ratio,
pad_fill=self.color_mean)
processed_crop_imgs_list = []
for i in range(len(crop_imgs)):
crop_img = crop_imgs[i]
crop_img = cvb.resize(crop_img, crop_size)
crop_img = crop_img.astype(np.float32)
crop_img -= self.color_mean
crop_img /= self.color_std
processed_crop_imgs_list.append(crop_img)
processed_crop_imgs = np.stack(processed_crop_imgs_list, axis=0)
processed_crop_imgs = processed_crop_imgs.transpose(0, 3, 1, 2)
return processed_crop_imgs
class BboxTransform(object):
"""Preprocess gt bboxes
1. rescale bboxes according to image size
2. flip bboxes (if needed)
3. pad the first dimension to `max_num_gts`
"""
def __init__(self, max_num_gts=None):
self.max_num_gts = max_num_gts
def __call__(self, bboxes, img_shape, scale_factor, flip=False):
gt_bboxes = bboxes * scale_factor
        if flip:
            # horizontal flip: mirror box x coordinates about the (scaled)
            # image width
            w = img_shape[1]
            flipped = gt_bboxes.copy()
            flipped[..., 0] = w - gt_bboxes[..., 2] - 1
            flipped[..., 2] = w - gt_bboxes[..., 0] - 1
            gt_bboxes = flipped
if self.max_num_gts is None:
return gt_bboxes
else:
num_gts = gt_bboxes.shape[0]
padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32)
padded_bboxes[:num_gts, :] = gt_bboxes
return padded_bboxes
class PolyMaskTransform(object):
def __init__(self):
pass
def __call__(self, gt_mask_polys, gt_poly_lens, img_h, img_w, flip=False):
"""
Args:
gt_mask_polys(list): a list of masks, each mask is a list of polys,
each poly is a list of numbers
gt_poly_lens(list): a list of int, indicating the size of each poly
"""
if flip:
gt_mask_polys = segms.flip_segms(gt_mask_polys, img_h, img_w)
num_polys_per_mask = np.array(
[len(mask_polys) for mask_polys in gt_mask_polys], dtype=np.int64)
gt_poly_lens = np.array(gt_poly_lens, dtype=np.int64)
gt_mask_polys = [
np.concatenate(mask_polys).astype(np.float32)
for mask_polys in gt_mask_polys
]
gt_mask_polys = np.concatenate(gt_mask_polys)
return gt_mask_polys, gt_poly_lens, num_polys_per_mask
class MaskTransform(object):
"""Preprocess masks
1. resize masks to expected size and stack to a single array
2. flip the masks (if needed)
3. pad the masks (if needed)
"""
def __init__(self, max_num_gts, pad_size=None):
self.max_num_gts = max_num_gts
self.pad_size = pad_size
def __call__(self, masks, img_size, flip=False):
max_long_edge = max(img_size)
max_short_edge = min(img_size)
masks = [
cvb.resize_keep_ar(
mask,
max_long_edge,
max_short_edge,
interpolation=cvb.INTER_NEAREST) for mask in masks
]
masks = np.stack(masks, axis=0)
if flip:
            # horizontal flip: masks are (n, h, w), so flip the last (width) axis
            masks = masks[:, :, ::-1]
if self.pad_size is None:
pad_h = masks.shape[1]
pad_w = masks.shape[2]
else:
pad_size = self.pad_size if self.pad_size > 0 else max_long_edge
pad_h = pad_w = pad_size
padded_masks = np.zeros(
(self.max_num_gts, pad_h, pad_w), dtype=masks.dtype)
padded_masks[:masks.shape[0], :masks.shape[1], :masks.shape[2]] = masks
return padded_masks
class Numpy2Tensor(object):
def __init__(self):
pass
def __call__(self, *args):
if len(args) == 1:
return torch.from_numpy(args[0])
else:
return tuple([torch.from_numpy(array) for array in args])
from .data_container import DataContainer
from .misc import *
import functools
from collections import Sequence
import mmcv
import numpy as np
import torch
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
"""
if isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, torch.Tensor):
return data
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError('type {} cannot be converted to tensor.'.format(
type(data)))
def assert_tensor_type(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
if not isinstance(args[0].data, torch.Tensor):
raise AttributeError('{} has no attribute {} for type {}'.format(
args[0].__class__.__name__, func.__name__, args[0].datatype))
return func(*args, **kwargs)
return wrapper
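# DataContainer wraps the data of one sample together with its collate
# behaviour: stack=True means samples are padded to a common size and stacked
# into one tensor by `collate`, stack=False means they are kept as a list.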
class DataContainer(object):
def __init__(self, data, stack=False, padding_value=0):
if isinstance(data, list):
self._data = data
else:
self._data = to_tensor(data)
self._stack = stack
self._padding_value = padding_value
def __repr__(self):
return '{}({})'.format(self.__class__.__name__, repr(self.data))
@property
def data(self):
return self._data
@property
def datatype(self):
if isinstance(self.data, torch.Tensor):
return self.data.type()
else:
return type(self.data)
@property
def stack(self):
return self._stack
@property
def padding_value(self):
return self._padding_value
@assert_tensor_type
def size(self, *args, **kwargs):
return self.data.size(*args, **kwargs)
@assert_tensor_type
def dim(self):
return self.data.dim()
import mmcv
import matplotlib.pyplot as plt
import numpy as np
import pycocotools.mask as maskUtils
def random_scale(img_scales, mode='range'):
"""Randomly select a scale from a list of scales or scale ranges.
Args:
img_scales (list[tuple]): Image scale or scale range.
mode (str): "range" or "value".
Returns:
tuple: Sampled image scale.
"""
num_scales = len(img_scales)
if num_scales == 1: # fixed scale is specified
img_scale = img_scales[0]
elif num_scales == 2: # randomly sample a scale
if mode == 'range':
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long),
max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short),
max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
elif mode == 'value':
img_scale = img_scales[np.random.randint(num_scales)]
else:
if mode != 'value':
raise ValueError(
'Only "value" mode supports more than 2 image scales')
img_scale = img_scales[np.random.randint(num_scales)]
return img_scale
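# For example (values are illustrative only): with
# img_scales = [(1333, 640), (1333, 800)], mode='range' samples the long edge
# from [1333, 1333] and the short edge from [640, 800], while mode='value'
# picks one of the two tuples as-is.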
def show_ann(coco, img, ann_info):
plt.imshow(mmcv.bgr2rgb(img))
plt.axis('off')
coco.showAnns(ann_info)
plt.show()
def draw_bbox_and_segm(img, results, dataset, score_thr=0.5):
bbox_results, segm_results = results
hi_bboxes = []
for cls_bboxes, cls_segms in zip(bbox_results, segm_results):
if len(cls_bboxes) == 0:
hi_bboxes.append(cls_bboxes)
continue
inds = np.where(cls_bboxes[:, -1] > score_thr)[0]
hi_bboxes.append(cls_bboxes[inds, :])
color_mask = np.random.random((1, 3))
for i in inds:
            mask = maskUtils.decode(cls_segms[i]).astype(bool)
img[mask] = img[mask] * 0.5 + color_mask * 0.5
mmcv.draw_bboxes_with_label(np.ascontiguousarray(img), hi_bboxes, dataset)
from .resnet import resnet
import math
import torch.nn as nn
import torch.utils.checkpoint as cp
from torchpack import load_checkpoint
def conv3x3(in_planes, out_planes, stride=1, dilation=1):
"3x3 convolution with padding"
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
dilation=1,
downsample=None,
style='fb'):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride, dilation)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
self.dilation = dilation
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
dilation=1,
downsample=None,
style='fb',
with_cp=False):
"""Bottleneck block
if style is "fb", the stride-two layer is the 3x3 conv layer,
if style is "msra", the stride-two layer is the first 1x1 conv layer
"""
super(Bottleneck, self).__init__()
assert style in ['fb', 'msra']
if style == 'fb':
conv1_stride = 1
conv2_stride = stride
else:
conv1_stride = stride
conv2_stride = 1
self.conv1 = nn.Conv2d(
inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
self.conv2 = nn.Conv2d(
planes,
planes,
kernel_size=3,
stride=conv2_stride,
padding=dilation,
dilation=dilation,
bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(
planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.dilation = dilation
self.with_cp = with_cp
def forward(self, x):
def _inner_forward(x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
return out
if self.with_cp and x.requires_grad:
out = cp.checkpoint(_inner_forward, x)
else:
out = _inner_forward(x)
out = self.relu(out)
return out
def make_res_layer(block,
inplanes,
planes,
blocks,
stride=1,
dilation=1,
style='fb',
with_cp=False):
downsample = None
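    # a 1x1 conv + BN shortcut is needed whenever the residual branch changes
    # the spatial stride or the number of channels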
if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(
inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(
block(
inplanes,
planes,
stride,
dilation,
downsample,
style=style,
with_cp=with_cp))
inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))
return nn.Sequential(*layers)
class ResHead(nn.Module):
    def __init__(self, block, num_blocks, stride=2, dilation=1, style='fb'):
        super(ResHead, self).__init__()
        self.layer4 = make_res_layer(
block,
1024,
512,
num_blocks,
stride=stride,
dilation=dilation,
style=style)
def forward(self, x):
return self.layer4(x)
class ResNet(nn.Module):
def __init__(self,
block,
layers,
strides=(1, 2, 2, 2),
dilations=(1, 1, 1, 1),
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
style='fb',
sync_bn=False,
with_cp=False):
super(ResNet, self).__init__()
if not len(layers) == len(strides) == len(dilations):
raise ValueError(
'The number of layers, strides and dilations must be equal, '
                'but got {} layers, {} strides and {} dilations'.format(
len(layers), len(strides), len(dilations)))
assert max(out_indices) < len(layers)
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.style = style
self.sync_bn = sync_bn
self.inplanes = 64
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.res_layers = []
for i, num_blocks in enumerate(layers):
stride = strides[i]
dilation = dilations[i]
layer_name = 'layer{}'.format(i + 1)
planes = 64 * 2**i
res_layer = make_res_layer(
block,
self.inplanes,
planes,
num_blocks,
stride=stride,
dilation=dilation,
style=self.style,
with_cp=with_cp)
self.inplanes = planes * block.expansion
setattr(self, layer_name, res_layer)
self.res_layers.append(layer_name)
self.feat_dim = block.expansion * 64 * 2**(len(layers) - 1)
self.with_cp = with_cp
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
load_checkpoint(self, pretrained, strict=False)
elif pretrained is None:
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
nn.init.normal_(m.weight, 0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
else:
raise TypeError('pretrained must be a str or None')
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
outs = []
for i, layer_name in enumerate(self.res_layers):
res_layer = getattr(self, layer_name)
x = res_layer(x)
if i in self.out_indices:
outs.append(x)
if len(outs) == 1:
return outs[0]
else:
return tuple(outs)
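    # train() keeps all BN layers in eval mode when sync_bn is off (frozen
    # running stats) and freezes the stem plus the first `frozen_stages`
    # residual stages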
def train(self, mode=True):
super(ResNet, self).train(mode)
if not self.sync_bn:
for m in self.modules():
if isinstance(m, nn.BatchNorm2d):
m.eval()
if mode and self.frozen_stages >= 0:
for param in self.conv1.parameters():
param.requires_grad = False
for param in self.bn1.parameters():
param.requires_grad = False
self.bn1.eval()
self.bn1.weight.requires_grad = False
self.bn1.bias.requires_grad = False
for i in range(1, self.frozen_stages + 1):
mod = getattr(self, 'layer{}'.format(i))
mod.eval()
for param in mod.parameters():
param.requires_grad = False
resnet_cfg = {
18: (BasicBlock, (2, 2, 2, 2)),
34: (BasicBlock, (3, 4, 6, 3)),
50: (Bottleneck, (3, 4, 6, 3)),
101: (Bottleneck, (3, 4, 23, 3)),
152: (Bottleneck, (3, 8, 36, 3))
}
def resnet(depth,
num_stages=4,
strides=(1, 2, 2, 2),
dilations=(1, 1, 1, 1),
out_indices=(2, ),
frozen_stages=-1,
style='fb',
sync_bn=False,
with_cp=False):
"""Constructs a ResNet model.
Args:
depth (int): depth of resnet, from {18, 34, 50, 101, 152}
num_stages (int): num of resnet stages, normally 4
strides (list): strides of the first block of each stage
dilations (list): dilation of each stage
out_indices (list): output from which stages
"""
if depth not in resnet_cfg:
raise KeyError('invalid depth {} for resnet'.format(depth))
block, layers = resnet_cfg[depth]
model = ResNet(block, layers[:num_stages], strides, dilations, out_indices,
frozen_stages, style, sync_bn, with_cp)
return model
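# For example, resnet(50, out_indices=(0, 1, 2, 3)) builds a ResNet-50 whose
# forward() returns the four stage outputs (256, 512, 1024 and 2048 channels),
# e.g. as inputs to an FPN neck.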
from .bbox_head import BBoxHead
__all__ = ['BBoxHead']
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import (bbox_transform_inv, bbox_target, multiclass_nms,
weighted_cross_entropy, weighted_smoothl1, accuracy)
class BBoxHead(nn.Module):
"""Simplest RoI head, with only two fc layers for classification and
regression respectively"""
def __init__(self,
exclude_mal_box=True,
with_avg_pool=False,
with_cls=True,
with_reg=True,
roi_feat_size=7,
in_channels=256,
num_classes=81,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=False):
super(BBoxHead, self).__init__()
assert with_cls or with_reg
self.with_avg_pool = with_avg_pool
self.with_cls = with_cls
self.with_reg = with_reg
self.roi_feat_size = roi_feat_size
self.in_channels = in_channels
self.num_classes = num_classes
self.target_means = target_means
self.target_stds = target_stds
self.reg_class_agnostic = reg_class_agnostic
self.exclude_mal_box = exclude_mal_box
in_channels = self.in_channels
if self.with_avg_pool:
self.avg_pool = nn.AvgPool2d(roi_feat_size)
else:
in_channels *= (self.roi_feat_size * self.roi_feat_size)
if self.with_cls:
self.fc_cls = nn.Linear(in_channels, num_classes)
if self.with_reg:
out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes
self.fc_reg = nn.Linear(in_channels, out_dim_reg)
self.debug_imgs = None
def init_weights(self):
if self.with_cls:
nn.init.normal_(self.fc_cls.weight, 0, 0.01)
nn.init.constant_(self.fc_cls.bias, 0)
if self.with_reg:
nn.init.normal_(self.fc_reg.weight, 0, 0.001)
nn.init.constant_(self.fc_reg.bias, 0)
def forward(self, x):
if self.with_avg_pool:
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
cls_score = self.fc_cls(x) if self.with_cls else None
bbox_pred = self.fc_reg(x) if self.with_reg else None
return cls_score, bbox_pred
def bbox_target(self, pos_proposals, neg_proposals, pos_gt_bboxes,
pos_gt_labels, rcnn_train_cfg):
reg_num_classes = 1 if self.reg_class_agnostic else self.num_classes
cls_reg_targets = bbox_target(
pos_proposals,
neg_proposals,
pos_gt_bboxes,
pos_gt_labels,
self.target_means,
self.target_stds,
rcnn_train_cfg,
reg_num_classes,
debug_imgs=self.debug_imgs)
return cls_reg_targets
def loss(self, cls_score, bbox_pred, labels, label_weights, bbox_targets,
bbox_weights):
losses = dict()
if cls_score is not None:
losses['loss_cls'] = weighted_cross_entropy(
cls_score, labels, label_weights)
losses['acc'] = accuracy(cls_score, labels)
if bbox_pred is not None:
losses['loss_reg'] = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
ave_factor=bbox_targets.size(0))
return losses
def get_det_bboxes(self,
rois,
cls_score,
bbox_pred,
img_shape,
rescale=False,
nms_cfg=None):
if isinstance(cls_score, list):
cls_score = sum(cls_score) / float(len(cls_score))
scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
if bbox_pred is not None:
bboxes = bbox_transform_inv(rois[:, 1:], bbox_pred,
self.target_means, self.target_stds,
img_shape)
else:
bboxes = rois[:, 1:]
# TODO: add clip here
if rescale:
bboxes /= img_shape[-1]
if nms_cfg is None:
return bboxes, scores
else:
det_bboxes, det_labels = multiclass_nms(
bboxes, scores, nms_cfg.score_thr, nms_cfg.nms_thr,
nms_cfg.max_per_img)
return det_bboxes, det_labels
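# A minimal shape check (values are illustrative): with the defaults
# (roi_feat_size=7, in_channels=256, num_classes=81, with_avg_pool=False),
# 512 RoI features of shape (512, 256, 7, 7) give cls_score of shape
# (512, 81) and bbox_pred of shape (512, 4 * 81) = (512, 324).
#
#   head = BBoxHead()
#   cls_score, bbox_pred = head(torch.rand(512, 256, 7, 7))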
import mmcv
from torch import nn
from . import (backbones, necks, roi_extractors, rpn_heads, bbox_heads,
mask_heads)
__all__ = [
'build_backbone', 'build_neck', 'build_rpn_head', 'build_roi_extractor',
'build_bbox_head', 'build_mask_head'
]
def _build_module(cfg, parent=None):
    return cfg if isinstance(cfg, nn.Module) else mmcv.obj_from_dict(
        cfg, parent)
def build(cfg, parent=None):
    if isinstance(cfg, list):
        modules = [_build_module(cfg_, parent) for cfg_ in cfg]
        return nn.Sequential(*modules)
    else:
        return _build_module(cfg, parent)
def build_backbone(cfg):
return build(cfg, backbones)
def build_neck(cfg):
return build(cfg, necks)
def build_rpn_head(cfg):
return build(cfg, rpn_heads)
def build_roi_extractor(cfg):
return build(cfg, roi_extractors)
def build_bbox_head(cfg):
return build(cfg, bbox_heads)
def build_mask_head(cfg):
return build(cfg, mask_heads)
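# A minimal usage sketch (the config dict below is hypothetical):
#
#   backbone = build_backbone(
#       dict(type='resnet', depth=50, out_indices=(0, 1, 2, 3)))
#
# mmcv.obj_from_dict pops the 'type' key, looks it up in the given module
# (here `backbones`, where `resnet` is defined) and calls it with the
# remaining keys as keyword arguments.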
from .conv_module import ConvModule
from .norm import build_norm_layer
__all__ = ['ConvModule', 'build_norm_layer']