Commit 441015ea authored by Kai Chen

Merge branch 'master' into pytorch-1.0

parents 2017c81e 3b6ae96d
@@ -69,7 +69,7 @@ class MaxIoUAssigner(BaseAssigner):
        if bboxes.shape[0] == 0 or gt_bboxes.shape[0] == 0:
            raise ValueError('No gt or bboxes')
        bboxes = bboxes[:, :4]
-       overlaps = bbox_overlaps(bboxes, gt_bboxes)
+       overlaps = bbox_overlaps(gt_bboxes, bboxes)
        if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and (
                gt_bboxes_ignore.numel() > 0):
@@ -88,8 +88,8 @@ class MaxIoUAssigner(BaseAssigner):
        """Assign w.r.t. the overlaps of bboxes with gts.

        Args:
-           overlaps (Tensor): Overlaps between n bboxes and k gt_bboxes,
-               shape(n, k).
+           overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
+               shape(k, n).
            gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).

        Returns:
@@ -98,19 +98,18 @@ class MaxIoUAssigner(BaseAssigner):
        if overlaps.numel() == 0:
            raise ValueError('No gt or proposals')

-       num_bboxes, num_gts = overlaps.size(0), overlaps.size(1)
+       num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)

        # 1. assign -1 by default
        assigned_gt_inds = overlaps.new_full(
            (num_bboxes, ), -1, dtype=torch.long)
-       assert overlaps.size() == (num_bboxes, num_gts)

        # for each anchor, which gt best overlaps with it
        # for each anchor, the max iou of all gts
-       max_overlaps, argmax_overlaps = overlaps.max(dim=1)
+       max_overlaps, argmax_overlaps = overlaps.max(dim=0)
        # for each gt, which anchor best overlaps with it
        # for each gt, the max iou of all proposals
-       gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=0)
+       gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)

        # 2. assign negative: below
        if isinstance(self.neg_iou_thr, float):
@@ -129,7 +128,7 @@ class MaxIoUAssigner(BaseAssigner):
        for i in range(num_gts):
            if gt_max_overlaps[i] >= self.min_pos_iou:
                if self.gt_max_assign_all:
-                   max_iou_inds = overlaps[:, i] == gt_max_overlaps[i]
+                   max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
                    assigned_gt_inds[max_iou_inds] = i + 1
                else:
                    assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
......
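For orientation, a minimal sketch of the (k, n) overlap convention adopted above; the tensor values are made up:

```python
import torch

# Hypothetical k = 2 gts, n = 3 anchors; overlaps now has shape (k, n).
overlaps = torch.tensor([[0.1, 0.7, 0.3],
                         [0.6, 0.2, 0.9]])
# Per anchor (column): best gt -> reduce over dim=0, the gt axis.
max_overlaps, argmax_overlaps = overlaps.max(dim=0)        # shapes (n,)
# Per gt (row): best anchor -> reduce over dim=1, the anchor axis.
gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)  # shapes (k,)
```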
@@ -16,7 +16,7 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
            foreground).

    Returns:
-        ious(Tensor): shape (n, k) if is_aligned == False else shape (n, 1)
+        ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1)
    """

    assert mode in ['iou', 'iof']
......
+import mmcv
import numpy as np
from terminaltables import AsciiTable
@@ -234,8 +235,9 @@ def eval_map(det_results,
        gt_ignore (list): gt ignore indicators of each image, a list of K array
        scale_ranges (list, optional): [(min1, max1), (min2, max2), ...]
        iou_thr (float): IoU threshold
-        dataset (None or str): dataset name, there are minor differences in
-            metrics for different datsets, e.g. "voc07", "imagenet_det", etc.
+        dataset (None or str or list): dataset name or dataset classes, there
+            are minor differences in metrics for different datasets, e.g.
+            "voc07", "imagenet_det", etc.
        print_summary (bool): whether to print the mAP summary

    Returns:
@@ -333,7 +335,7 @@ def print_map_summary(mean_ap, results, dataset=None):
    Args:
        mean_ap(float): calculated from `eval_map`
        results(list): calculated from `eval_map`
-        dataset(None or str or list): dataset name.
+        dataset(None or str or list): dataset name or dataset classes.
    """
    num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'],
                                                     np.ndarray) else 1
@@ -353,8 +355,10 @@ def print_map_summary(mean_ap, results, dataset=None):
    if dataset is None:
        label_names = [str(i) for i in range(1, num_classes + 1)]
-    else:
+    elif mmcv.is_str(dataset):
        label_names = get_classes(dataset)
+    else:
+        label_names = dataset

    if not isinstance(mean_ap, list):
        mean_ap = [mean_ap]
......
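A quick usage sketch of the widened `dataset` argument; the class names below are invented for illustration:

```python
# Either a known dataset name ...
print_map_summary(mean_ap, results, dataset='voc07')
# ... or, new in this commit, the class names themselves.
print_map_summary(mean_ap, results, dataset=['person', 'car', 'dog'])
```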
@@ -100,6 +100,8 @@ def accuracy(pred, target, topk=1):
    if isinstance(topk, int):
        topk = (topk, )
        return_single = True
+    else:
+        return_single = False

    maxk = max(topk)
    _, pred_label = pred.topk(maxk, 1, True, True)
......
@@ -6,9 +6,11 @@ from .loader import GroupSampler, DistributedGroupSampler, build_dataloader
from .utils import to_tensor, random_scale, show_ann, get_dataset
from .concat_dataset import ConcatDataset
from .repeat_dataset import RepeatDataset
+from .extra_aug import ExtraAugmentation

__all__ = [
    'CustomDataset', 'XMLDataset', 'CocoDataset', 'VOCDataset', 'GroupSampler',
    'DistributedGroupSampler', 'build_dataloader', 'to_tensor', 'random_scale',
-    'show_ann', 'get_dataset', 'ConcatDataset', 'RepeatDataset'
+    'show_ann', 'get_dataset', 'ConcatDataset', 'RepeatDataset',
+    'ExtraAugmentation'
]
@@ -40,7 +40,7 @@ class CocoDataset(CustomDataset):
        img_id = self.img_infos[idx]['id']
        ann_ids = self.coco.getAnnIds(imgIds=[img_id])
        ann_info = self.coco.loadAnns(ann_ids)
-        return self._parse_ann_info(ann_info)
+        return self._parse_ann_info(ann_info, self.with_mask)

    def _filter_imgs(self, min_size=32):
        """Filter images too small or without ground truths."""
......
@@ -8,6 +8,7 @@ from torch.utils.data import Dataset
from .transforms import (ImageTransform, BboxTransform, MaskTransform,
                         Numpy2Tensor)
from .utils import to_tensor, random_scale
+from .extra_aug import ExtraAugmentation


class CustomDataset(Dataset):
@@ -46,9 +47,12 @@ class CustomDataset(Dataset):
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
+                extra_aug=None,
+                resize_keep_ratio=True,
                 test_mode=False):
        # prefix of images path
        self.img_prefix = img_prefix
        # load annotations (and proposals)
        self.img_infos = self.load_annotations(ann_file)
        if proposal_file is not None:
@@ -98,6 +102,15 @@ class CustomDataset(Dataset):
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

+       # if use extra augmentation
+       if extra_aug is not None:
+           self.extra_aug = ExtraAugmentation(**extra_aug)
+       else:
+           self.extra_aug = None
+
+       # image rescale if keep ratio
+       self.resize_keep_ratio = resize_keep_ratio
+
    def __len__(self):
        return len(self.img_infos)
@@ -176,11 +189,17 @@ class CustomDataset(Dataset):
        if len(gt_bboxes) == 0:
            return None

+       # extra augmentation
+       if self.extra_aug is not None:
+           img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
+                                                      gt_labels)
+
        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
-           img, img_scale, flip)
+           img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
+       img = img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
@@ -232,7 +251,7 @@ class CustomDataset(Dataset):
        def prepare_single(img, scale, flip, proposal=None):
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
-               img, scale, flip)
+               img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(
                ori_shape=(img_info['height'], img_info['width'], 3),
......
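A hedged sketch of how the two new `CustomDataset` arguments might be wired up; the paths and values are illustrative only, and the other constructor arguments are assumed from the surrounding code:

```python
dataset = CustomDataset(
    ann_file='annotations/train.json',   # hypothetical paths
    img_prefix='images/',
    img_scale=(300, 300),
    img_norm_cfg=dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True),
    extra_aug=dict(                      # each key enables one transform
        photo_metric_distortion=dict(brightness_delta=32),
        expand=dict(mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)),
        random_crop=dict(min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
                         min_crop_size=0.3)),
    resize_keep_ratio=False)             # SSD-style fixed-size resize
```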
import mmcv
import numpy as np
from numpy import random
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
class PhotoMetricDistortion(object):
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, img, boxes, labels):
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
return img, boxes, labels
class Expand(object):
def __init__(self, mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)):
if to_rgb:
self.mean = mean[::-1]
else:
self.mean = mean
self.min_ratio, self.max_ratio = ratio_range
def __call__(self, img, boxes, labels):
if random.randint(2):
return img, boxes, labels
h, w, c = img.shape
ratio = random.uniform(self.min_ratio, self.max_ratio)
expand_img = np.full((int(h * ratio), int(w * ratio), c),
self.mean).astype(img.dtype)
left = int(random.uniform(0, w * ratio - w))
top = int(random.uniform(0, h * ratio - h))
expand_img[top:top + h, left:left + w] = img
img = expand_img
boxes += np.tile((left, top), 2)
return img, boxes, labels
class RandomCrop(object):
def __init__(self,
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3):
# 1: return ori img
self.sample_mode = (1, *min_ious, 0)
self.min_crop_size = min_crop_size
def __call__(self, img, boxes, labels):
h, w, c = img.shape
while True:
mode = random.choice(self.sample_mode)
if mode == 1:
return img, boxes, labels
min_iou = mode
for i in range(50):
new_w = random.uniform(self.min_crop_size * w, w)
new_h = random.uniform(self.min_crop_size * h, h)
# h / w in [0.5, 2]
if new_h / new_w < 0.5 or new_h / new_w > 2:
continue
left = random.uniform(w - new_w)
top = random.uniform(h - new_h)
patch = np.array((int(left), int(top), int(left + new_w),
int(top + new_h)))
overlaps = bbox_overlaps(
patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
if overlaps.min() < min_iou:
continue
# center of boxes should inside the crop img
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = (center[:, 0] > patch[0]) * (
center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (
center[:, 1] < patch[3])
if not mask.any():
continue
boxes = boxes[mask]
labels = labels[mask]
# adjust boxes
img = img[patch[1]:patch[3], patch[0]:patch[2]]
boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
boxes -= np.tile(patch[:2], 2)
return img, boxes, labels
class ExtraAugmentation(object):
def __init__(self,
photo_metric_distortion=None,
expand=None,
random_crop=None):
self.transforms = []
if photo_metric_distortion is not None:
self.transforms.append(
PhotoMetricDistortion(**photo_metric_distortion))
if expand is not None:
self.transforms.append(Expand(**expand))
if random_crop is not None:
self.transforms.append(RandomCrop(**random_crop))
def __call__(self, img, boxes, labels):
img = img.astype(np.float32)
for transform in self.transforms:
img, boxes, labels = transform(img, boxes, labels)
return img, boxes, labels
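To show the composition order of `ExtraAugmentation` directly, a standalone sketch; the inputs are arbitrary:

```python
import numpy as np

aug = ExtraAugmentation(
    photo_metric_distortion=dict(brightness_delta=32),
    expand=dict(mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)),
    random_crop=dict(min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3))

img = np.random.randint(0, 256, (300, 300, 3)).astype(np.uint8)
boxes = np.array([[40., 40., 160., 160.]], dtype=np.float32)
labels = np.array([1])
# Transforms run in registration order: distort -> expand -> crop.
img, boxes, labels = aug(img, boxes, labels)  # img comes back as float32
```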
@@ -25,8 +25,14 @@ class ImageTransform(object):
        self.to_rgb = to_rgb
        self.size_divisor = size_divisor

-   def __call__(self, img, scale, flip=False):
-       img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
+   def __call__(self, img, scale, flip=False, keep_ratio=True):
+       if keep_ratio:
+           img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
+       else:
+           img, w_scale, h_scale = mmcv.imresize(
+               img, scale, return_scale=True)
+           scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
+                                   dtype=np.float32)
        img_shape = img.shape
        img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
        if flip:
......
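For reference, a sketch contrasting the two resize modes used above; the input size is made up:

```python
import mmcv
import numpy as np

img = np.zeros((600, 800, 3), dtype=np.uint8)
# keep_ratio=True: aspect-preserving, a single scalar scale factor.
out, scale_factor = mmcv.imrescale(img, (1333, 800), return_scale=True)
# keep_ratio=False: exact target size, per-axis factors packed per bbox coord.
out, w_scale, h_scale = mmcv.imresize(img, (512, 512), return_scale=True)
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
```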
-from .detectors import (BaseDetector, TwoStageDetector, RPN, FastRCNN,
-                        FasterRCNN, MaskRCNN)
-from .builder import (build_neck, build_rpn_head, build_roi_extractor,
-                      build_bbox_head, build_mask_head, build_detector)
+from .backbones import *  # noqa: F401,F403
+from .necks import *  # noqa: F401,F403
+from .roi_extractors import *  # noqa: F401,F403
+from .anchor_heads import *  # noqa: F401,F403
+from .bbox_heads import *  # noqa: F401,F403
+from .mask_heads import *  # noqa: F401,F403
+from .detectors import *  # noqa: F401,F403
+from .registry import BACKBONES, NECKS, ROI_EXTRACTORS, HEADS, DETECTORS
+from .builder import (build_backbone, build_neck, build_roi_extractor,
+                      build_head, build_detector)

__all__ = [
-    'BaseDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN',
-    'MaskRCNN', 'build_backbone', 'build_neck', 'build_rpn_head',
-    'build_roi_extractor', 'build_bbox_head', 'build_mask_head',
-    'build_detector'
+    'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'HEADS', 'DETECTORS',
+    'build_backbone', 'build_neck', 'build_roi_extractor', 'build_head',
+    'build_detector'
]
from .anchor_head import AnchorHead
from .rpn_head import RPNHead
from .retina_head import RetinaHead
from .ssd_head import SSDHead
__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead']
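With the new registry in place, a head can be built from a config dict; a hedged sketch, where `build_head` comes from the new builder exports above and the config values are illustrative:

```python
from mmdet.models import build_head

rpn_head = build_head(
    dict(
        type='RPNHead',           # looked up in the HEADS registry
        in_channels=256,
        feat_channels=256,
        anchor_scales=[8],
        anchor_ratios=[0.5, 1.0, 2.0],
        anchor_strides=[4, 8, 16, 32, 64]))
```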
@@ -3,114 +3,84 @@ from __future__ import division

import numpy as np
import torch
import torch.nn as nn
+from mmcv.cnn import normal_init

-from mmdet.core import (AnchorGenerator, anchor_target, multi_apply,
-                        delta2bbox, weighted_smoothl1,
+from mmdet.core import (AnchorGenerator, anchor_target, delta2bbox,
+                        multi_apply, weighted_cross_entropy, weighted_smoothl1,
+                        weighted_binary_cross_entropy,
                         weighted_sigmoid_focal_loss, multiclass_nms)
-from ..utils import normal_init, bias_init_with_prob
+from ..registry import HEADS


-class RetinaHead(nn.Module):
-    """Head of RetinaNet.
+@HEADS.register_module
+class AnchorHead(nn.Module):
+    """Anchor-based head (RPN, RetinaNet, SSD, etc.).

-              / cls_convs - retina_cls (3x3 conv)
-    input -
-              \ reg_convs - retina_reg (3x3 conv)

    Args:
        in_channels (int): Number of channels in the input feature map.
-        num_classes (int): Class number (including background).
-        stacked_convs (int): Number of convolutional layers added for cls and
-            reg branch.
-        feat_channels (int): Number of channels for the RPN feature map.
-        scales_per_octave (int): Number of anchor scales per octave.
-        octave_base_scale (int): Base octave scale. Anchor scales are computed
-            as `s*2^(i/n)`, for i in [0, n-1], where s is `octave_base_scale`
-            and n is `scales_per_octave`.
+        feat_channels (int): Number of channels of the feature map.
+        anchor_scales (Iterable): Anchor scales.
        anchor_ratios (Iterable): Anchor aspect ratios.
        anchor_strides (Iterable): Anchor strides.
+        anchor_base_sizes (Iterable): Anchor base sizes.
        target_means (Iterable): Mean values of regression targets.
        target_stds (Iterable): Std values of regression targets.
+        use_sigmoid_cls (bool): Whether to use sigmoid loss for classification.
+            (softmax by default)
+        use_focal_loss (bool): Whether to use focal loss for classification.
    """  # noqa: W605

    def __init__(self,
-                in_channels,
                 num_classes,
-                stacked_convs=4,
+                in_channels,
                 feat_channels=256,
-                octave_base_scale=4,
-                scales_per_octave=3,
+                anchor_scales=[8, 16, 32],
                 anchor_ratios=[0.5, 1.0, 2.0],
-                anchor_strides=[8, 16, 32, 64, 128],
+                anchor_strides=[4, 8, 16, 32, 64],
                 anchor_base_sizes=None,
                 target_means=(.0, .0, .0, .0),
-                target_stds=(1.0, 1.0, 1.0, 1.0)):
-       super(RetinaHead, self).__init__()
+                target_stds=(1.0, 1.0, 1.0, 1.0),
+                use_sigmoid_cls=False,
+                use_focal_loss=False):
+       super(AnchorHead, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
-       self.octave_base_scale = octave_base_scale
-       self.scales_per_octave = scales_per_octave
+       self.feat_channels = feat_channels
+       self.anchor_scales = anchor_scales
        self.anchor_ratios = anchor_ratios
        self.anchor_strides = anchor_strides
        self.anchor_base_sizes = list(
            anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
        self.target_means = target_means
        self.target_stds = target_stds
+       self.use_sigmoid_cls = use_sigmoid_cls
+       self.use_focal_loss = use_focal_loss

        self.anchor_generators = []
        for anchor_base in self.anchor_base_sizes:
-           octave_scales = np.array(
-               [2**(i / scales_per_octave) for i in range(scales_per_octave)])
-           anchor_scales = octave_scales * octave_base_scale
            self.anchor_generators.append(
                AnchorGenerator(anchor_base, anchor_scales, anchor_ratios))
-       self.relu = nn.ReLU(inplace=True)
-       self.num_anchors = int(
-           len(self.anchor_ratios) * self.scales_per_octave)
-       self.cls_out_channels = self.num_classes - 1
-       self.bbox_pred_dim = 4
-       self.stacked_convs = stacked_convs
-       self.cls_convs = nn.ModuleList()
-       self.reg_convs = nn.ModuleList()
-       for i in range(self.stacked_convs):
-           chn = in_channels if i == 0 else feat_channels
-           self.cls_convs.append(
-               nn.Conv2d(chn, feat_channels, 3, stride=1, padding=1))
-           self.reg_convs.append(
-               nn.Conv2d(chn, feat_channels, 3, stride=1, padding=1))
-       self.retina_cls = nn.Conv2d(
-           feat_channels,
-           self.num_anchors * self.cls_out_channels,
-           3,
-           stride=1,
-           padding=1)
-       self.retina_reg = nn.Conv2d(
-           feat_channels,
-           self.num_anchors * self.bbox_pred_dim,
-           3,
-           stride=1,
-           padding=1)
-       self.debug_imgs = None
+       self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)
+       if self.use_sigmoid_cls:
+           self.cls_out_channels = self.num_classes - 1
+       else:
+           self.cls_out_channels = self.num_classes
+
+       self._init_layers()
+
+   def _init_layers(self):
+       self.conv_cls = nn.Conv2d(self.feat_channels,
+                                 self.num_anchors * self.cls_out_channels, 1)
+       self.conv_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)

    def init_weights(self):
-       for m in self.cls_convs:
-           normal_init(m, std=0.01)
-       for m in self.reg_convs:
-           normal_init(m, std=0.01)
-       bias_cls = bias_init_with_prob(0.01)
-       normal_init(self.retina_cls, std=0.01, bias=bias_cls)
-       normal_init(self.retina_reg, std=0.01)
+       normal_init(self.conv_cls, std=0.01)
+       normal_init(self.conv_reg, std=0.01)

    def forward_single(self, x):
-       cls_feat = x
-       reg_feat = x
-       for cls_conv in self.cls_convs:
-           cls_feat = self.relu(cls_conv(cls_feat))
-       for reg_conv in self.reg_convs:
-           reg_feat = self.relu(reg_conv(reg_feat))
-       cls_score = self.retina_cls(cls_feat)
-       bbox_pred = self.retina_reg(reg_feat)
+       cls_score = self.conv_cls(x)
+       bbox_pred = self.conv_reg(x)
        return cls_score, bbox_pred

    def forward(self, feats):
@@ -156,30 +126,47 @@ class RetinaHead(nn.Module):
        return anchor_list, valid_flag_list

    def loss_single(self, cls_score, bbox_pred, labels, label_weights,
-                   bbox_targets, bbox_weights, num_pos_samples, cfg):
+                   bbox_targets, bbox_weights, num_total_samples, cfg):
        # classification loss
-       labels = labels.contiguous().view(-1, self.cls_out_channels)
-       label_weights = label_weights.contiguous().view(
-           -1, self.cls_out_channels)
-       cls_score = cls_score.permute(0, 2, 3, 1).contiguous().view(
+       if self.use_sigmoid_cls:
+           labels = labels.reshape(-1, self.cls_out_channels)
+           label_weights = label_weights.reshape(-1, self.cls_out_channels)
+       else:
+           labels = labels.reshape(-1)
+           label_weights = label_weights.reshape(-1)
+       cls_score = cls_score.permute(0, 2, 3, 1).reshape(
            -1, self.cls_out_channels)
-       loss_cls = weighted_sigmoid_focal_loss(
-           cls_score,
-           labels,
-           label_weights,
-           cfg.gamma,
-           cfg.alpha,
-           avg_factor=num_pos_samples)
+       if self.use_sigmoid_cls:
+           if self.use_focal_loss:
+               cls_criterion = weighted_sigmoid_focal_loss
+           else:
+               cls_criterion = weighted_binary_cross_entropy
+       else:
+           if self.use_focal_loss:
+               raise NotImplementedError
+           else:
+               cls_criterion = weighted_cross_entropy
+       if self.use_focal_loss:
+           loss_cls = cls_criterion(
+               cls_score,
+               labels,
+               label_weights,
+               gamma=cfg.gamma,
+               alpha=cfg.alpha,
+               avg_factor=num_total_samples)
+       else:
+           loss_cls = cls_criterion(
+               cls_score, labels, label_weights, avg_factor=num_total_samples)
        # regression loss
-       bbox_targets = bbox_targets.contiguous().view(-1, 4)
-       bbox_weights = bbox_weights.contiguous().view(-1, 4)
-       bbox_pred = bbox_pred.permute(0, 2, 3, 1).contiguous().view(-1, 4)
+       bbox_targets = bbox_targets.reshape(-1, 4)
+       bbox_weights = bbox_weights.reshape(-1, 4)
+       bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
        loss_reg = weighted_smoothl1(
            bbox_pred,
            bbox_targets,
            bbox_weights,
            beta=cfg.smoothl1_beta,
-           avg_factor=num_pos_samples)
+           avg_factor=num_total_samples)
        return loss_cls, loss_reg

    def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas,
@@ -189,6 +176,8 @@ class RetinaHead(nn.Module):
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, img_metas)
+       sampling = False if self.use_focal_loss else True
+       label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
        cls_reg_targets = anchor_target(
            anchor_list,
            valid_flag_list,
@@ -198,13 +187,14 @@ class RetinaHead(nn.Module):
            self.target_stds,
            cfg,
            gt_labels_list=gt_labels,
-           cls_out_channels=self.cls_out_channels,
-           sampling=False)
+           label_channels=label_channels,
+           sampling=sampling)
        if cls_reg_targets is None:
            return None
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
+       num_total_samples = (num_total_pos if self.use_focal_loss else
+                            num_total_pos + num_total_neg)
        losses_cls, losses_reg = multi_apply(
            self.loss_single,
            cls_scores,
@@ -213,16 +203,12 @@ class RetinaHead(nn.Module):
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
-           num_pos_samples=num_total_pos,
+           num_total_samples=num_total_samples,
            cfg=cfg)
        return dict(loss_cls=losses_cls, loss_reg=losses_reg)

-   def get_det_bboxes(self,
-                      cls_scores,
-                      bbox_preds,
-                      img_metas,
-                      cfg,
-                      rescale=False):
+   def get_bboxes(self, cls_scores, bbox_preds, img_metas, cfg,
+                  rescale=False):
        assert len(cls_scores) == len(bbox_preds)
        num_levels = len(cls_scores)
@@ -231,7 +217,6 @@ class RetinaHead(nn.Module):
                self.anchor_strides[i])
            for i in range(num_levels)
        ]
        result_list = []
        for img_id in range(len(img_metas)):
            cls_score_list = [
@@ -242,46 +227,54 @@ class RetinaHead(nn.Module):
            ]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
-           results = self._get_det_bboxes_single(
-               cls_score_list, bbox_pred_list, mlvl_anchors, img_shape,
-               scale_factor, cfg, rescale)
-           result_list.append(results)
+           proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list,
+                                              mlvl_anchors, img_shape,
+                                              scale_factor, cfg, rescale)
+           result_list.append(proposals)
        return result_list

-   def _get_det_bboxes_single(self,
-                              cls_scores,
-                              bbox_preds,
-                              mlvl_anchors,
-                              img_shape,
-                              scale_factor,
-                              cfg,
-                              rescale=False):
+   def get_bboxes_single(self,
+                         cls_scores,
+                         bbox_preds,
+                         mlvl_anchors,
+                         img_shape,
+                         scale_factor,
+                         cfg,
+                         rescale=False):
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
-       mlvl_proposals = []
+       mlvl_bboxes = []
        mlvl_scores = []
        for cls_score, bbox_pred, anchors in zip(cls_scores, bbox_preds,
                                                 mlvl_anchors):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
-           cls_score = cls_score.permute(1, 2, 0).contiguous().view(
+           cls_score = cls_score.permute(1, 2, 0).reshape(
                -1, self.cls_out_channels)
-           scores = cls_score.sigmoid()
-           bbox_pred = bbox_pred.permute(1, 2, 0).contiguous().view(-1, 4)
-           proposals = delta2bbox(anchors, bbox_pred, self.target_means,
-                                  self.target_stds, img_shape)
-           if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
-               maxscores, _ = scores.max(dim=1)
-               _, topk_inds = maxscores.topk(cfg.nms_pre)
-               proposals = proposals[topk_inds, :]
+           if self.use_sigmoid_cls:
+               scores = cls_score.sigmoid()
+           else:
+               scores = cls_score.softmax(-1)
+           bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+           nms_pre = cfg.get('nms_pre', -1)
+           if nms_pre > 0 and scores.shape[0] > nms_pre:
+               if self.use_sigmoid_cls:
+                   max_scores, _ = scores.max(dim=1)
+               else:
+                   max_scores, _ = scores[:, 1:].max(dim=1)
+               _, topk_inds = max_scores.topk(nms_pre)
+               anchors = anchors[topk_inds, :]
+               bbox_pred = bbox_pred[topk_inds, :]
                scores = scores[topk_inds, :]
-           mlvl_proposals.append(proposals)
+           bboxes = delta2bbox(anchors, bbox_pred, self.target_means,
+                               self.target_stds, img_shape)
+           mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
-       mlvl_proposals = torch.cat(mlvl_proposals)
+       mlvl_bboxes = torch.cat(mlvl_bboxes)
        if rescale:
-           mlvl_proposals /= scale_factor
+           mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
        mlvl_scores = torch.cat(mlvl_scores)
-       padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
-       mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
-       det_bboxes, det_labels = multiclass_nms(mlvl_proposals, mlvl_scores,
-                                               cfg.score_thr, cfg.nms,
-                                               cfg.max_per_img)
+       if self.use_sigmoid_cls:
+           padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+           mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
+       det_bboxes, det_labels = multiclass_nms(
+           mlvl_bboxes, mlvl_scores, cfg.score_thr, cfg.nms, cfg.max_per_img)
        return det_bboxes, det_labels
import numpy as np
import torch.nn as nn
from mmcv.cnn import normal_init
from .anchor_head import AnchorHead
from ..registry import HEADS
from ..utils import bias_init_with_prob
@HEADS.register_module
class RetinaHead(AnchorHead):
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
octave_base_scale=4,
scales_per_octave=3,
**kwargs):
self.stacked_convs = stacked_convs
self.octave_base_scale = octave_base_scale
self.scales_per_octave = scales_per_octave
octave_scales = np.array(
[2**(i / scales_per_octave) for i in range(scales_per_octave)])
anchor_scales = octave_scales * octave_base_scale
super(RetinaHead, self).__init__(
num_classes,
in_channels,
anchor_scales=anchor_scales,
use_sigmoid_cls=True,
use_focal_loss=True,
**kwargs)
def _init_layers(self):
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1))
self.reg_convs.append(
nn.Conv2d(chn, self.feat_channels, 3, stride=1, padding=1))
self.retina_cls = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.retina_reg = nn.Conv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
def init_weights(self):
for m in self.cls_convs:
normal_init(m, std=0.01)
for m in self.reg_convs:
normal_init(m, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_reg, std=0.01)
def forward_single(self, x):
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = self.relu(cls_conv(cls_feat))
for reg_conv in self.reg_convs:
reg_feat = self.relu(reg_conv(reg_feat))
cls_score = self.retina_cls(cls_feat)
bbox_pred = self.retina_reg(reg_feat)
return cls_score, bbox_pred
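To make the octave arithmetic above concrete, with the defaults octave_base_scale=4 and scales_per_octave=3:

```python
import numpy as np

octave_base_scale, scales_per_octave = 4, 3
octave_scales = np.array(
    [2**(i / scales_per_octave) for i in range(scales_per_octave)])
print(octave_scales * octave_base_scale)  # [4.0, ~5.04, ~6.35] per level
```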
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import normal_init
from mmdet.core import delta2bbox
from mmdet.ops import nms
from .anchor_head import AnchorHead
from ..registry import HEADS
@HEADS.register_module
class RPNHead(AnchorHead):
def __init__(self, in_channels, **kwargs):
super(RPNHead, self).__init__(2, in_channels, **kwargs)
def _init_layers(self):
self.rpn_conv = nn.Conv2d(
self.in_channels, self.feat_channels, 3, padding=1)
self.rpn_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * self.cls_out_channels, 1)
self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
def init_weights(self):
normal_init(self.rpn_conv, std=0.01)
normal_init(self.rpn_cls, std=0.01)
normal_init(self.rpn_reg, std=0.01)
def forward_single(self, x):
x = self.rpn_conv(x)
x = F.relu(x, inplace=True)
rpn_cls_score = self.rpn_cls(x)
rpn_bbox_pred = self.rpn_reg(x)
return rpn_cls_score, rpn_bbox_pred
def loss(self, cls_scores, bbox_preds, gt_bboxes, img_metas, cfg):
losses = super(RPNHead, self).loss(cls_scores, bbox_preds, gt_bboxes,
None, img_metas, cfg)
return dict(
loss_rpn_cls=losses['loss_cls'], loss_rpn_reg=losses['loss_reg'])
def get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
mlvl_proposals = []
for idx in range(len(cls_scores)):
rpn_cls_score = cls_scores[idx]
rpn_bbox_pred = bbox_preds[idx]
assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
anchors = mlvl_anchors[idx]
rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
if self.use_sigmoid_cls:
rpn_cls_score = rpn_cls_score.reshape(-1)
scores = rpn_cls_score.sigmoid()
else:
rpn_cls_score = rpn_cls_score.reshape(-1, 2)
scores = rpn_cls_score.softmax(dim=1)[:, 1]
rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
_, topk_inds = scores.topk(cfg.nms_pre)
rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
anchors = anchors[topk_inds, :]
scores = scores[topk_inds]
proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
self.target_stds, img_shape)
if cfg.min_bbox_size > 0:
w = proposals[:, 2] - proposals[:, 0] + 1
h = proposals[:, 3] - proposals[:, 1] + 1
valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
(h >= cfg.min_bbox_size)).squeeze()
proposals = proposals[valid_inds, :]
scores = scores[valid_inds]
proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
proposals, _ = nms(proposals, cfg.nms_thr)
proposals = proposals[:cfg.nms_post, :]
mlvl_proposals.append(proposals)
proposals = torch.cat(mlvl_proposals, 0)
if cfg.nms_across_levels:
proposals, _ = nms(proposals, cfg.nms_thr)
proposals = proposals[:cfg.max_num, :]
else:
scores = proposals[:, 4]
num = min(cfg.max_num, proposals.shape[0])
_, topk_inds = scores.topk(num)
proposals = proposals[topk_inds, :]
return proposals
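`get_bboxes_single` above reads `nms_pre`, `nms_post`, `nms_thr`, `min_bbox_size`, `nms_across_levels`, and `max_num` from `cfg`; a hedged example of a compatible test-time config, with typical values that are not taken from this commit:

```python
from mmcv import Config

rpn_test_cfg = Config(
    dict(
        nms_across_levels=False,
        nms_pre=2000,     # keep top-scoring anchors per level before NMS
        nms_post=2000,    # keep this many proposals per level after NMS
        max_num=2000,     # final cap across all levels
        nms_thr=0.7,
        min_bbox_size=0))
```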
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from mmdet.core import (AnchorGenerator, anchor_target, weighted_smoothl1,
multi_apply)
from .anchor_head import AnchorHead
from ..registry import HEADS
@HEADS.register_module
class SSDHead(AnchorHead):
def __init__(self,
input_size=300,
num_classes=81,
in_channels=(512, 1024, 512, 256, 256, 256),
anchor_strides=(8, 16, 32, 64, 100, 300),
basesize_ratio_range=(0.1, 0.9),
anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
target_means=(.0, .0, .0, .0),
target_stds=(1.0, 1.0, 1.0, 1.0)):
super(AnchorHead, self).__init__()
self.input_size = input_size
self.num_classes = num_classes
self.in_channels = in_channels
self.cls_out_channels = num_classes
num_anchors = [len(ratios) * 2 + 2 for ratios in anchor_ratios]
reg_convs = []
cls_convs = []
for i in range(len(in_channels)):
reg_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * 4,
kernel_size=3,
padding=1))
cls_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * num_classes,
kernel_size=3,
padding=1))
self.reg_convs = nn.ModuleList(reg_convs)
self.cls_convs = nn.ModuleList(cls_convs)
min_ratio, max_ratio = basesize_ratio_range
min_ratio = int(min_ratio * 100)
max_ratio = int(max_ratio * 100)
step = int(np.floor(max_ratio - min_ratio) / (len(in_channels) - 2))
min_sizes = []
max_sizes = []
for r in range(int(min_ratio), int(max_ratio) + 1, step):
min_sizes.append(int(input_size * r / 100))
max_sizes.append(int(input_size * (r + step) / 100))
if input_size == 300:
if basesize_ratio_range[0] == 0.15: # SSD300 COCO
min_sizes.insert(0, int(input_size * 7 / 100))
max_sizes.insert(0, int(input_size * 15 / 100))
elif basesize_ratio_range[0] == 0.2: # SSD300 VOC
min_sizes.insert(0, int(input_size * 10 / 100))
max_sizes.insert(0, int(input_size * 20 / 100))
elif input_size == 512:
if basesize_ratio_range[0] == 0.1: # SSD512 COCO
min_sizes.insert(0, int(input_size * 4 / 100))
max_sizes.insert(0, int(input_size * 10 / 100))
elif basesize_ratio_range[0] == 0.15: # SSD512 VOC
min_sizes.insert(0, int(input_size * 7 / 100))
max_sizes.insert(0, int(input_size * 15 / 100))
self.anchor_generators = []
self.anchor_strides = anchor_strides
for k in range(len(anchor_strides)):
base_size = min_sizes[k]
stride = anchor_strides[k]
ctr = ((stride - 1) / 2., (stride - 1) / 2.)
scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
ratios = [1.]
for r in anchor_ratios[k]:
ratios += [1 / r, r] # 4 or 6 ratio
anchor_generator = AnchorGenerator(
base_size, scales, ratios, scale_major=False, ctr=ctr)
indices = list(range(len(ratios)))
indices.insert(1, len(indices))
anchor_generator.base_anchors = torch.index_select(
anchor_generator.base_anchors, 0, torch.LongTensor(indices))
self.anchor_generators.append(anchor_generator)
self.target_means = target_means
self.target_stds = target_stds
self.use_sigmoid_cls = False
self.use_focal_loss = False
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform', bias=0)
def forward(self, feats):
cls_scores = []
bbox_preds = []
for feat, reg_conv, cls_conv in zip(feats, self.reg_convs,
self.cls_convs):
cls_scores.append(cls_conv(feat))
bbox_preds.append(reg_conv(feat))
return cls_scores, bbox_preds
def loss_single(self, cls_score, bbox_pred, labels, label_weights,
bbox_targets, bbox_weights, num_total_samples, cfg):
loss_cls_all = F.cross_entropy(
cls_score, labels, reduction='none') * label_weights
pos_inds = (labels > 0).nonzero().view(-1)
neg_inds = (labels == 0).nonzero().view(-1)
num_pos_samples = pos_inds.size(0)
num_neg_samples = cfg.neg_pos_ratio * num_pos_samples
if num_neg_samples > neg_inds.size(0):
num_neg_samples = neg_inds.size(0)
topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
loss_cls_pos = loss_cls_all[pos_inds].sum()
loss_cls_neg = topk_loss_cls_neg.sum()
loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
loss_reg = weighted_smoothl1(
bbox_pred,
bbox_targets,
bbox_weights,
beta=cfg.smoothl1_beta,
avg_factor=num_total_samples)
return loss_cls[None], loss_reg
def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas,
cfg):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == len(self.anchor_generators)
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas)
cls_reg_targets = anchor_target(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
self.target_means,
self.target_stds,
cfg,
gt_labels_list=gt_labels,
label_channels=1,
sampling=False,
unmap_outputs=False)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_images = len(img_metas)
all_cls_scores = torch.cat([
s.permute(0, 2, 3, 1).reshape(
num_images, -1, self.cls_out_channels) for s in cls_scores
], 1)
all_labels = torch.cat(labels_list, -1).view(num_images, -1)
all_label_weights = torch.cat(label_weights_list, -1).view(
num_images, -1)
all_bbox_preds = torch.cat([
b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
for b in bbox_preds
], -2)
all_bbox_targets = torch.cat(bbox_targets_list, -2).view(
num_images, -1, 4)
all_bbox_weights = torch.cat(bbox_weights_list, -2).view(
num_images, -1, 4)
losses_cls, losses_reg = multi_apply(
self.loss_single,
all_cls_scores,
all_bbox_preds,
all_labels,
all_label_weights,
all_bbox_targets,
all_bbox_weights,
num_total_samples=num_total_pos,
cfg=cfg)
return dict(loss_cls=losses_cls, loss_reg=losses_reg)
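For intuition, the hard negative mining inside `loss_single` above keeps only the highest-loss negatives at a fixed negative:positive ratio; a toy sketch with arbitrary numbers:

```python
import torch

loss_cls_all = torch.tensor([2.0, 0.1, 0.9, 0.3, 1.5])  # per-anchor CE * weights
labels = torch.tensor([1, 0, 0, 0, 0])                  # 1 positive, 4 negatives
pos_inds = (labels > 0).nonzero().view(-1)
neg_inds = (labels == 0).nonzero().view(-1)
num_neg = min(3 * pos_inds.numel(), neg_inds.numel())   # neg_pos_ratio = 3
topk_neg, _ = loss_cls_all[neg_inds].topk(num_neg)      # hardest negatives only
loss_cls = loss_cls_all[pos_inds].sum() + topk_neg.sum()
```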
from .resnet import ResNet
from .resnext import ResNeXt
+from .ssd_vgg import SSDVGG

-__all__ = ['ResNet', 'ResNeXt']
+__all__ = ['ResNet', 'ResNeXt', 'SSDVGG']
...@@ -6,6 +6,10 @@ import torch.utils.checkpoint as cp ...@@ -6,6 +6,10 @@ import torch.utils.checkpoint as cp
from mmcv.cnn import constant_init, kaiming_init from mmcv.cnn import constant_init, kaiming_init
from mmcv.runner import load_checkpoint from mmcv.runner import load_checkpoint
from mmdet.ops import DeformConv, ModulatedDeformConv
from ..registry import BACKBONES
from ..utils import build_norm_layer
def conv3x3(in_planes, out_planes, stride=1, dilation=1): def conv3x3(in_planes, out_planes, stride=1, dilation=1):
"3x3 convolution with padding" "3x3 convolution with padding"
...@@ -29,27 +33,41 @@ class BasicBlock(nn.Module): ...@@ -29,27 +33,41 @@ class BasicBlock(nn.Module):
dilation=1, dilation=1,
downsample=None, downsample=None,
style='pytorch', style='pytorch',
with_cp=False): with_cp=False,
normalize=dict(type='BN')):
super(BasicBlock, self).__init__() super(BasicBlock, self).__init__()
self.norm1_name, norm1 = build_norm_layer(normalize, planes, postfix=1)
self.norm2_name, norm2 = build_norm_layer(normalize, planes, postfix=2)
self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.conv1 = conv3x3(inplanes, planes, stride, dilation)
self.bn1 = nn.BatchNorm2d(planes) self.add_module(self.norm1_name, norm1)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes) self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes) self.add_module(self.norm2_name, norm2)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample self.downsample = downsample
self.stride = stride self.stride = stride
self.dilation = dilation self.dilation = dilation
assert not with_cp assert not with_cp
@property
def norm1(self):
return getattr(self, self.norm1_name)
@property
def norm2(self):
return getattr(self, self.norm2_name)
def forward(self, x): def forward(self, x):
identity = x identity = x
out = self.conv1(x) out = self.conv1(x)
out = self.bn1(out) out = self.norm1(out)
out = self.relu(out) out = self.relu(out)
out = self.conv2(out) out = self.conv2(out)
out = self.bn2(out) out = self.norm2(out)
if self.downsample is not None: if self.downsample is not None:
identity = self.downsample(x) identity = self.downsample(x)
...@@ -70,46 +88,101 @@ class Bottleneck(nn.Module): ...@@ -70,46 +88,101 @@ class Bottleneck(nn.Module):
dilation=1, dilation=1,
downsample=None, downsample=None,
style='pytorch', style='pytorch',
with_cp=False): with_cp=False,
normalize=dict(type='BN'),
dcn=None):
"""Bottleneck block for ResNet. """Bottleneck block for ResNet.
If style is "pytorch", the stride-two layer is the 3x3 conv layer, If style is "pytorch", the stride-two layer is the 3x3 conv layer,
if it is "caffe", the stride-two layer is the first 1x1 conv layer. if it is "caffe", the stride-two layer is the first 1x1 conv layer.
""" """
super(Bottleneck, self).__init__() super(Bottleneck, self).__init__()
assert style in ['pytorch', 'caffe'] assert style in ['pytorch', 'caffe']
assert dcn is None or isinstance(dcn, dict)
self.inplanes = inplanes self.inplanes = inplanes
self.planes = planes self.planes = planes
self.normalize = normalize
self.dcn = dcn
self.with_dcn = dcn is not None
if style == 'pytorch': if style == 'pytorch':
self.conv1_stride = 1 self.conv1_stride = 1
self.conv2_stride = stride self.conv2_stride = stride
else: else:
self.conv1_stride = stride self.conv1_stride = stride
self.conv2_stride = 1 self.conv2_stride = 1
self.norm1_name, norm1 = build_norm_layer(normalize, planes, postfix=1)
self.norm2_name, norm2 = build_norm_layer(normalize, planes, postfix=2)
self.norm3_name, norm3 = build_norm_layer(
normalize, planes * self.expansion, postfix=3)
self.conv1 = nn.Conv2d( self.conv1 = nn.Conv2d(
inplanes, inplanes,
planes, planes,
kernel_size=1, kernel_size=1,
stride=self.conv1_stride, stride=self.conv1_stride,
bias=False) bias=False)
self.conv2 = nn.Conv2d( self.add_module(self.norm1_name, norm1)
planes, fallback_on_stride = False
planes, self.with_modulated_dcn = False
kernel_size=3, if self.with_dcn:
stride=self.conv2_stride, fallback_on_stride = dcn.get('fallback_on_stride', False)
padding=dilation, self.with_modulated_dcn = dcn.get('modulated', False)
dilation=dilation, if not self.with_dcn or fallback_on_stride:
bias=False) self.conv2 = nn.Conv2d(
planes,
self.bn1 = nn.BatchNorm2d(planes) planes,
self.bn2 = nn.BatchNorm2d(planes) kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation,
bias=False)
else:
deformable_groups = dcn.get('deformable_groups', 1)
if not self.with_modulated_dcn:
conv_op = DeformConv
offset_channels = 18
else:
conv_op = ModulatedDeformConv
offset_channels = 27
self.conv2_offset = nn.Conv2d(
planes,
deformable_groups * offset_channels,
kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation)
self.conv2 = conv_op(
planes,
planes,
kernel_size=3,
stride=self.conv2_stride,
padding=dilation,
dilation=dilation,
deformable_groups=deformable_groups,
bias=False)
self.add_module(self.norm2_name, norm2)
self.conv3 = nn.Conv2d( self.conv3 = nn.Conv2d(
planes, planes * self.expansion, kernel_size=1, bias=False) planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.add_module(self.norm3_name, norm3)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=True)
self.downsample = downsample self.downsample = downsample
self.stride = stride self.stride = stride
self.dilation = dilation self.dilation = dilation
self.with_cp = with_cp self.with_cp = with_cp
self.normalize = normalize
@property
def norm1(self):
return getattr(self, self.norm1_name)
@property
def norm2(self):
return getattr(self, self.norm2_name)
@property
def norm3(self):
return getattr(self, self.norm3_name)
def forward(self, x): def forward(self, x):
...@@ -117,15 +190,24 @@ class Bottleneck(nn.Module): ...@@ -117,15 +190,24 @@ class Bottleneck(nn.Module):
identity = x identity = x
out = self.conv1(x) out = self.conv1(x)
out = self.bn1(out) out = self.norm1(out)
out = self.relu(out) out = self.relu(out)
out = self.conv2(out) if not self.with_dcn:
out = self.bn2(out) out = self.conv2(out)
elif self.with_modulated_dcn:
offset_mask = self.conv2_offset(out)
offset = offset_mask[:, :18, :, :]
mask = offset_mask[:, -9:, :, :].sigmoid()
out = self.conv2(out, offset, mask)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.norm2(out)
out = self.relu(out) out = self.relu(out)
out = self.conv3(out) out = self.conv3(out)
out = self.bn3(out) out = self.norm3(out)
if self.downsample is not None: if self.downsample is not None:
identity = self.downsample(x) identity = self.downsample(x)
...@@ -151,7 +233,9 @@ def make_res_layer(block, ...@@ -151,7 +233,9 @@ def make_res_layer(block,
stride=1, stride=1,
dilation=1, dilation=1,
style='pytorch', style='pytorch',
with_cp=False): with_cp=False,
normalize=dict(type='BN'),
dcn=None):
downsample = None downsample = None
if stride != 1 or inplanes != planes * block.expansion: if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential( downsample = nn.Sequential(
...@@ -161,7 +245,7 @@ def make_res_layer(block, ...@@ -161,7 +245,7 @@ def make_res_layer(block,
kernel_size=1, kernel_size=1,
stride=stride, stride=stride,
bias=False), bias=False),
nn.BatchNorm2d(planes * block.expansion), build_norm_layer(normalize, planes * block.expansion)[1],
) )
layers = [] layers = []
...@@ -173,15 +257,26 @@ def make_res_layer(block, ...@@ -173,15 +257,26 @@ def make_res_layer(block,
dilation, dilation,
downsample, downsample,
style=style, style=style,
with_cp=with_cp)) with_cp=with_cp,
normalize=normalize,
dcn=dcn))
inplanes = planes * block.expansion inplanes = planes * block.expansion
for i in range(1, blocks): for i in range(1, blocks):
layers.append( layers.append(
block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) block(
inplanes,
planes,
1,
dilation,
style=style,
with_cp=with_cp,
normalize=normalize,
dcn=dcn))
return nn.Sequential(*layers) return nn.Sequential(*layers)
@BACKBONES.register_module
class ResNet(nn.Module): class ResNet(nn.Module):
"""ResNet backbone. """ResNet backbone.
...@@ -196,11 +291,14 @@ class ResNet(nn.Module): ...@@ -196,11 +291,14 @@ class ResNet(nn.Module):
the first 1x1 conv layer. the first 1x1 conv layer.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters. not freezing any parameters.
bn_eval (bool): Whether to set BN layers to eval mode, namely, freeze normalize (dict): dictionary to construct and config norm layer.
running stats (mean and var). norm_eval (bool): Whether to set norm layers to eval mode, namely,
bn_frozen (bool): Whether to freeze weight and bias of BN layers. freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. memory while slowing down the training speed.
zero_init_residual (bool): whether to use zero init for last norm layer
in resblocks to let them behave as identity.
""" """
arch_settings = { arch_settings = {
...@@ -219,9 +317,12 @@ class ResNet(nn.Module): ...@@ -219,9 +317,12 @@ class ResNet(nn.Module):
out_indices=(0, 1, 2, 3), out_indices=(0, 1, 2, 3),
style='pytorch', style='pytorch',
frozen_stages=-1, frozen_stages=-1,
bn_eval=True, normalize=dict(type='BN', frozen=False),
bn_frozen=False, norm_eval=True,
with_cp=False): dcn=None,
stage_with_dcn=(False, False, False, False),
with_cp=False,
zero_init_residual=True):
super(ResNet, self).__init__() super(ResNet, self).__init__()
if depth not in self.arch_settings: if depth not in self.arch_settings:
raise KeyError('invalid depth {} for resnet'.format(depth)) raise KeyError('invalid depth {} for resnet'.format(depth))
...@@ -230,29 +331,29 @@ class ResNet(nn.Module): ...@@ -230,29 +331,29 @@ class ResNet(nn.Module):
assert num_stages >= 1 and num_stages <= 4 assert num_stages >= 1 and num_stages <= 4
self.strides = strides self.strides = strides
self.dilations = dilations self.dilations = dilations
assert len(strides) == len(dilations) == num_stages assert len(strides) == len(dilations) == len(
stage_with_dcn) == num_stages
self.out_indices = out_indices self.out_indices = out_indices
assert max(out_indices) < num_stages assert max(out_indices) < num_stages
self.style = style self.style = style
self.frozen_stages = frozen_stages self.frozen_stages = frozen_stages
self.bn_eval = bn_eval self.normalize = normalize
self.bn_frozen = bn_frozen
self.with_cp = with_cp self.with_cp = with_cp
self.norm_eval = norm_eval
self.dcn = dcn
self.stage_with_dcn = stage_with_dcn
self.zero_init_residual = zero_init_residual
self.block, stage_blocks = self.arch_settings[depth] self.block, stage_blocks = self.arch_settings[depth]
self.stage_blocks = stage_blocks[:num_stages] self.stage_blocks = stage_blocks[:num_stages]
self.inplanes = 64 self.inplanes = 64
self.conv1 = nn.Conv2d( self._make_stem_layer()
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.res_layers = [] self.res_layers = []
for i, num_blocks in enumerate(self.stage_blocks): for i, num_blocks in enumerate(self.stage_blocks):
stride = strides[i] stride = strides[i]
dilation = dilations[i] dilation = dilations[i]
dcn = self.dcn if self.stage_with_dcn[i] else None
planes = 64 * 2**i planes = 64 * 2**i
res_layer = make_res_layer( res_layer = make_res_layer(
self.block, self.block,
...@@ -262,15 +363,43 @@ class ResNet(nn.Module): ...@@ -262,15 +363,43 @@ class ResNet(nn.Module):
stride=stride, stride=stride,
dilation=dilation, dilation=dilation,
style=self.style, style=self.style,
with_cp=with_cp) with_cp=with_cp,
normalize=normalize,
dcn=dcn)
self.inplanes = planes * self.block.expansion self.inplanes = planes * self.block.expansion
layer_name = 'layer{}'.format(i + 1) layer_name = 'layer{}'.format(i + 1)
self.add_module(layer_name, res_layer) self.add_module(layer_name, res_layer)
self.res_layers.append(layer_name) self.res_layers.append(layer_name)
self._freeze_stages()
self.feat_dim = self.block.expansion * 64 * 2**( self.feat_dim = self.block.expansion * 64 * 2**(
len(self.stage_blocks) - 1) len(self.stage_blocks) - 1)
@property
def norm1(self):
return getattr(self, self.norm1_name)
def _make_stem_layer(self):
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.norm1_name, norm1 = build_norm_layer(
self.normalize, 64, postfix=1)
self.add_module(self.norm1_name, norm1)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def _freeze_stages(self):
if self.frozen_stages >= 0:
for m in [self.conv1, self.norm1]:
for param in m.parameters():
param.requires_grad = False
for i in range(1, self.frozen_stages + 1):
m = getattr(self, 'layer{}'.format(i))
for param in m.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None): def init_weights(self, pretrained=None):
if isinstance(pretrained, str): if isinstance(pretrained, str):
logger = logging.getLogger() logger = logging.getLogger()
...@@ -279,14 +408,27 @@ class ResNet(nn.Module): ...@@ -279,14 +408,27 @@ class ResNet(nn.Module):
for m in self.modules(): for m in self.modules():
if isinstance(m, nn.Conv2d): if isinstance(m, nn.Conv2d):
kaiming_init(m) kaiming_init(m)
elif isinstance(m, nn.BatchNorm2d): elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
constant_init(m, 1) constant_init(m, 1)
if self.dcn is not None:
for m in self.modules():
if isinstance(m, Bottleneck) and hasattr(
m, 'conv2_offset'):
constant_init(m.conv2_offset, 0)
if self.zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
constant_init(m.norm3, 0)
elif isinstance(m, BasicBlock):
constant_init(m.norm2, 0)
else: else:
raise TypeError('pretrained must be a str or None') raise TypeError('pretrained must be a str or None')
def forward(self, x): def forward(self, x):
x = self.conv1(x) x = self.conv1(x)
x = self.bn1(x) x = self.norm1(x)
x = self.relu(x) x = self.relu(x)
x = self.maxpool(x) x = self.maxpool(x)
outs = [] outs = []
...@@ -302,23 +444,8 @@ class ResNet(nn.Module): ...@@ -302,23 +444,8 @@ class ResNet(nn.Module):
def train(self, mode=True): def train(self, mode=True):
super(ResNet, self).train(mode) super(ResNet, self).train(mode)
if self.bn_eval: if mode and self.norm_eval:
for m in self.modules(): for m in self.modules():
# trick: eval have effect on BatchNorm only
if isinstance(m, nn.BatchNorm2d): if isinstance(m, nn.BatchNorm2d):
m.eval() m.eval()
if self.bn_frozen:
for params in m.parameters():
params.requires_grad = False
if mode and self.frozen_stages >= 0:
for param in self.conv1.parameters():
param.requires_grad = False
for param in self.bn1.parameters():
param.requires_grad = False
self.bn1.eval()
self.bn1.weight.requires_grad = False
self.bn1.bias.requires_grad = False
for i in range(1, self.frozen_stages + 1):
mod = getattr(self, 'layer{}'.format(i))
mod.eval()
for param in mod.parameters():
param.requires_grad = False
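Net effect of this hunk: parameter freezing moves out of train() (it is handled once by _freeze_stages in __init__), and norm_eval only toggles BatchNorm statistics. A sketch of the resulting behaviour, assuming the constructor arguments shown earlier:

    model = ResNet(depth=50, frozen_stages=1, norm_eval=True)
    model.train()
    # BatchNorm layers are put back into eval mode, so running mean/var
    # stay fixed, while stem + layer1 remain frozen from construction.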
...@@ -2,8 +2,11 @@ import math ...@@ -2,8 +2,11 @@ import math
import torch.nn as nn import torch.nn as nn
from .resnet import ResNet from mmdet.ops import DeformConv, ModulatedDeformConv
from .resnet import Bottleneck as _Bottleneck from .resnet import Bottleneck as _Bottleneck
from .resnet import ResNet
from ..registry import BACKBONES
from ..utils import build_norm_layer
class Bottleneck(_Bottleneck): class Bottleneck(_Bottleneck):
...@@ -20,26 +23,65 @@ class Bottleneck(_Bottleneck): ...@@ -20,26 +23,65 @@ class Bottleneck(_Bottleneck):
else: else:
width = math.floor(self.planes * (base_width / 64)) * groups width = math.floor(self.planes * (base_width / 64)) * groups
self.norm1_name, norm1 = build_norm_layer(
self.normalize, width, postfix=1)
self.norm2_name, norm2 = build_norm_layer(
self.normalize, width, postfix=2)
self.norm3_name, norm3 = build_norm_layer(
self.normalize, self.planes * self.expansion, postfix=3)
self.conv1 = nn.Conv2d( self.conv1 = nn.Conv2d(
self.inplanes, self.inplanes,
width, width,
kernel_size=1, kernel_size=1,
stride=self.conv1_stride, stride=self.conv1_stride,
bias=False) bias=False)
self.bn1 = nn.BatchNorm2d(width) self.add_module(self.norm1_name, norm1)
self.conv2 = nn.Conv2d(
    width,
    width,
    kernel_size=3,
    stride=self.conv2_stride,
    padding=self.dilation,
    dilation=self.dilation,
    groups=groups,
    bias=False)
self.bn2 = nn.BatchNorm2d(width)
fallback_on_stride = False
self.with_modulated_dcn = False
if self.with_dcn:
    fallback_on_stride = self.dcn.get('fallback_on_stride', False)
    self.with_modulated_dcn = self.dcn.get('modulated', False)
if not self.with_dcn or fallback_on_stride:
    self.conv2 = nn.Conv2d(
        width,
        width,
        kernel_size=3,
        stride=self.conv2_stride,
        padding=self.dilation,
        dilation=self.dilation,
        groups=groups,
        bias=False)
else:
    groups = self.dcn.get('groups', 1)
    deformable_groups = self.dcn.get('deformable_groups', 1)
    if not self.with_modulated_dcn:
        conv_op = DeformConv
        offset_channels = 18
    else:
        conv_op = ModulatedDeformConv
        offset_channels = 27
    self.conv2_offset = nn.Conv2d(
        width,
        deformable_groups * offset_channels,
        kernel_size=3,
        stride=self.conv2_stride,
        padding=self.dilation,
        dilation=self.dilation)
    self.conv2 = conv_op(
        width,
        width,
        kernel_size=3,
        stride=self.conv2_stride,
        padding=self.dilation,
        dilation=self.dilation,
        groups=groups,
        deformable_groups=deformable_groups,
        bias=False)
self.add_module(self.norm2_name, norm2)
self.conv3 = nn.Conv2d( self.conv3 = nn.Conv2d(
width, self.planes * self.expansion, kernel_size=1, bias=False) width, self.planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(self.planes * self.expansion) self.add_module(self.norm3_name, norm3)
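For reference, the offset-channel counts above follow directly from the 3x3 kernel:

    # DeformConv predicts an (dx, dy) offset per kernel sampling point;
    # ModulatedDeformConv additionally predicts a scalar modulation mask.
    kernel_points = 3 * 3
    assert 2 * kernel_points == 18  # plain DCN: (dx, dy) per point
    assert 3 * kernel_points == 27  # modulated DCN: (dx, dy, mask) per point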
def make_res_layer(block, def make_res_layer(block,
...@@ -51,7 +93,9 @@ def make_res_layer(block, ...@@ -51,7 +93,9 @@ def make_res_layer(block,
groups=1, groups=1,
base_width=4, base_width=4,
style='pytorch', style='pytorch',
with_cp=False): with_cp=False,
normalize=dict(type='BN'),
dcn=None):
downsample = None downsample = None
if stride != 1 or inplanes != planes * block.expansion: if stride != 1 or inplanes != planes * block.expansion:
downsample = nn.Sequential( downsample = nn.Sequential(
...@@ -61,7 +105,7 @@ def make_res_layer(block, ...@@ -61,7 +105,7 @@ def make_res_layer(block,
kernel_size=1, kernel_size=1,
stride=stride, stride=stride,
bias=False), bias=False),
nn.BatchNorm2d(planes * block.expansion), build_norm_layer(normalize, planes * block.expansion)[1],
) )
layers = [] layers = []
...@@ -75,7 +119,9 @@ def make_res_layer(block, ...@@ -75,7 +119,9 @@ def make_res_layer(block,
groups=groups, groups=groups,
base_width=base_width, base_width=base_width,
style=style, style=style,
with_cp=with_cp)) with_cp=with_cp,
normalize=normalize,
dcn=dcn))
inplanes = planes * block.expansion inplanes = planes * block.expansion
for i in range(1, blocks): for i in range(1, blocks):
layers.append( layers.append(
...@@ -87,11 +133,14 @@ def make_res_layer(block, ...@@ -87,11 +133,14 @@ def make_res_layer(block,
groups=groups, groups=groups,
base_width=base_width, base_width=base_width,
style=style, style=style,
with_cp=with_cp)) with_cp=with_cp,
normalize=normalize,
dcn=dcn))
return nn.Sequential(*layers) return nn.Sequential(*layers)
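A hypothetical call for one ResNeXt-50 stage, showing how the new normalize/dcn arguments thread through (the positional parameters inplanes/planes/blocks are elided in the hunk above, so their order here is an assumption):

    layer2 = make_res_layer(
        Bottleneck,
        inplanes=256,
        planes=128,
        blocks=4,
        stride=2,
        groups=32,
        base_width=4,
        style='pytorch',
        normalize=dict(type='BN'),
        dcn=None)  # or e.g. dict(modulated=True, deformable_groups=1)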
@BACKBONES.register_module
class ResNeXt(ResNet): class ResNeXt(ResNet):
"""ResNeXt backbone. """ResNeXt backbone.
...@@ -108,11 +157,14 @@ class ResNeXt(ResNet): ...@@ -108,11 +157,14 @@ class ResNeXt(ResNet):
the first 1x1 conv layer. the first 1x1 conv layer.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters. not freezing any parameters.
bn_eval (bool): Whether to set BN layers to eval mode, namely, freeze
    running stats (mean and var).
bn_frozen (bool): Whether to freeze weight and bias of BN layers.
normalize (dict): Dictionary used to construct and configure the norm layer.
norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze
    running stats (mean and var). Note: this affects BatchNorm and its
    variants only.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. memory while slowing down the training speed.
zero_init_residual (bool): Whether to use zero init for the last norm
    layer in resblocks, letting them behave as identity mappings.
""" """
arch_settings = { arch_settings = {
...@@ -131,6 +183,7 @@ class ResNeXt(ResNet): ...@@ -131,6 +183,7 @@ class ResNeXt(ResNet):
for i, num_blocks in enumerate(self.stage_blocks): for i, num_blocks in enumerate(self.stage_blocks):
stride = self.strides[i] stride = self.strides[i]
dilation = self.dilations[i] dilation = self.dilations[i]
dcn = self.dcn if self.stage_with_dcn[i] else None
planes = 64 * 2**i planes = 64 * 2**i
res_layer = make_res_layer( res_layer = make_res_layer(
self.block, self.block,
...@@ -142,8 +195,12 @@ class ResNeXt(ResNet): ...@@ -142,8 +195,12 @@ class ResNeXt(ResNet):
groups=self.groups, groups=self.groups,
base_width=self.base_width, base_width=self.base_width,
style=self.style, style=self.style,
with_cp=self.with_cp) with_cp=self.with_cp,
normalize=self.normalize,
dcn=dcn)
self.inplanes = planes * self.block.expansion self.inplanes = planes * self.block.expansion
layer_name = 'layer{}'.format(i + 1) layer_name = 'layer{}'.format(i + 1)
self.add_module(layer_name, res_layer) self.add_module(layer_name, res_layer)
self.res_layers.append(layer_name) self.res_layers.append(layer_name)
self._freeze_stages()
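A hypothetical construction enabling DCN on the last three stages (the stage_with_dcn argument is inferred from the dcn = self.dcn if self.stage_with_dcn[i] line above):

    model = ResNeXt(
        depth=50,
        groups=32,
        base_width=4,
        dcn=dict(modulated=False, deformable_groups=1,
                 fallback_on_stride=False),
        stage_with_dcn=(False, True, True, True))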
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (VGG, xavier_init, constant_init, kaiming_init,
normal_init)
from mmcv.runner import load_checkpoint
from ..registry import BACKBONES
@BACKBONES.register_module
class SSDVGG(VGG):
extra_setting = {
300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256),
512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128),
}
def __init__(self,
input_size,
depth,
with_last_pool=False,
ceil_mode=True,
out_indices=(3, 4),
out_feature_indices=(22, 34),
l2_norm_scale=20.):
super(SSDVGG, self).__init__(
depth,
with_last_pool=with_last_pool,
ceil_mode=ceil_mode,
out_indices=out_indices)
assert input_size in (300, 512)
self.input_size = input_size
self.features.add_module(
str(len(self.features)),
nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
self.features.add_module(
str(len(self.features)),
nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
self.features.add_module(
str(len(self.features)), nn.ReLU(inplace=True))
self.features.add_module(
str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1))
self.features.add_module(
str(len(self.features)), nn.ReLU(inplace=True))
self.out_feature_indices = out_feature_indices
self.inplanes = 1024
self.extra = self._make_extra_layers(self.extra_setting[input_size])
self.l2_norm = L2Norm(
self.features[out_feature_indices[0] - 1].out_channels,
l2_norm_scale)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
for m in self.features.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, nn.BatchNorm2d):
constant_init(m, 1)
elif isinstance(m, nn.Linear):
normal_init(m, std=0.01)
else:
raise TypeError('pretrained must be a str or None')
for m in self.extra.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
constant_init(self.l2_norm, self.l2_norm.scale)
def forward(self, x):
outs = []
for i, layer in enumerate(self.features):
x = layer(x)
if i in self.out_feature_indices:
outs.append(x)
for i, layer in enumerate(self.extra):
x = F.relu(layer(x), inplace=True)
if i % 2 == 1:
outs.append(x)
outs[0] = self.l2_norm(outs[0])
if len(outs) == 1:
return outs[0]
else:
return tuple(outs)
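With the defaults above, this forward pass yields six feature maps for SSD300: two from the VGG body (feature indices 22 and 34) and one after every second extra conv (i % 2 == 1). A rough sketch, assuming those defaults:

    import torch

    model = SSDVGG(input_size=300, depth=16)
    model.init_weights()
    feats = model(torch.randn(1, 3, 300, 300))
    # len(feats) == 6 with the default out_feature_indices and extras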
def _make_extra_layers(self, outplanes):
layers = []
kernel_sizes = (1, 3)
num_layers = 0
outplane = None
for i in range(len(outplanes)):
if self.inplanes == 'S':
self.inplanes = outplane
continue
k = kernel_sizes[num_layers % 2]
if outplanes[i] == 'S':
outplane = outplanes[i + 1]
conv = nn.Conv2d(
self.inplanes, outplane, k, stride=2, padding=1)
else:
outplane = outplanes[i]
conv = nn.Conv2d(
self.inplanes, outplane, k, stride=1, padding=0)
layers.append(conv)
self.inplanes = outplanes[i]
num_layers += 1
if self.input_size == 512:
layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1))
return nn.Sequential(*layers)
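The 'S' entries in extra_setting are stride markers: the next conv uses stride=2 with padding=1 and takes its out_channels from the entry following 'S'; kernel sizes alternate 1, 3 via kernel_sizes[num_layers % 2]. Traced for input_size=300:

    # (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256) expands to:
    #   Conv2d(1024, 256, 1)   Conv2d(256, 512, 3, stride=2, padding=1)
    #   Conv2d(512, 128, 1)    Conv2d(128, 256, 3, stride=2, padding=1)
    #   Conv2d(256, 128, 1)    Conv2d(128, 256, 3)
    #   Conv2d(256, 128, 1)    Conv2d(128, 256, 3)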
class L2Norm(nn.Module):
def __init__(self, n_dims, scale=20., eps=1e-10):
super(L2Norm, self).__init__()
self.n_dims = n_dims
self.weight = nn.Parameter(torch.Tensor(self.n_dims))
self.eps = eps
self.scale = scale
def forward(self, x):
norm = x.pow(2).sum(1, keepdim=True).sqrt() + self.eps
return self.weight[None, :, None, None].expand_as(x) * x / norm
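L2Norm normalises each spatial position to unit length across channels, then applies a learnable per-channel scale (set to l2_norm_scale=20 by constant_init in init_weights above). A quick check, assuming those defaults:

    import torch
    import torch.nn as nn

    l2 = L2Norm(512)
    nn.init.constant_(l2.weight, l2.scale)  # mirrors constant_init above
    out = l2(torch.randn(2, 512, 38, 38))
    # every channel vector out[n, :, i, j] now has L2 norm ~= 20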
...@@ -4,8 +4,10 @@ import torch.nn.functional as F ...@@ -4,8 +4,10 @@ import torch.nn.functional as F
from mmdet.core import (delta2bbox, multiclass_nms, bbox_target, from mmdet.core import (delta2bbox, multiclass_nms, bbox_target,
weighted_cross_entropy, weighted_smoothl1, accuracy) weighted_cross_entropy, weighted_smoothl1, accuracy)
from ..registry import HEADS
@HEADS.register_module
class BBoxHead(nn.Module): class BBoxHead(nn.Module):
"""Simplest RoI head, with only two fc layers for classification and """Simplest RoI head, with only two fc layers for classification and
regression respectively""" regression respectively"""
...@@ -78,8 +80,14 @@ class BBoxHead(nn.Module): ...@@ -78,8 +80,14 @@ class BBoxHead(nn.Module):
target_stds=self.target_stds) target_stds=self.target_stds)
return cls_reg_targets return cls_reg_targets
def loss(self, cls_score, bbox_pred, labels, label_weights, bbox_targets,
         bbox_weights, reduce=True):
def loss(self,
         cls_score,
         bbox_pred,
         labels,
         label_weights,
         bbox_targets,
         bbox_weights,
         reduce=True):
losses = dict() losses = dict()
if cls_score is not None: if cls_score is not None:
losses['loss_cls'] = weighted_cross_entropy( losses['loss_cls'] = weighted_cross_entropy(
......