# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Code is copy-pasted exactly as in torch.utils.data.distributed.
# FIXME remove this once c10d fixes the bug it has
import math
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
class DistributedSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
"""
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.shuffle = shuffle
def __iter__(self):
if self.shuffle:
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
indices += indices[: (self.total_size - len(indices))]
assert len(indices) == self.total_size
# subsample
offset = self.num_samples * self.rank
indices = indices[offset: offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
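# Usage sketch (illustrative, not part of the original file): wiring the sampler
# into a DataLoader under torch.distributed. `my_dataset` and `num_epochs` are
# hypothetical; rank and world size are read from the initialized process group.
#
#   sampler = DistributedSampler(my_dataset, shuffle=True)
#   loader = torch.utils.data.DataLoader(my_dataset, batch_size=32, sampler=sampler)
#   for epoch in range(num_epochs):
#       sampler.set_epoch(epoch)  # re-seeds the shuffle so each epoch differs
#       for images, targets in loader:
#           ...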
from torch.utils.data.sampler import BatchSampler
class IterationBasedBatchSampler(BatchSampler):
"""
Wraps a BatchSampler, re-sampling from it until
a specified number of iterations have been sampled
"""
def __init__(self, batch_sampler, num_iterations, start_iter=0):
self.batch_sampler = batch_sampler
self.num_iterations = num_iterations
self.start_iter = start_iter
def __iter__(self):
iteration = self.start_iter
while iteration <= self.num_iterations:
# if the underlying sampler has a set_epoch method, like
# DistributedSampler, used for making each process see
# a different split of the dataset, then set it
if hasattr(self.batch_sampler.sampler, "set_epoch"):
self.batch_sampler.sampler.set_epoch(iteration)
for batch in self.batch_sampler:
iteration += 1
if iteration > self.num_iterations:
break
yield batch
def __len__(self):
return self.num_iterations
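# Usage sketch (illustrative, not part of the original file): wrapping a plain
# BatchSampler so an iteration-based training loop can run past a single pass
# over the data. `my_dataset` and the iteration budget are hypothetical.
#
#   from torch.utils.data.sampler import RandomSampler
#   batch_sampler = BatchSampler(RandomSampler(my_dataset), batch_size=32, drop_last=True)
#   batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations=120000)
#   loader = torch.utils.data.DataLoader(my_dataset, batch_sampler=batch_sampler)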
from ssd.modeling.anchors.prior_box import PriorBox
from .target_transform import SSDTargetTransform
from .transforms import *
def build_transforms(cfg, is_train=True):
if is_train:
transform = [
ConvertFromInts(),
PhotometricDistort(),
Expand(cfg.INPUT.PIXEL_MEAN),
RandomSampleCrop(),
RandomMirror(),
ToPercentCoords(),
Resize(cfg.INPUT.IMAGE_SIZE),
SubtractMeans(cfg.INPUT.PIXEL_MEAN),
ToTensor(),
]
else:
transform = [
Resize(cfg.INPUT.IMAGE_SIZE),
SubtractMeans(cfg.INPUT.PIXEL_MEAN),
ToTensor()
]
transform = Compose(transform)
return transform
def build_target_transform(cfg):
transform = SSDTargetTransform(PriorBox(cfg)(),
cfg.MODEL.CENTER_VARIANCE,
cfg.MODEL.SIZE_VARIANCE,
cfg.MODEL.THRESHOLD)
return transform
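# Usage sketch (illustrative, not part of the original file): how the two
# factories are typically handed to a dataset. `cfg` is the project config and
# `VOCDataset`/`data_dir` stand in for whichever dataset class this repo builds.
#
#   train_transform = build_transforms(cfg, is_train=True)
#   target_transform = build_target_transform(cfg)
#   dataset = VOCDataset(data_dir, split='train',
#                        transform=train_transform,
#                        target_transform=target_transform)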
import numpy as np
import torch
from ssd.utils import box_utils
class SSDTargetTransform:
def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold):
self.center_form_priors = center_form_priors
self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors)
self.center_variance = center_variance
self.size_variance = size_variance
self.iou_threshold = iou_threshold
def __call__(self, gt_boxes, gt_labels):
if type(gt_boxes) is np.ndarray:
gt_boxes = torch.from_numpy(gt_boxes)
if type(gt_labels) is np.ndarray:
gt_labels = torch.from_numpy(gt_labels)
boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels,
self.corner_form_priors, self.iou_threshold)
boxes = box_utils.corner_form_to_center_form(boxes)
locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, self.size_variance)
return locations, labels
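# Shape sketch (illustrative, not part of the original file): with N ground-truth
# boxes and P priors, assign_priors matches every prior to a gt box (background
# where the best IoU falls below the threshold), so the per-image targets have
# fixed shapes regardless of N:
#
#   gt_boxes (N, 4) corner-form, gt_labels (N,)  ->  locations (P, 4), labels (P,)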
# from https://github.com/amdegroot/ssd.pytorch
import torch
from torchvision import transforms
import cv2
import numpy as np
import types
from numpy import random
def intersect(box_a, box_b):
max_xy = np.minimum(box_a[:, 2:], box_b[2:])
min_xy = np.maximum(box_a[:, :2], box_b[:2])
inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
return inter[:, 0] * inter[:, 1]
def jaccard_numpy(box_a, box_b):
"""Compute the jaccard overlap of two sets of boxes. The jaccard overlap
is simply the intersection over union of two boxes.
E.g.:
A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
Args:
box_a: Multiple bounding boxes, Shape: [num_boxes,4]
box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap: Shape: [box_a.shape[0]]
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2] - box_a[:, 0]) *
              (box_a[:, 3] - box_a[:, 1]))  # [A]
    area_b = ((box_b[2] - box_b[0]) *
              (box_b[3] - box_b[1]))  # scalar
    union = area_a + area_b - inter
    return inter / union  # [A]
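def _demo_jaccard():
    # Illustrative check (hypothetical helper, not in the original file): a
    # 10x10 box half-covered by the query box gives IoU = 50 / 150 = 1/3.
    box_a = np.array([[0., 0., 10., 10.]])
    box_b = np.array([0., 5., 10., 15.])
    assert np.isclose(jaccard_numpy(box_a, box_b)[0], 1.0 / 3.0)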
def remove_empty_boxes(boxes, labels):
"""Removes bounding boxes of W or H equal to 0 and its labels
Args:
boxes (ndarray): NP Array with bounding boxes as lines
* BBOX[x1, y1, x2, y2]
labels (labels): Corresponding labels with boxes
Returns:
ndarray: Valid bounding boxes
ndarray: Corresponding labels
"""
del_boxes = []
for idx, box in enumerate(boxes):
if box[0] == box[2] or box[1] == box[3]:
del_boxes.append(idx)
return np.delete(boxes, del_boxes, 0), np.delete(labels, del_boxes)
class Compose(object):
"""Composes several augmentations together.
Args:
transforms (List[Transform]): list of transforms to compose.
Example:
>>> augmentations.Compose([
>>> transforms.CenterCrop(10),
>>> transforms.ToTensor(),
>>> ])
"""
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, img, boxes=None, labels=None):
for t in self.transforms:
img, boxes, labels = t(img, boxes, labels)
if boxes is not None:
boxes, labels = remove_empty_boxes(boxes, labels)
return img, boxes, labels
class Lambda(object):
"""Applies a lambda as a transform."""
def __init__(self, lambd):
assert isinstance(lambd, types.LambdaType)
self.lambd = lambd
def __call__(self, img, boxes=None, labels=None):
return self.lambd(img, boxes, labels)
class ConvertFromInts(object):
def __call__(self, image, boxes=None, labels=None):
return image.astype(np.float32), boxes, labels
class SubtractMeans(object):
def __init__(self, mean):
self.mean = np.array(mean, dtype=np.float32)
def __call__(self, image, boxes=None, labels=None):
image = image.astype(np.float32)
image -= self.mean
return image.astype(np.float32), boxes, labels
class ToAbsoluteCoords(object):
def __call__(self, image, boxes=None, labels=None):
height, width, channels = image.shape
boxes[:, 0] *= width
boxes[:, 2] *= width
boxes[:, 1] *= height
boxes[:, 3] *= height
return image, boxes, labels
class ToPercentCoords(object):
def __call__(self, image, boxes=None, labels=None):
height, width, channels = image.shape
boxes[:, 0] /= width
boxes[:, 2] /= width
boxes[:, 1] /= height
boxes[:, 3] /= height
return image, boxes, labels
class Resize(object):
def __init__(self, size=300):
self.size = size
def __call__(self, image, boxes=None, labels=None):
image = cv2.resize(image, (self.size,
self.size))
return image, boxes, labels
class RandomSaturation(object):
def __init__(self, lower=0.5, upper=1.5):
self.lower = lower
self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."
def __call__(self, image, boxes=None, labels=None):
if random.randint(2):
image[:, :, 1] *= random.uniform(self.lower, self.upper)
return image, boxes, labels
class RandomHue(object):
def __init__(self, delta=18.0):
assert delta >= 0.0 and delta <= 360.0
self.delta = delta
def __call__(self, image, boxes=None, labels=None):
if random.randint(2):
image[:, :, 0] += random.uniform(-self.delta, self.delta)
image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
return image, boxes, labels
class RandomLightingNoise(object):
def __init__(self):
self.perms = ((0, 1, 2), (0, 2, 1),
(1, 0, 2), (1, 2, 0),
(2, 0, 1), (2, 1, 0))
def __call__(self, image, boxes=None, labels=None):
if random.randint(2):
swap = self.perms[random.randint(len(self.perms))]
shuffle = SwapChannels(swap) # shuffle channels
image = shuffle(image)
return image, boxes, labels
class ConvertColor(object):
def __init__(self, current, transform):
self.transform = transform
self.current = current
def __call__(self, image, boxes=None, labels=None):
if self.current == 'BGR' and self.transform == 'HSV':
image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
elif self.current == 'RGB' and self.transform == 'HSV':
image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
elif self.current == 'BGR' and self.transform == 'RGB':
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
elif self.current == 'HSV' and self.transform == 'BGR':
image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
elif self.current == 'HSV' and self.transform == "RGB":
image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB)
else:
raise NotImplementedError
return image, boxes, labels
class RandomContrast(object):
def __init__(self, lower=0.5, upper=1.5):
self.lower = lower
self.upper = upper
assert self.upper >= self.lower, "contrast upper must be >= lower."
assert self.lower >= 0, "contrast lower must be non-negative."
# expects float image
def __call__(self, image, boxes=None, labels=None):
if random.randint(2):
alpha = random.uniform(self.lower, self.upper)
image *= alpha
return image, boxes, labels
class RandomBrightness(object):
def __init__(self, delta=32):
assert delta >= 0.0
assert delta <= 255.0
self.delta = delta
def __call__(self, image, boxes=None, labels=None):
if random.randint(2):
delta = random.uniform(-self.delta, self.delta)
image += delta
return image, boxes, labels
class ToCV2Image(object):
def __call__(self, tensor, boxes=None, labels=None):
return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels
class ToTensor(object):
def __call__(self, cvimage, boxes=None, labels=None):
return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels
class RandomSampleCrop(object):
"""Crop
Arguments:
img (Image): the image being input during training
boxes (Tensor): the original bounding boxes in pt form
labels (Tensor): the class labels for each bbox
mode (float tuple): the min and max jaccard overlaps
Return:
(img, boxes, classes)
img (Image): the cropped image
boxes (Tensor): the adjusted bounding boxes in pt form
labels (Tensor): the class labels for each bbox
"""
def __init__(self):
self.sample_options = (
# using entire original input image
None,
            # sample a patch s.t. MIN jaccard w/ obj in .1, .3, .7, .9
(0.1, None),
(0.3, None),
(0.7, None),
(0.9, None),
# randomly sample a patch
(None, None),
)
def __call__(self, image, boxes=None, labels=None):
# guard against no boxes
if boxes is not None and boxes.shape[0] == 0:
return image, boxes, labels
height, width, _ = image.shape
while True:
# randomly choose a mode
mode = self.sample_options[random.randint(0, len(self.sample_options))]
if mode is None:
return image, boxes, labels
min_iou, max_iou = mode
if min_iou is None:
min_iou = float('-inf')
if max_iou is None:
max_iou = float('inf')
            # max trials (50)
for _ in range(50):
current_image = image
w = random.uniform(0.3 * width, width)
h = random.uniform(0.3 * height, height)
# aspect ratio constraint b/t .5 & 2
if h / w < 0.5 or h / w > 2:
continue
                left = random.uniform(0, width - w)
                top = random.uniform(0, height - h)
# convert to integer rect x1,y1,x2,y2
rect = np.array([int(left), int(top), int(left + w), int(top + h)])
# calculate IoU (jaccard overlap) b/t the cropped and gt boxes
overlap = jaccard_numpy(boxes, rect)
# is min and max overlap constraint satisfied? if not try again
if overlap.max() < min_iou or overlap.min() > max_iou:
continue
# cut the crop from the image
current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
:]
# keep overlap with gt box IF center in sampled patch
centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
                # mask in all gt boxes whose centers are below and to the right of the crop's top-left corner
                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
                # mask in all gt boxes whose centers are above and to the left of the crop's bottom-right corner
                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
                # keep boxes where both hold, i.e. the center lies inside the crop
                mask = m1 * m2
# have any valid boxes? try again if not
if not mask.any():
continue
# take only matching gt boxes
current_boxes = boxes[mask, :].copy()
# take only matching gt labels
current_labels = labels[mask]
                # clamp the box's left/top corner to the crop's
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
                                                  rect[:2])
                # adjust to crop (by subtracting crop's left,top)
                current_boxes[:, :2] -= rect[:2]
                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
                                                  rect[2:])
                # adjust to crop (by subtracting crop's left,top)
                current_boxes[:, 2:] -= rect[:2]
return current_image, current_boxes, current_labels
class Expand(object):
def __init__(self, mean):
self.mean = mean
def __call__(self, image, boxes, labels):
if random.randint(2):
return image, boxes, labels
height, width, depth = image.shape
ratio = random.uniform(1, 4)
left = random.uniform(0, width * ratio - width)
top = random.uniform(0, height * ratio - height)
expand_image = np.zeros(
(int(height * ratio), int(width * ratio), depth),
dtype=image.dtype)
expand_image[:, :, :] = self.mean
expand_image[int(top):int(top + height),
int(left):int(left + width)] = image
image = expand_image
boxes = boxes.copy()
boxes[:, :2] += (int(left), int(top))
boxes[:, 2:] += (int(left), int(top))
return image, boxes, labels
class RandomMirror(object):
def __call__(self, image, boxes, classes):
_, width, _ = image.shape
if random.randint(2):
image = image[:, ::-1]
boxes = boxes.copy()
boxes[:, 0::2] = width - boxes[:, 2::-2]
return image, boxes, classes
class SwapChannels(object):
"""Transforms a tensorized image by swapping the channels in the order
specified in the swap tuple.
Args:
swaps (int triple): final order of channels
eg: (2, 1, 0)
"""
def __init__(self, swaps):
self.swaps = swaps
def __call__(self, image):
"""
Args:
image (Tensor): image tensor to be transformed
Return:
a tensor with channels swapped according to swap
"""
# if torch.is_tensor(image):
# image = image.data.cpu().numpy()
# else:
# image = np.array(image)
image = image[:, :, self.swaps]
return image
class PhotometricDistort(object):
def __init__(self):
self.pd = [
RandomContrast(), # RGB
ConvertColor(current="RGB", transform='HSV'), # HSV
RandomSaturation(), # HSV
RandomHue(), # HSV
ConvertColor(current='HSV', transform='RGB'), # RGB
RandomContrast() # RGB
]
self.rand_brightness = RandomBrightness()
self.rand_light_noise = RandomLightingNoise()
def __call__(self, image, boxes, labels):
im = image.copy()
im, boxes, labels = self.rand_brightness(im, boxes, labels)
if random.randint(2):
distort = Compose(self.pd[:-1])
else:
distort = Compose(self.pd[1:])
im, boxes, labels = distort(im, boxes, labels)
return self.rand_light_noise(im, boxes, labels)
import logging
import os
import torch
import torch.utils.data
from tqdm import tqdm
from ssd.data.build import make_data_loader
from ssd.data.datasets.evaluation import evaluate
from ssd.utils import dist_util, mkdir
from ssd.utils.dist_util import synchronize, is_main_process
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
all_predictions = dist_util.all_gather(predictions_per_gpu)
if not dist_util.is_main_process():
return
# merge the list of dicts
predictions = {}
for p in all_predictions:
predictions.update(p)
    # the keys are image indices; sort them to flatten the dict into a list
image_ids = list(sorted(predictions.keys()))
if len(image_ids) != image_ids[-1] + 1:
logger = logging.getLogger("SSD.inference")
logger.warning(
"Number of images that were gathered from multiple processes is not "
"a contiguous set. Some images might be missing from the evaluation"
)
# convert to a list
predictions = [predictions[i] for i in image_ids]
return predictions
def compute_on_dataset(model, data_loader, device):
results_dict = {}
for batch in tqdm(data_loader):
images, targets, image_ids = batch
cpu_device = torch.device("cpu")
with torch.no_grad():
outputs = model(images.to(device))
outputs = [o.to(cpu_device) for o in outputs]
results_dict.update(
{int(img_id): result for img_id, result in zip(image_ids, outputs)}
)
return results_dict
def inference(model, data_loader, dataset_name, device, output_folder=None, use_cached=False, **kwargs):
dataset = data_loader.dataset
logger = logging.getLogger("SSD.inference")
logger.info("Evaluating {} dataset({} images):".format(dataset_name, len(dataset)))
predictions_path = os.path.join(output_folder, 'predictions.pth')
if use_cached and os.path.exists(predictions_path):
predictions = torch.load(predictions_path, map_location='cpu')
else:
predictions = compute_on_dataset(model, data_loader, device)
synchronize()
predictions = _accumulate_predictions_from_multiple_gpus(predictions)
if not is_main_process():
return
if output_folder:
torch.save(predictions, predictions_path)
return evaluate(dataset=dataset, predictions=predictions, output_dir=output_folder, **kwargs)
@torch.no_grad()
def do_evaluation(cfg, model, distributed, **kwargs):
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
model = model.module
model.eval()
device = torch.device(cfg.MODEL.DEVICE)
data_loaders_val = make_data_loader(cfg, is_train=False, distributed=distributed)
eval_results = []
for dataset_name, data_loader in zip(cfg.DATASETS.TEST, data_loaders_val):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
if not os.path.exists(output_folder):
mkdir(output_folder)
eval_result = inference(model, data_loader, dataset_name, device, output_folder, **kwargs)
eval_results.append(eval_result)
return eval_results
import collections
import datetime
import logging
import os
import time
import torch
import torch.distributed as dist
from ssd.engine.inference import do_evaluation
from ssd.utils import dist_util
from ssd.utils.metric_logger import MetricLogger
def write_metric(eval_result, prefix, summary_writer, global_step):
for key in eval_result:
value = eval_result[key]
tag = '{}/{}'.format(prefix, key)
if isinstance(value, collections.abc.Mapping):
write_metric(value, tag, summary_writer, global_step)
else:
summary_writer.add_scalar(tag, value, global_step=global_step)
def reduce_loss_dict(loss_dict):
"""
Reduce the loss dictionary from all processes so that process with rank
0 has the averaged results. Returns a dict with the same fields as
loss_dict, after reduction.
"""
world_size = dist_util.get_world_size()
if world_size < 2:
return loss_dict
with torch.no_grad():
loss_names = []
all_losses = []
for k in sorted(loss_dict.keys()):
loss_names.append(k)
all_losses.append(loss_dict[k])
all_losses = torch.stack(all_losses, dim=0)
dist.reduce(all_losses, dst=0)
if dist.get_rank() == 0:
# only main process gets accumulated, so only divide by
# world_size in this case
all_losses /= world_size
reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
return reduced_losses
def do_train(cfg, model,
data_loader,
optimizer,
scheduler,
checkpointer,
device,
arguments,
args):
logger = logging.getLogger("SSD.trainer")
logger.info("Start training ...")
meters = MetricLogger()
model.train()
save_to_disk = dist_util.get_rank() == 0
if args.use_tensorboard and save_to_disk:
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
else:
summary_writer = None
max_iter = len(data_loader)
start_iter = arguments["iteration"]
start_training_time = time.time()
end = time.time()
for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
iteration = iteration + 1
arguments["iteration"] = iteration
images = images.to(device)
targets = targets.to(device)
loss_dict = model(images, targets=targets)
loss = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = reduce_loss_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
meters.update(total_loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad()
loss.backward()
optimizer.step()
scheduler.step()
batch_time = time.time() - end
end = time.time()
meters.update(time=batch_time)
if iteration % args.log_step == 0:
eta_seconds = meters.time.global_avg * (max_iter - iteration)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if device.type == "cuda":
logger.info(
meters.delimiter.join([
"iter: {iter:06d}",
"lr: {lr:.5f}",
'{meters}',
"eta: {eta}",
'mem: {mem}M',
]).format(
iter=iteration,
lr=optimizer.param_groups[0]['lr'],
meters=str(meters),
eta=eta_string,
mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
)
)
else:
logger.info(
meters.delimiter.join([
"iter: {iter:06d}",
"lr: {lr:.5f}",
'{meters}',
"eta: {eta}",
]).format(
iter=iteration,
lr=optimizer.param_groups[0]['lr'],
meters=str(meters),
eta=eta_string,
)
)
if summary_writer:
global_step = iteration
summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
for loss_name, loss_item in loss_dict_reduced.items():
summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
if iteration % args.save_step == 0:
checkpointer.save("model_{:06d}".format(iteration), **arguments)
if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration)
if dist_util.get_rank() == 0 and summary_writer:
for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration)
model.train() # *IMPORTANT*: change to train mode after eval.
checkpointer.save("model_final", **arguments)
# compute training time
total_training_time = int(time.time() - start_training_time)
total_time_str = str(datetime.timedelta(seconds=total_training_time))
logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
return model
import torch
import torch.nn as nn
import torch.nn.init as init
from .separable_conv import SeparableConv2d
__all__ = ['L2Norm', 'SeparableConv2d']
class L2Norm(nn.Module):
def __init__(self, n_channels, scale):
super(L2Norm, self).__init__()
self.n_channels = n_channels
self.gamma = scale or None
self.eps = 1e-10
self.weight = nn.Parameter(torch.Tensor(self.n_channels))
self.reset_parameters()
def reset_parameters(self):
init.constant_(self.weight, self.gamma)
def forward(self, x):
norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
x = torch.div(x, norm)
out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
return out
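def _demo_l2norm():
    # Illustrative check (hypothetical helper, not in the original file): after
    # L2Norm with scale=20, each spatial position's channel vector has norm ~20,
    # which is how SSD rescales the conv4_3 feature map.
    layer = L2Norm(n_channels=512, scale=20)
    x = torch.randn(1, 512, 38, 38)
    norms = layer(x).pow(2).sum(dim=1).sqrt()
    assert torch.allclose(norms, torch.full_like(norms, 20.0), atol=1e-3)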
from torch import nn
class SeparableConv2d(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False):
super().__init__()
ReLU = nn.ReLU if onnx_compatible else nn.ReLU6
self.conv = nn.Sequential(
nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
groups=in_channels, stride=stride, padding=padding),
nn.BatchNorm2d(in_channels),
ReLU(),
nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
)
def forward(self, x):
return self.conv(x)
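def _demo_separable_conv():
    # Illustrative comparison (hypothetical helper, not in the original file):
    # a depthwise 3x3 conv plus a 1x1 pointwise conv uses far fewer weights
    # than a dense 3x3 conv mapping the same shapes, at the same output size.
    import torch
    sep = SeparableConv2d(256, 512, kernel_size=3, stride=1, padding=1)
    dense = nn.Conv2d(256, 512, kernel_size=3, padding=1)
    assert sum(p.numel() for p in sep.parameters()) < sum(p.numel() for p in dense.parameters())
    assert sep(torch.randn(1, 256, 19, 19)).shape == (1, 512, 19, 19)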
from itertools import product
import torch
from math import sqrt
class PriorBox:
def __init__(self, cfg):
self.image_size = cfg.INPUT.IMAGE_SIZE
prior_config = cfg.MODEL.PRIORS
self.feature_maps = prior_config.FEATURE_MAPS
self.min_sizes = prior_config.MIN_SIZES
self.max_sizes = prior_config.MAX_SIZES
self.strides = prior_config.STRIDES
self.aspect_ratios = prior_config.ASPECT_RATIOS
self.clip = prior_config.CLIP
def __call__(self):
"""Generate SSD Prior Boxes.
        It returns the center, height and width of the priors. The values are relative to the image size.
Returns:
priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values
are relative to the image size.
"""
priors = []
for k, f in enumerate(self.feature_maps):
scale = self.image_size / self.strides[k]
for i, j in product(range(f), repeat=2):
# unit center x,y
cx = (j + 0.5) / scale
cy = (i + 0.5) / scale
# small sized square box
size = self.min_sizes[k]
h = w = size / self.image_size
priors.append([cx, cy, w, h])
# big sized square box
size = sqrt(self.min_sizes[k] * self.max_sizes[k])
h = w = size / self.image_size
priors.append([cx, cy, w, h])
# change h/w ratio of the small sized box
size = self.min_sizes[k]
h = w = size / self.image_size
for ratio in self.aspect_ratios[k]:
ratio = sqrt(ratio)
priors.append([cx, cy, w * ratio, h / ratio])
priors.append([cx, cy, w / ratio, h * ratio])
priors = torch.tensor(priors)
if self.clip:
priors.clamp_(max=1, min=0)
return priors
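# Sanity sketch (illustrative, not part of the original file): with the usual
# SSD300 settings (assumed here), feature maps [38, 19, 10, 5, 3, 1] and
# aspect ratios [[2], [2, 3], [2, 3], [2, 3], [2], [2]] give 4, 6, 6, 6, 4, 4
# boxes per cell (two squares plus two per extra ratio), so the generator
# returns the familiar total:
#
#   38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732 priors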
from ssd.modeling import registry
from .vgg import VGG
from .mobilenet import MobileNetV2
from .efficient_net import EfficientNet
from .mobilenetv3 import MobileNetV3
__all__ = ['build_backbone', 'VGG', 'MobileNetV2', 'EfficientNet', 'MobileNetV3']
def build_backbone(cfg):
return registry.BACKBONES[cfg.MODEL.BACKBONE.NAME](cfg, cfg.MODEL.BACKBONE.PRETRAINED)
from ssd.modeling import registry
from .efficient_net import EfficientNet
__all__ = ['efficient_net_b3', 'EfficientNet']
@registry.BACKBONES.register('efficient_net-b3')
def efficient_net_b3(cfg, pretrained=True):
if pretrained:
model = EfficientNet.from_pretrained('efficientnet-b3')
else:
model = EfficientNet.from_name('efficientnet-b3')
return model
import torch
from torch import nn
from torch.nn import functional as F
from .utils import (
relu_fn,
round_filters,
round_repeats,
drop_connect,
Conv2dSamePadding,
get_model_params,
efficientnet_params,
load_pretrained_weights,
)
INDICES = {
'efficientnet-b3': [7, 17, 25]
}
EXTRAS = {
'efficientnet-b3': [
# in, out, k, s, p
[(384, 128, 1, 1, 0), (128, 256, 3, 2, 1)], # 5 x 5
[(256, 128, 1, 1, 0), (128, 256, 3, 1, 0)], # 3 x 3
[(256, 128, 1, 1, 0), (128, 256, 3, 1, 0)], # 1 x 1
]
}
def add_extras(cfgs):
extras = nn.ModuleList()
for cfg in cfgs:
extra = []
for params in cfg:
in_channels, out_channels, kernel_size, stride, padding = params
extra.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding))
extra.append(nn.ReLU())
extras.append(nn.Sequential(*extra))
return extras
class MBConvBlock(nn.Module):
"""
Mobile Inverted Residual Bottleneck Block
Args:
block_args (namedtuple): BlockArgs, see above
global_params (namedtuple): GlobalParam, see above
Attributes:
has_se (bool): Whether the block contains a Squeeze and Excitation layer.
"""
def __init__(self, block_args, global_params):
super().__init__()
self._block_args = block_args
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip # skip connection and drop connect
# Expansion phase
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of channels after expansion
if self._block_args.expand_ratio != 1:
self._expand_conv = Conv2dSamePadding(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# Depthwise convolution phase
k = self._block_args.kernel_size
s = self._block_args.stride
self._depthwise_conv = Conv2dSamePadding(
in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
kernel_size=k, stride=s, bias=False)
self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# Squeeze and Excitation layer, if desired
if self.has_se:
num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2dSamePadding(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
self._se_expand = Conv2dSamePadding(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
# Output phase
final_oup = self._block_args.output_filters
self._project_conv = Conv2dSamePadding(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
def forward(self, inputs, drop_connect_rate=None):
"""
:param inputs: input tensor
:param drop_connect_rate: drop connect rate (float, between 0 and 1)
:return: output of block
"""
# Expansion and Depthwise Convolution
x = inputs
if self._block_args.expand_ratio != 1:
x = relu_fn(self._bn0(self._expand_conv(inputs)))
x = relu_fn(self._bn1(self._depthwise_conv(x)))
# Squeeze and Excitation
if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_expand(relu_fn(self._se_reduce(x_squeezed)))
x = torch.sigmoid(x_squeezed) * x
x = self._bn2(self._project_conv(x))
# Skip connection and drop connect
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(x, p=drop_connect_rate, training=self.training)
x = x + inputs # skip connection
return x
class EfficientNet(nn.Module):
"""
An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods
Args:
blocks_args (list): A list of BlockArgs to construct blocks
global_params (namedtuple): A set of GlobalParams shared between blocks
Example:
model = EfficientNet.from_pretrained('efficientnet-b0')
"""
def __init__(self, model_name, blocks_args=None, global_params=None):
super().__init__()
self.indices = INDICES[model_name]
self.extras = add_extras(EXTRAS[model_name])
assert isinstance(blocks_args, list), 'blocks_args should be a list'
        assert len(blocks_args) > 0, 'blocks_args must not be empty'
self._global_params = global_params
self._blocks_args = blocks_args
# Batch norm parameters
bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
# Stem
in_channels = 3 # rgb
out_channels = round_filters(32, self._global_params) # number of output channels
self._conv_stem = Conv2dSamePadding(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
# Build blocks
self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:
# Update block input and output filters based on depth multiplier.
block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters, self._global_params),
output_filters=round_filters(block_args.output_filters, self._global_params),
num_repeat=round_repeats(block_args.num_repeat, self._global_params)
)
# The first block needs to take care of stride and filter size increase.
self._blocks.append(MBConvBlock(block_args, self._global_params))
if block_args.num_repeat > 1:
block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(MBConvBlock(block_args, self._global_params))
self.reset_parameters()
def reset_parameters(self):
for m in self.extras.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_uniform_(m.weight)
nn.init.zeros_(m.bias)
def extract_features(self, inputs):
""" Returns output of the final convolution layer """
# Stem
x = relu_fn(self._bn0(self._conv_stem(inputs)))
features = []
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate)
if idx in self.indices:
features.append(x)
return x, features
def forward(self, inputs):
""" Calls extract_features to extract features, applies final linear layer, and returns logits. """
# Convolution layers
x, features = self.extract_features(inputs)
for layer in self.extras:
x = layer(x)
features.append(x)
return tuple(features)
@classmethod
def from_name(cls, model_name, override_params=None):
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(model_name, override_params)
return EfficientNet(model_name, blocks_args, global_params)
@classmethod
def from_pretrained(cls, model_name):
model = EfficientNet.from_name(model_name)
load_pretrained_weights(model, model_name)
return model
@classmethod
def get_image_size(cls, model_name):
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res
@classmethod
def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False):
""" Validates model name. None that pretrained weights are only available for
the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """
num_models = 4 if also_need_pretrained_weights else 8
valid_models = ['efficientnet_b' + str(i) for i in range(num_models)]
if model_name.replace('-', '_') not in valid_models:
raise ValueError('model_name should be one of: ' + ', '.join(valid_models))
"""
This file contains helper functions for building the model and for loading model parameters.
These helper functions are built to mirror those in the official TensorFlow implementation.
"""
import re
import math
import collections
import torch
from torch import nn
from torch.nn import functional as F
from ssd.utils.model_zoo import load_state_dict_from_url
########################################################################
############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ###############
########################################################################
# Parameters for the entire model (stem, all blocks, and head)
GlobalParams = collections.namedtuple('GlobalParams', [
'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
'num_classes', 'width_coefficient', 'depth_coefficient',
'depth_divisor', 'min_depth', 'drop_connect_rate', ])
# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'stride', 'se_ratio'])
# Change namedtuple defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
def relu_fn(x):
""" Swish activation function """
return x * torch.sigmoid(x)
def round_filters(filters, global_params):
""" Calculate and round number of filters based on depth multiplier. """
multiplier = global_params.width_coefficient
if not multiplier:
return filters
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
""" Round number of filters based on depth multiplier. """
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
def drop_connect(inputs, p, training):
""" Drop connect. """
if not training: return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
binary_tensor = torch.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
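def _demo_drop_connect():
    # Illustrative check (hypothetical helper, not in the original file): in
    # training mode, whole examples are zeroed with probability p and the
    # survivors are rescaled by 1/(1-p), keeping the expected value unchanged.
    x = torch.ones(1000, 1, 1, 1)
    y = drop_connect(x, p=0.2, training=True)
    assert torch.all((y == 0) | torch.isclose(y, torch.tensor(1.25)))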
class Conv2dSamePadding(nn.Conv2d):
""" 2D Convolutions like TensorFlow """
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
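def _demo_same_padding():
    # Illustrative check (hypothetical helper, not in the original file): like
    # TensorFlow 'SAME' padding, the output spatial size is ceil(input/stride)
    # regardless of kernel size.
    conv = Conv2dSamePadding(3, 8, kernel_size=5, stride=2)
    out = conv(torch.randn(1, 3, 301, 301))
    assert out.shape[-2:] == (math.ceil(301 / 2), math.ceil(301 / 2))  # (151, 151)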
########################################################################
############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ##############
########################################################################
def efficientnet_params(model_name):
""" Map EfficientNet model name to parameter coefficients. """
params_dict = {
# Coefficients: width,depth,res,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
}
return params_dict[model_name]
class BlockDecoder(object):
""" Block Decoder for readability, straight from the official TensorFlow repository """
@staticmethod
def _decode_block_string(block_string):
""" Gets a block through a string notation of arguments. """
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
# Check stride
assert (('s' in options and len(options['s']) == 1) or
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
return BlockArgs(
kernel_size=int(options['k']),
num_repeat=int(options['r']),
input_filters=int(options['i']),
output_filters=int(options['o']),
expand_ratio=int(options['e']),
id_skip=('noskip' not in block_string),
se_ratio=float(options['se']) if 'se' in options else None,
stride=[int(options['s'][0])])
@staticmethod
def _encode_block_string(block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
            's%d%d' % (block.stride[0], block.stride[0]),  # BlockArgs stores stride as a one-element list
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""
Decodes a list of string notations to specify blocks inside the network.
:param string_list: a list of strings, each string is a notation of block
:return: a list of BlockArgs namedtuples of block args
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""
Encodes a list of BlockArgs to a list of strings.
:param blocks_args: a list of BlockArgs namedtuples of block args
:return: a list of strings, each string is a notation of block
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
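def _demo_block_decoder():
    # Illustrative decode (hypothetical helper, not in the original file): the
    # compact string notation maps one field per token onto BlockArgs.
    args = BlockDecoder._decode_block_string('r1_k3_s11_e1_i32_o16_se0.25')
    assert args.num_repeat == 1 and args.kernel_size == 3 and args.stride == [1]
    assert args.input_filters == 32 and args.output_filters == 16
    assert args.expand_ratio == 1 and args.se_ratio == 0.25 and args.id_skip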
def efficientnet(width_coefficient=None, depth_coefficient=None,
dropout_rate=0.2, drop_connect_rate=0.2):
""" Creates a efficientnet model. """
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
dropout_rate=dropout_rate,
drop_connect_rate=drop_connect_rate,
# data_format='channels_last', # removed, this is always true in PyTorch
num_classes=1000,
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None
)
return blocks_args, global_params
def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, _, p = efficientnet_params(model_name)
# note: all models have drop connect rate = 0.2
blocks_args, global_params = efficientnet(width_coefficient=w, depth_coefficient=d, dropout_rate=p)
else:
raise NotImplementedError('model name is not pre-defined: %s' % model_name)
if override_params:
# ValueError will be raised here if override_params has fields not included in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
url_map = {
'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet-b0-08094119.pth',
'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet-b1-dbc7070a.pth',
'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet-b2-27687264.pth',
'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet-b3-c8376fa2.pth',
'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet-b4-e116e8b3.pth',
'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet-b5-586e6cc6.pth',
}
def load_pretrained_weights(model, model_name):
""" Loads pretrained weights, and downloads if loading for the first time. """
state_dict = load_state_dict_from_url(url_map[model_name])
model.load_state_dict(state_dict, strict=False)
print('Loaded pretrained weights for {}'.format(model_name))
from torch import nn
from ssd.modeling import registry
from ssd.utils.model_zoo import load_state_dict_from_url
model_urls = {
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
}
class ConvBNReLU(nn.Sequential):
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
padding = (kernel_size - 1) // 2
super(ConvBNReLU, self).__init__(
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
nn.BatchNorm2d(out_planes),
nn.ReLU6(inplace=True)
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = int(round(inp * expand_ratio))
self.use_res_connect = self.stride == 1 and inp == oup
layers = []
if expand_ratio != 1:
# pw
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([
# dw
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
])
self.conv = nn.Sequential(*layers)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, width_mult=1.0, inverted_residual_setting=None):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
if inverted_residual_setting is None:
inverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1],
[6, 24, 2, 2],
[6, 32, 3, 2],
[6, 64, 4, 2],
[6, 96, 3, 1],
[6, 160, 3, 2],
[6, 320, 1, 1],
]
# only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "and each element should be a 4-element list, got {}".format(inverted_residual_setting))
# building first layer
input_channel = int(input_channel * width_mult)
self.last_channel = int(last_channel * max(1.0, width_mult))
features = [ConvBNReLU(3, input_channel, stride=2)]
# building inverted residual blocks
for t, c, n, s in inverted_residual_setting:
output_channel = int(c * width_mult)
for i in range(n):
stride = s if i == 0 else 1
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
# building last several layers
features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
# make it nn.Sequential
self.features = nn.Sequential(*features)
self.extras = nn.ModuleList([
InvertedResidual(1280, 512, 2, 0.2),
InvertedResidual(512, 256, 2, 0.25),
InvertedResidual(256, 256, 2, 0.5),
InvertedResidual(256, 64, 2, 0.25)
])
self.reset_parameters()
def reset_parameters(self):
# weight initialization
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.zeros_(m.bias)
def forward(self, x):
features = []
for i in range(14):
x = self.features[i](x)
features.append(x)
for i in range(14, len(self.features)):
x = self.features[i](x)
features.append(x)
for i in range(len(self.extras)):
x = self.extras[i](x)
features.append(x)
return tuple(features)
@registry.BACKBONES.register('mobilenet_v2')
def mobilenet_v2(cfg, pretrained=True):
model = MobileNetV2()
if pretrained:
model.load_state_dict(load_state_dict_from_url(model_urls['mobilenet_v2']), strict=False)
return model
"""
Creates a MobileNetV3 Model as defined in:
Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam. (2019).
Searching for MobileNetV3
arXiv preprint arXiv:1905.02244.
@ Credit from https://github.com/d-li14/mobilenetv3.pytorch
@ Modified by Chakkrit Termritthikun (https://github.com/chakkritte)
"""
import torch.nn as nn
import math
from ssd.modeling import registry
from ssd.utils.model_zoo import load_state_dict_from_url
model_urls = {
'mobilenet_v3': 'https://github.com/d-li14/mobilenetv3.pytorch/raw/master/pretrained/mobilenetv3-large-1cd25616.pth',
}
def _make_divisible(v, divisor, min_value=None):
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
:param v:
:param divisor:
:param min_value:
:return:
"""
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
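# Illustrative values (not part of the original file): rounding keeps channel
# counts divisible by 8 without shrinking any of them by more than 10%.
#
#   _make_divisible(37, 8) -> 40   (round to the nearest multiple of 8)
#   _make_divisible(10, 8) -> 16   (8 would be a >10% drop, so bump up one step)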
class h_sigmoid(nn.Module):
def __init__(self, inplace=True):
super(h_sigmoid, self).__init__()
self.relu = nn.ReLU6(inplace=inplace)
def forward(self, x):
return self.relu(x + 3) / 6
class h_swish(nn.Module):
def __init__(self, inplace=True):
super(h_swish, self).__init__()
self.sigmoid = h_sigmoid(inplace=inplace)
def forward(self, x):
return x * self.sigmoid(x)
class SELayer(nn.Module):
def __init__(self, channel, reduction=4):
super(SELayer, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channel, _make_divisible(channel // reduction, 8)),
nn.ReLU(inplace=True),
nn.Linear(_make_divisible(channel // reduction, 8), channel),
h_sigmoid()
)
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
return x * y
def conv_3x3_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup),
h_swish()
)
def conv_1x1_bn(inp, oup):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
h_swish()
)
class InvertedResidual(nn.Module):
def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
super(InvertedResidual, self).__init__()
assert stride in [1, 2]
self.identity = stride == 1 and inp == oup
if inp == hidden_dim:
self.conv = nn.Sequential(
# dw
nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
nn.BatchNorm2d(hidden_dim),
h_swish() if use_hs else nn.ReLU(inplace=True),
# Squeeze-and-Excite
SELayer(hidden_dim) if use_se else nn.Identity(),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
else:
self.conv = nn.Sequential(
# pw
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
nn.BatchNorm2d(hidden_dim),
h_swish() if use_hs else nn.ReLU(inplace=True),
# dw
nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
nn.BatchNorm2d(hidden_dim),
# Squeeze-and-Excite
SELayer(hidden_dim) if use_se else nn.Identity(),
h_swish() if use_hs else nn.ReLU(inplace=True),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
)
def forward(self, x):
if self.identity:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV3(nn.Module):
def __init__(self, mode='large', num_classes=1000, width_mult=1.):
super(MobileNetV3, self).__init__()
# setting of inverted residual blocks
self.cfgs = [
# k, t, c, SE, HS, s
[3, 1, 16, 0, 0, 1],
[3, 4, 24, 0, 0, 2],
[3, 3, 24, 0, 0, 1],
[5, 3, 40, 1, 0, 2],
[5, 3, 40, 1, 0, 1],
[5, 3, 40, 1, 0, 1],
[3, 6, 80, 0, 1, 2],
[3, 2.5, 80, 0, 1, 1],
[3, 2.3, 80, 0, 1, 1],
[3, 2.3, 80, 0, 1, 1],
[3, 6, 112, 1, 1, 1],
[3, 6, 112, 1, 1, 1],
[5, 6, 160, 1, 1, 2],
[5, 6, 160, 1, 1, 1],
[5, 6, 160, 1, 1, 1]]
assert mode in ['large', 'small']
# building first layer
input_channel = _make_divisible(16 * width_mult, 8)
layers = [conv_3x3_bn(3, input_channel, 2)]
# building inverted residual blocks
block = InvertedResidual
for k, t, c, use_se, use_hs, s in self.cfgs:
output_channel = _make_divisible(c * width_mult, 8)
exp_size = _make_divisible(input_channel * t, 8)
layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
input_channel = output_channel
# building last several layers
layers.append(conv_1x1_bn(input_channel, exp_size))
self.features = nn.Sequential(*layers)
self.extras = nn.ModuleList([
InvertedResidual(960, _make_divisible(960 * 0.2, 8), 512, 3, 2, True, True),
InvertedResidual(512, _make_divisible(512 * 0.25, 8), 256, 3, 2, True, True),
InvertedResidual(256, _make_divisible(256 * 0.5, 8), 256, 3, 2, True, True),
InvertedResidual(256, _make_divisible(256 * 0.25, 8), 64, 3, 2, True, True),
])
self.reset_parameters()
def forward(self, x):
features = []
for i in range(13):
x = self.features[i](x)
features.append(x)
for i in range(13, len(self.features)):
x = self.features[i](x)
features.append(x)
for i in range(len(self.extras)):
x = self.extras[i](x)
features.append(x)
return tuple(features)
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()
@registry.BACKBONES.register('mobilenet_v3')
def mobilenet_v3(cfg, pretrained=True):
model = MobileNetV3()
if pretrained:
model.load_state_dict(load_state_dict_from_url(model_urls['mobilenet_v3']), strict=False)
return model
import torch.nn as nn
import torch.nn.functional as F
from ssd.layers import L2Norm
from ssd.modeling import registry
from ssd.utils.model_zoo import load_state_dict_from_url
model_urls = {
'vgg': 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth',
}
# borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py
def add_vgg(cfg, batch_norm=False):
layers = []
in_channels = 3
for v in cfg:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
elif v == 'C':
layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
if batch_norm:
layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
else:
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
layers += [pool5, conv6,
nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
return layers
def add_extras(cfg, i, size=300):
# Extra layers added to VGG for feature scaling
layers = []
in_channels = i
flag = False
for k, v in enumerate(cfg):
if in_channels != 'S':
if v == 'S':
layers += [nn.Conv2d(in_channels, cfg[k + 1], kernel_size=(1, 3)[flag], stride=2, padding=1)]
else:
layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])]
flag = not flag
in_channels = v
if size == 512:
layers.append(nn.Conv2d(in_channels, 128, kernel_size=1, stride=1))
layers.append(nn.Conv2d(128, 256, kernel_size=4, stride=1, padding=1))
return layers
vgg_base = {
'300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
512, 512, 512],
'512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
512, 512, 512],
}
extras_base = {
'300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256],
'512': [256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256],
}
class VGG(nn.Module):
def __init__(self, cfg):
super().__init__()
size = cfg.INPUT.IMAGE_SIZE
vgg_config = vgg_base[str(size)]
extras_config = extras_base[str(size)]
self.vgg = nn.ModuleList(add_vgg(vgg_config))
self.extras = nn.ModuleList(add_extras(extras_config, i=1024, size=size))
self.l2_norm = L2Norm(512, scale=20)
self.reset_parameters()
def reset_parameters(self):
for m in self.extras.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_uniform_(m.weight)
nn.init.zeros_(m.bias)
def init_from_pretrain(self, state_dict):
self.vgg.load_state_dict(state_dict)
def forward(self, x):
features = []
for i in range(23):
x = self.vgg[i](x)
s = self.l2_norm(x) # Conv4_3 L2 normalization
features.append(s)
# apply vgg up to fc7
for i in range(23, len(self.vgg)):
x = self.vgg[i](x)
features.append(x)
for k, v in enumerate(self.extras):
x = F.relu(v(x), inplace=True)
if k % 2 == 1:
features.append(x)
return tuple(features)
@registry.BACKBONES.register('vgg')
def vgg(cfg, pretrained=True):
model = VGG(cfg)
if pretrained:
model.init_from_pretrain(load_state_dict_from_url(model_urls['vgg']))
return model