Commit 61d5313f authored by xinghao: Initial commit
torch>=1.3
torchvision>=0.3
yacs
tqdm
opencv-python
vizer
from setuptools import setup, find_packages
with open("README.md", "r") as fh:
long_description = fh.read()
setup(
name="torch-ssd",
version="1.2.0",
packages=find_packages(exclude=['ext']),
install_requires=[
"torch>=1.3",
"torchvision>=0.3",
"opencv-python~=4.0",
"yacs==0.1.6",
"Vizer~=0.1.4",
],
author="Congcong Li",
author_email="luffy.lcc@gmail.com",
description="High quality, fast, modular reference implementation of SSD in PyTorch",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/lufficc/SSD",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
license="MIT",
python_requires=">=3.6",
include_package_data=True,
)
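# Hedged usage note (not part of the commit): with this setup.py at the repository
# root, the package would typically be installed with `pip install .` (or
# `pip install -e .` for development), which also pulls in the pinned dependencies above.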
from .defaults import _C as cfg
from yacs.config import CfgNode as CN
_C = CN()
_C.MODEL = CN()
_C.MODEL.META_ARCHITECTURE = 'SSDDetector'
_C.MODEL.DEVICE = "cuda"
# match default boxes to any ground truth with jaccard overlap higher than a threshold (0.5)
_C.MODEL.THRESHOLD = 0.5
_C.MODEL.NUM_CLASSES = 21
# Hard negative mining
_C.MODEL.NEG_POS_RATIO = 3
_C.MODEL.CENTER_VARIANCE = 0.1
_C.MODEL.SIZE_VARIANCE = 0.2
# ---------------------------------------------------------------------------- #
# Backbone
# ---------------------------------------------------------------------------- #
_C.MODEL.BACKBONE = CN()
_C.MODEL.BACKBONE.NAME = 'vgg'
_C.MODEL.BACKBONE.OUT_CHANNELS = (512, 1024, 512, 256, 256, 256)
_C.MODEL.BACKBONE.PRETRAINED = True
# -----------------------------------------------------------------------------
# PRIORS
# -----------------------------------------------------------------------------
_C.MODEL.PRIORS = CN()
_C.MODEL.PRIORS.FEATURE_MAPS = [38, 19, 10, 5, 3, 1]
_C.MODEL.PRIORS.STRIDES = [8, 16, 32, 64, 100, 300]
_C.MODEL.PRIORS.MIN_SIZES = [30, 60, 111, 162, 213, 264]
_C.MODEL.PRIORS.MAX_SIZES = [60, 111, 162, 213, 264, 315]
_C.MODEL.PRIORS.ASPECT_RATIOS = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
# With one extra aspect ratio every location gets 4 boxes; with two extra ratios, 6 boxes.
# #boxes = 2 + 2 * #ratios (see the sketch after this block)
_C.MODEL.PRIORS.BOXES_PER_LOCATION = [4, 6, 6, 6, 4, 4] # number of boxes per feature map location
_C.MODEL.PRIORS.CLIP = True
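# A minimal sketch (not part of the original config) showing how BOXES_PER_LOCATION
# above follows from ASPECT_RATIOS via #boxes = 2 + 2 * #ratios: each location gets
# one small and one large square prior, plus two priors (ratio and 1/ratio) per extra ratio.
_example_aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
assert [2 + 2 * len(r) for r in _example_aspect_ratios] == [4, 6, 6, 6, 4, 4]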
# -----------------------------------------------------------------------------
# Box Head
# -----------------------------------------------------------------------------
_C.MODEL.BOX_HEAD = CN()
_C.MODEL.BOX_HEAD.NAME = 'SSDBoxHead'
_C.MODEL.BOX_HEAD.PREDICTOR = 'SSDBoxPredictor'
# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Image size
_C.INPUT.IMAGE_SIZE = 300
# Values to be used for image normalization, RGB layout
_C.INPUT.PIXEL_MEAN = [123, 117, 104]
# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training, as present in path_catlog.py
_C.DATASETS.TRAIN = ()
# List of the dataset names for testing, as present in path_catlog.py
_C.DATASETS.TEST = ()
# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATA_LOADER = CN()
# Number of data-loading worker processes
_C.DATA_LOADER.NUM_WORKERS = 8
_C.DATA_LOADER.PIN_MEMORY = True
# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()
# train configs
_C.SOLVER.MAX_ITER = 120000
_C.SOLVER.LR_STEPS = [80000, 100000]
_C.SOLVER.GAMMA = 0.1
_C.SOLVER.BATCH_SIZE = 32
_C.SOLVER.LR = 1e-3
_C.SOLVER.MOMENTUM = 0.9
_C.SOLVER.WEIGHT_DECAY = 5e-4
_C.SOLVER.WARMUP_FACTOR = 1.0 / 3
_C.SOLVER.WARMUP_ITERS = 500
# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
_C.TEST = CN()
_C.TEST.NMS_THRESHOLD = 0.45
_C.TEST.CONFIDENCE_THRESHOLD = 0.01
_C.TEST.MAX_PER_CLASS = -1
_C.TEST.MAX_PER_IMAGE = 100
_C.TEST.BATCH_SIZE = 10
_C.OUTPUT_DIR = 'outputs'
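# Hedged usage sketch (not in this file): the defaults above form a yacs CfgNode that
# is typically overridden from a YAML file and/or a list of key-value pairs, e.g.:
#   from ssd.config import cfg
#   cfg.merge_from_file("configs/my_ssd300_voc.yaml")  # hypothetical config path
#   cfg.merge_from_list(["SOLVER.BATCH_SIZE", 16, "MODEL.DEVICE", "cpu"])
#   cfg.freeze()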
import os
class DatasetCatalog:
DATA_DIR = 'datasets'
DATASETS = {
'voc_2007_train': {
"data_dir": "VOC2007",
"split": "train"
},
'voc_2007_val': {
"data_dir": "VOC2007",
"split": "val"
},
'voc_2007_trainval': {
"data_dir": "VOC2007",
"split": "trainval"
},
'voc_2007_test': {
"data_dir": "VOC2007",
"split": "test"
},
'voc_2012_train': {
"data_dir": "VOC2012",
"split": "train"
},
'voc_2012_val': {
"data_dir": "VOC2012",
"split": "val"
},
'voc_2012_trainval': {
"data_dir": "VOC2012",
"split": "trainval"
},
'voc_2012_test': {
"data_dir": "VOC2012",
"split": "test"
},
'coco_2014_valminusminival': {
"data_dir": "val2014",
"ann_file": "annotations/instances_valminusminival2014.json"
},
'coco_2014_minival': {
"data_dir": "val2014",
"ann_file": "annotations/instances_minival2014.json"
},
'coco_2014_train': {
"data_dir": "train2014",
"ann_file": "annotations/instances_train2014.json"
},
'coco_2014_val': {
"data_dir": "val2014",
"ann_file": "annotations/instances_val2014.json"
},
}
@staticmethod
def get(name):
if "voc" in name:
voc_root = DatasetCatalog.DATA_DIR
if 'VOC_ROOT' in os.environ:
voc_root = os.environ['VOC_ROOT']
attrs = DatasetCatalog.DATASETS[name]
args = dict(
data_dir=os.path.join(voc_root, attrs["data_dir"]),
split=attrs["split"],
)
return dict(factory="VOCDataset", args=args)
elif "coco" in name:
coco_root = DatasetCatalog.DATA_DIR
if 'COCO_ROOT' in os.environ:
coco_root = os.environ['COCO_ROOT']
attrs = DatasetCatalog.DATASETS[name]
args = dict(
data_dir=os.path.join(coco_root, attrs["data_dir"]),
ann_file=os.path.join(coco_root, attrs["ann_file"]),
)
return dict(factory="COCODataset", args=args)
raise RuntimeError("Dataset not available: {}".format(name))
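# Example of the lookup above (assuming VOC_ROOT is unset, so DATA_DIR is used):
#   DatasetCatalog.get('voc_2007_trainval')
#   -> {'factory': 'VOCDataset',
#       'args': {'data_dir': 'datasets/VOC2007', 'split': 'trainval'}}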
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from ssd.data import samplers
from ssd.data.datasets import build_dataset
from ssd.data.transforms import build_transforms, build_target_transform
from ssd.structures.container import Container
class BatchCollator:
def __init__(self, is_train=True):
self.is_train = is_train
def __call__(self, batch):
transposed_batch = list(zip(*batch))
images = default_collate(transposed_batch[0])
img_ids = default_collate(transposed_batch[2])
if self.is_train:
list_targets = transposed_batch[1]
targets = Container(
{key: default_collate([d[key] for d in list_targets]) for key in list_targets[0]}
)
else:
targets = None
return images, targets, img_ids
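# Illustrative shapes (an assumption: the default 300x300 input and a target transform
# that encodes each image's boxes/labels against the full prior set): a batch of B
# samples (image, targets, index) is collated into
#   images:  FloatTensor of shape [B, 3, 300, 300]
#   targets: Container with 'boxes' [B, num_priors, 4] and 'labels' [B, num_priors]
#   img_ids: tensor of the B dataset indices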
def make_data_loader(cfg, is_train=True, distributed=False, max_iter=None, start_iter=0):
train_transform = build_transforms(cfg, is_train=is_train)
target_transform = build_target_transform(cfg) if is_train else None
dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
datasets = build_dataset(dataset_list, transform=train_transform, target_transform=target_transform, is_train=is_train)
shuffle = is_train
data_loaders = []
for dataset in datasets:
if distributed:
sampler = samplers.DistributedSampler(dataset, shuffle=shuffle)
elif shuffle:
sampler = torch.utils.data.RandomSampler(dataset)
else:
sampler = torch.utils.data.sampler.SequentialSampler(dataset)
batch_size = cfg.SOLVER.BATCH_SIZE if is_train else cfg.TEST.BATCH_SIZE
batch_sampler = torch.utils.data.sampler.BatchSampler(sampler=sampler, batch_size=batch_size, drop_last=False)
if max_iter is not None:
batch_sampler = samplers.IterationBasedBatchSampler(batch_sampler, num_iterations=max_iter, start_iter=start_iter)
data_loader = DataLoader(dataset, num_workers=cfg.DATA_LOADER.NUM_WORKERS, batch_sampler=batch_sampler,
pin_memory=cfg.DATA_LOADER.PIN_MEMORY, collate_fn=BatchCollator(is_train))
data_loaders.append(data_loader)
if is_train:
# during training, a single (possibly concatenated) data_loader is returned
assert len(data_loaders) == 1
return data_loaders[0]
return data_loaders
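# Hedged usage sketch (assumes cfg.DATASETS.TRAIN has been set, e.g. to
# ('voc_2007_trainval', 'voc_2012_trainval')):
#   train_loader = make_data_loader(cfg, is_train=True, max_iter=cfg.SOLVER.MAX_ITER)
#   for iteration, (images, targets, _) in enumerate(train_loader):
#       ...  # one optimization step per batch; the loader yields MAX_ITER batches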
from torch.utils.data import ConcatDataset
from ssd.config.path_catlog import DatasetCatalog
from .voc import VOCDataset
from .coco import COCODataset
_DATASETS = {
'VOCDataset': VOCDataset,
'COCODataset': COCODataset,
}
def build_dataset(dataset_list, transform=None, target_transform=None, is_train=True):
assert len(dataset_list) > 0
datasets = []
for dataset_name in dataset_list:
data = DatasetCatalog.get(dataset_name)
args = data['args']
factory = _DATASETS[data['factory']]
args['transform'] = transform
args['target_transform'] = target_transform
if factory == VOCDataset:
args['keep_difficult'] = not is_train
elif factory == COCODataset:
args['remove_empty'] = is_train
dataset = factory(**args)
datasets.append(dataset)
# for testing, return a list of datasets
if not is_train:
return datasets
dataset = datasets[0]
if len(datasets) > 1:
dataset = ConcatDataset(datasets)
return [dataset]
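# Example of the builder above: for training on VOC 07+12 trainval, the two datasets
# are wrapped in a single ConcatDataset and returned as a one-element list:
#   build_dataset(('voc_2007_trainval', 'voc_2012_trainval'), is_train=True)
#   -> [ConcatDataset([VOCDataset(...), VOCDataset(...)])]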
import os
import torch.utils.data
import numpy as np
from PIL import Image
from ssd.structures.container import Container
class COCODataset(torch.utils.data.Dataset):
class_names = ('__background__',
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard',
'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors',
'teddy bear', 'hair drier', 'toothbrush')
def __init__(self, data_dir, ann_file, transform=None, target_transform=None, remove_empty=False):
from pycocotools.coco import COCO
self.coco = COCO(ann_file)
self.data_dir = data_dir
self.transform = transform
self.target_transform = target_transform
self.remove_empty = remove_empty
if self.remove_empty:
# when training, images without annotations are removed.
self.ids = list(self.coco.imgToAnns.keys())
else:
# when testing, all images are used.
self.ids = list(self.coco.imgs.keys())
coco_categories = sorted(self.coco.getCatIds())
self.coco_id_to_contiguous_id = {coco_id: i + 1 for i, coco_id in enumerate(coco_categories)}
self.contiguous_id_to_coco_id = {v: k for k, v in self.coco_id_to_contiguous_id.items()}
def __getitem__(self, index):
image_id = self.ids[index]
boxes, labels = self._get_annotation(image_id)
image = self._read_image(image_id)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
targets = Container(
boxes=boxes,
labels=labels,
)
return image, targets, index
def get_annotation(self, index):
image_id = self.ids[index]
return image_id, self._get_annotation(image_id)
def __len__(self):
return len(self.ids)
def _get_annotation(self, image_id):
ann_ids = self.coco.getAnnIds(imgIds=image_id)
ann = self.coco.loadAnns(ann_ids)
# filter crowd annotations
ann = [obj for obj in ann if obj["iscrowd"] == 0]
boxes = np.array([self._xywh2xyxy(obj["bbox"]) for obj in ann], np.float32).reshape((-1, 4))
labels = np.array([self.coco_id_to_contiguous_id[obj["category_id"]] for obj in ann], np.int64).reshape((-1,))
# remove invalid boxes
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
labels = labels[keep]
return boxes, labels
def _xywh2xyxy(self, box):
x1, y1, w, h = box
return [x1, y1, x1 + w, y1 + h]
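# e.g. a COCO box [x, y, w, h] = [10.0, 20.0, 30.0, 40.0] becomes
# [x1, y1, x2, y2] = [10.0, 20.0, 40.0, 60.0]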
def get_img_info(self, index):
image_id = self.ids[index]
img_data = self.coco.imgs[image_id]
return img_data
def _read_image(self, image_id):
file_name = self.coco.loadImgs(image_id)[0]['file_name']
image_file = os.path.join(self.data_dir, file_name)
image = Image.open(image_file).convert("RGB")
image = np.array(image)
return image
from ssd.data.datasets import VOCDataset, COCODataset
from .coco import coco_evaluation
from .voc import voc_evaluation
def evaluate(dataset, predictions, output_dir, **kwargs):
"""evaluate dataset using different methods based on dataset type.
Args:
dataset: Dataset object
predictions(list[(boxes, labels, scores)]): Each item in the list represents the
prediction results for one image. And the index should match the dataset index.
output_dir: output folder, to save evaluation files or results.
Returns:
evaluation result
"""
args = dict(
dataset=dataset, predictions=predictions, output_dir=output_dir, **kwargs,
)
if isinstance(dataset, VOCDataset):
return voc_evaluation(**args)
elif isinstance(dataset, COCODataset):
return coco_evaluation(**args)
else:
raise NotImplementedError
import json
import logging
import os
from datetime import datetime
def coco_evaluation(dataset, predictions, output_dir, iteration=None):
coco_results = []
for i, prediction in enumerate(predictions):
img_info = dataset.get_img_info(i)
prediction = prediction.resize((img_info['width'], img_info['height'])).numpy()
boxes, labels, scores = prediction['boxes'], prediction['labels'], prediction['scores']
image_id, annotation = dataset.get_annotation(i)
class_mapper = dataset.contiguous_id_to_coco_id
if labels.shape[0] == 0:
continue
boxes = boxes.tolist()
labels = labels.tolist()
scores = scores.tolist()
coco_results.extend(
[
{
"image_id": image_id,
"category_id": class_mapper[labels[k]],
"bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], # to xywh format
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
iou_type = 'bbox'
json_result_file = os.path.join(output_dir, iou_type + ".json")
logger = logging.getLogger("SSD.inference")
logger.info('Writing results to {}...'.format(json_result_file))
with open(json_result_file, "w") as f:
json.dump(coco_results, f)
from pycocotools.cocoeval import COCOeval
coco_gt = dataset.coco
coco_dt = coco_gt.loadRes(json_result_file)
coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
result_strings = []
keys = ["AP", "AP50", "AP75", "APs", "APm", "APl"]
metrics = {}
for i, key in enumerate(keys):
metrics[key] = coco_eval.stats[i]
logger.info('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3)))
result_strings.append('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3)))
if iteration is not None:
result_path = os.path.join(output_dir, 'result_{:07d}.txt'.format(iteration))
else:
result_path = os.path.join(output_dir, 'result_{}.txt'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
with open(result_path, "w") as f:
f.write('\n'.join(result_strings))
return dict(metrics=metrics)
import logging
import os
from datetime import datetime
import numpy as np
from .eval_detection_voc import eval_detection_voc
def voc_evaluation(dataset, predictions, output_dir, iteration=None):
class_names = dataset.class_names
pred_boxes_list = []
pred_labels_list = []
pred_scores_list = []
gt_boxes_list = []
gt_labels_list = []
gt_difficults = []
for i in range(len(dataset)):
image_id, annotation = dataset.get_annotation(i)
gt_boxes, gt_labels, is_difficult = annotation
gt_boxes_list.append(gt_boxes)
gt_labels_list.append(gt_labels)
gt_difficults.append(is_difficult.astype(bool))
img_info = dataset.get_img_info(i)
prediction = predictions[i]
prediction = prediction.resize((img_info['width'], img_info['height'])).numpy()
boxes, labels, scores = prediction['boxes'], prediction['labels'], prediction['scores']
pred_boxes_list.append(boxes)
pred_labels_list.append(labels)
pred_scores_list.append(scores)
result = eval_detection_voc(pred_bboxes=pred_boxes_list,
pred_labels=pred_labels_list,
pred_scores=pred_scores_list,
gt_bboxes=gt_boxes_list,
gt_labels=gt_labels_list,
gt_difficults=gt_difficults,
iou_thresh=0.5,
use_07_metric=True)
logger = logging.getLogger("SSD.inference")
result_str = "mAP: {:.4f}\n".format(result["map"])
metrics = {'mAP': result["map"]}
for i, ap in enumerate(result["ap"]):
if i == 0: # skip background
continue
metrics[class_names[i]] = ap
result_str += "{:<16}: {:.4f}\n".format(class_names[i], ap)
logger.info(result_str)
if iteration is not None:
result_path = os.path.join(output_dir, 'result_{:07d}.txt'.format(iteration))
else:
result_path = os.path.join(output_dir, 'result_{}.txt'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
with open(result_path, "w") as f:
f.write(result_str)
return dict(metrics=metrics)
from __future__ import division
from collections import defaultdict
import itertools
import numpy as np
import six
def bbox_iou(bbox_a, bbox_b):
"""Calculate the Intersection of Unions (IoUs) between bounding boxes.
IoU is calculated as a ratio of area of the intersection
and area of the union.
This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be
same type.
The output is same type as the type of the inputs.
Args:
bbox_a (array): An array whose shape is :math:`(N, 4)`.
:math:`N` is the number of bounding boxes.
The dtype should be :obj:`numpy.float32`.
bbox_b (array): An array similar to :obj:`bbox_a`,
whose shape is :math:`(K, 4)`.
The dtype should be :obj:`numpy.float32`.
Returns:
array:
An array whose shape is :math:`(N, K)`. \
An element at index :math:`(n, k)` contains IoUs between \
:math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \
box in :obj:`bbox_b`.
"""
if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
raise IndexError
# top left
tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
# bottom right
br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)
area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
return area_i / (area_a[:, None] + area_b - area_i)
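# Quick check of the IoU above: two axis-aligned unit squares overlapping in a
# 0.5 x 1 strip give IoU = 0.5 / (1 + 1 - 0.5) = 1/3.
#   a = np.array([[0., 0., 1., 1.]], dtype=np.float32)
#   b = np.array([[0.5, 0., 1.5, 1.]], dtype=np.float32)
#   bbox_iou(a, b)  # -> array of shape (1, 1), approximately [[0.333]]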
def eval_detection_voc(
pred_bboxes,
pred_labels,
pred_scores,
gt_bboxes,
gt_labels,
gt_difficults=None,
iou_thresh=0.5,
use_07_metric=False):
"""Calculate average precisions based on evaluation code of PASCAL VOC.
This function evaluates predicted bounding boxes obtained from a dataset
which has :math:`N` images by using average precision for each class.
The code is based on the evaluation code used in PASCAL VOC Challenge.
Args:
pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
sets of bounding boxes.
Its index corresponds to an index for the base dataset.
Each element of :obj:`pred_bboxes` is a set of coordinates
of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
where :math:`R` corresponds
to the number of bounding boxes, which may vary among images.
The second axis corresponds to
:math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
pred_labels (iterable of numpy.ndarray): An iterable of labels.
Similar to :obj:`pred_bboxes`, its index corresponds to an
index for the base dataset. Its length is :math:`N`.
pred_scores (iterable of numpy.ndarray): An iterable of confidence
scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
its index corresponds to an index for the base dataset.
Its length is :math:`N`.
gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
bounding boxes
whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
bounding box whose shape is :math:`(R, 4)`. Note that the number of
bounding boxes in each image does not need to be the same as the number
of corresponding predicted boxes.
gt_labels (iterable of numpy.ndarray): An iterable of ground truth
labels which are organized similarly to :obj:`gt_bboxes`.
gt_difficults (iterable of numpy.ndarray): An iterable of boolean
arrays which is organized similarly to :obj:`gt_bboxes`.
This tells whether the
corresponding ground truth bounding box is difficult or not.
By default, this is :obj:`None`. In that case, this function
considers all bounding boxes to be not difficult.
iou_thresh (float): A prediction is correct if its Intersection over
Union with the ground truth is above this value.
use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
for calculating average precision. The default value is
:obj:`False`.
Returns:
dict:
The keys, value-types and the description of the values are listed
below.
* **ap** (*numpy.ndarray*): An array of average precisions. \
The :math:`l`-th value corresponds to the average precision \
for class :math:`l`. If class :math:`l` does not exist in \
either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \
value is set to :obj:`numpy.nan`.
* **map** (*float*): The average of Average Precisions over classes.
"""
prec, rec = calc_detection_voc_prec_rec(pred_bboxes,
pred_labels,
pred_scores,
gt_bboxes,
gt_labels,
gt_difficults,
iou_thresh=iou_thresh)
ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric)
return {'ap': ap, 'map': np.nanmean(ap)}
def calc_detection_voc_prec_rec(
pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
gt_difficults=None,
iou_thresh=0.5):
"""Calculate precision and recall based on evaluation code of PASCAL VOC.
This function calculates precision and recall of
predicted bounding boxes obtained from a dataset which has :math:`N`
images.
The code is based on the evaluation code used in PASCAL VOC Challenge.
Args:
pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
sets of bounding boxes.
Its index corresponds to an index for the base dataset.
Each element of :obj:`pred_bboxes` is a set of coordinates
of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
where :math:`R` corresponds
to the number of bounding boxes, which may vary among images.
The second axis corresponds to
:math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
pred_labels (iterable of numpy.ndarray): An iterable of labels.
Similar to :obj:`pred_bboxes`, its index corresponds to an
index for the base dataset. Its length is :math:`N`.
pred_scores (iterable of numpy.ndarray): An iterable of confidence
scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
its index corresponds to an index for the base dataset.
Its length is :math:`N`.
gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
bounding boxes
whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
bounding box whose shape is :math:`(R, 4)`. Note that the number of
bounding boxes in each image does not need to be the same as the number
of corresponding predicted boxes.
gt_labels (iterable of numpy.ndarray): An iterable of ground truth
labels which are organized similarly to :obj:`gt_bboxes`.
gt_difficults (iterable of numpy.ndarray): An iterable of boolean
arrays which is organized similarly to :obj:`gt_bboxes`.
This tells whether the
corresponding ground truth bounding box is difficult or not.
By default, this is :obj:`None`. In that case, this function
considers all bounding boxes to be not difficult.
iou_thresh (float): A prediction is correct if its Intersection over
Union with the ground truth is above this value.
Returns:
tuple of two lists:
This function returns two lists: :obj:`prec` and :obj:`rec`.
* :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
for class :math:`l`. If class :math:`l` does not exist in \
either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
set to :obj:`None`.
* :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
for class :math:`l`. If class :math:`l` that is not marked as \
difficult does not exist in \
:obj:`gt_labels`, :obj:`rec[l]` is \
set to :obj:`None`.
"""
pred_bboxes = iter(pred_bboxes)
pred_labels = iter(pred_labels)
pred_scores = iter(pred_scores)
gt_bboxes = iter(gt_bboxes)
gt_labels = iter(gt_labels)
if gt_difficults is None:
gt_difficults = itertools.repeat(None)
else:
gt_difficults = iter(gt_difficults)
n_pos = defaultdict(int)
score = defaultdict(list)
match = defaultdict(list)
for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
six.moves.zip(
pred_bboxes, pred_labels, pred_scores,
gt_bboxes, gt_labels, gt_difficults):
if gt_difficult is None:
gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)
for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
pred_mask_l = pred_label == l
pred_bbox_l = pred_bbox[pred_mask_l]
pred_score_l = pred_score[pred_mask_l]
# sort by score
order = pred_score_l.argsort()[::-1]
pred_bbox_l = pred_bbox_l[order]
pred_score_l = pred_score_l[order]
gt_mask_l = gt_label == l
gt_bbox_l = gt_bbox[gt_mask_l]
gt_difficult_l = gt_difficult[gt_mask_l]
n_pos[l] += np.logical_not(gt_difficult_l).sum()
score[l].extend(pred_score_l)
if len(pred_bbox_l) == 0:
continue
if len(gt_bbox_l) == 0:
match[l].extend((0,) * pred_bbox_l.shape[0])
continue
# The VOC evaluation protocol treats box coordinates as inclusive integers,
# so 1 is added to the bottom-right corner before computing areas.
pred_bbox_l = pred_bbox_l.copy()
pred_bbox_l[:, 2:] += 1
gt_bbox_l = gt_bbox_l.copy()
gt_bbox_l[:, 2:] += 1
iou = bbox_iou(pred_bbox_l, gt_bbox_l)
gt_index = iou.argmax(axis=1)
# set -1 if there is no matching ground truth
gt_index[iou.max(axis=1) < iou_thresh] = -1
del iou
selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
for gt_idx in gt_index:
if gt_idx >= 0:
if gt_difficult_l[gt_idx]:
match[l].append(-1)
else:
if not selec[gt_idx]:
match[l].append(1)
else:
match[l].append(0)
selec[gt_idx] = True
else:
match[l].append(0)
for iter_ in (
pred_bboxes, pred_labels, pred_scores,
gt_bboxes, gt_labels, gt_difficults):
if next(iter_, None) is not None:
raise ValueError('Length of input iterables need to be same.')
n_fg_class = max(n_pos.keys()) + 1
prec = [None] * n_fg_class
rec = [None] * n_fg_class
for l in n_pos.keys():
score_l = np.array(score[l])
match_l = np.array(match[l], dtype=np.int8)
order = score_l.argsort()[::-1]
match_l = match_l[order]
tp = np.cumsum(match_l == 1)
fp = np.cumsum(match_l == 0)
# If an element of fp + tp is 0,
# the corresponding element of prec[l] is nan.
prec[l] = tp / (fp + tp)
# If n_pos[l] is 0, rec[l] is None.
if n_pos[l] > 0:
rec[l] = tp / n_pos[l]
return prec, rec
def calc_detection_voc_ap(prec, rec, use_07_metric=False):
"""Calculate average precisions based on evaluation code of PASCAL VOC.
This function calculates average precisions
from given precisions and recalls.
The code is based on the evaluation code used in PASCAL VOC Challenge.
Args:
prec (list of numpy.array): A list of arrays.
:obj:`prec[l]` indicates precision for class :math:`l`.
If :obj:`prec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
rec (list of numpy.array): A list of arrays.
:obj:`rec[l]` indicates recall for class :math:`l`.
If :obj:`rec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
for calculating average precision. The default value is
:obj:`False`.
Returns:
~numpy.ndarray:
This function returns an array of average precisions.
The :math:`l`-th value corresponds to the average precision
for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
:obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
"""
n_fg_class = len(prec)
ap = np.empty(n_fg_class)
for l in six.moves.range(n_fg_class):
if prec[l] is None or rec[l] is None:
ap[l] = np.nan
continue
if use_07_metric:
# 11 point metric
ap[l] = 0
for t in np.arange(0., 1.1, 0.1):
if np.sum(rec[l] >= t) == 0:
p = 0
else:
p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
ap[l] += p / 11
else:
# correct AP calculation
# first append sentinel values at the end
mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
mrec = np.concatenate(([0], rec[l], [1]))
mpre = np.maximum.accumulate(mpre[::-1])[::-1]
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
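# Worked example of the area-under-PR-curve branch (use_07_metric=False): with
#   prec = [np.array([1.0, 0.5])] and rec = [np.array([0.5, 1.0])]
# the interpolated precision envelope is 1.0 on recall (0, 0.5] and 0.5 on (0.5, 1.0],
# so calc_detection_voc_ap(prec, rec) gives AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75.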
import os
import torch.utils.data
import numpy as np
import xml.etree.ElementTree as ET
from PIL import Image
from ssd.structures.container import Container
class VOCDataset(torch.utils.data.Dataset):
class_names = ('__background__',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
def __init__(self, data_dir, split, transform=None, target_transform=None, keep_difficult=False):
"""Dataset for VOC data.
Args:
data_dir: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories:
Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject.
"""
self.data_dir = data_dir
self.split = split
self.transform = transform
self.target_transform = target_transform
image_sets_file = os.path.join(self.data_dir, "ImageSets", "Main", "%s.txt" % self.split)
self.ids = VOCDataset._read_image_ids(image_sets_file)
self.keep_difficult = keep_difficult
self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)}
def __getitem__(self, index):
image_id = self.ids[index]
boxes, labels, is_difficult = self._get_annotation(image_id)
if not self.keep_difficult:
boxes = boxes[is_difficult == 0]
labels = labels[is_difficult == 0]
image = self._read_image(image_id)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
targets = Container(
boxes=boxes,
labels=labels,
)
return image, targets, index
def get_annotation(self, index):
image_id = self.ids[index]
return image_id, self._get_annotation(image_id)
def __len__(self):
return len(self.ids)
@staticmethod
def _read_image_ids(image_sets_file):
ids = []
with open(image_sets_file) as f:
for line in f:
ids.append(line.rstrip())
return ids
def _get_annotation(self, image_id):
annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % image_id)
objects = ET.parse(annotation_file).findall("object")
boxes = []
labels = []
is_difficult = []
for obj in objects:
class_name = obj.find('name').text.lower().strip()
bbox = obj.find('bndbox')
# VOC annotations use Matlab-style 1-based pixel indices, so subtract 1 to get 0-based coordinates
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
boxes.append([x1, y1, x2, y2])
labels.append(self.class_dict[class_name])
is_difficult_str = obj.find('difficult').text
is_difficult.append(int(is_difficult_str) if is_difficult_str else 0)
return (np.array(boxes, dtype=np.float32),
np.array(labels, dtype=np.int64),
np.array(is_difficult, dtype=np.uint8))
def get_img_info(self, index):
img_id = self.ids[index]
annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % img_id)
anno = ET.parse(annotation_file).getroot()
size = anno.find("size")
im_info = tuple(map(int, (size.find("height").text, size.find("width").text)))
return {"height": im_info[0], "width": im_info[1]}
def _read_image(self, image_id):
image_file = os.path.join(self.data_dir, "JPEGImages", "%s.jpg" % image_id)
image = Image.open(image_file).convert("RGB")
image = np.array(image)
return image
from .iteration_based_batch_sampler import IterationBasedBatchSampler
from .distributed import DistributedSampler
__all__ = ['IterationBasedBatchSampler', 'DistributedSampler']