First Commit.

b952e97b · chenych · b952e97b · b952e97b · b952e97b · b952e97b
Commit b952e97b authored Nov 03, 2023 by chenych
20 changed files
--- a/src/tools/reval.py
+++ b/src/tools/reval.py
+#!/usr/bin/env python
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# Modified by Xingyi Zhou
+# --------------------------------------------------------
+# Reval = re-eval. Re-evaluate saved detections.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+import os.path as osp
+sys.path.insert(0, osp.join(osp.dirname(__file__), 'voc_eval_lib'))
+from model.test import apply_nms
+from datasets.pascal_voc import pascal_voc
+import pickle
+import os, argparse
+import numpy as np
+import json
+def parse_args():
+  """
+  Parse input arguments
+  """
+  parser = argparse.ArgumentParser(description='Re-evaluate results')
+  parser.add_argument('detection_file', type=str)
+  parser.add_argument('--output_dir', help='results directory', type=str)
+  parser.add_argument('--imdb', dest='imdb_name',
+                      help='dataset to re-evaluate',
+                      default='voc_2007_test', type=str)
+  parser.add_argument('--matlab', dest='matlab_eval',
+                      help='use matlab for evaluation',
+                      action='store_true')
+  parser.add_argument('--comp', dest='comp_mode', help='competition mode',
+                      action='store_true')
+  parser.add_argument('--nms', dest='apply_nms', help='apply nms',
+                      action='store_true')
+  if len(sys.argv) == 1:
+    parser.print_help()
+    sys.exit(1)
+  args = parser.parse_args()
+  return args
+def from_dets(imdb_name, detection_file, args):
+  imdb = pascal_voc('test', '2007')
+  imdb.competition_mode(args.comp_mode)
+  imdb.config['matlab_eval'] = args.matlab_eval
+  with open(os.path.join(detection_file), 'rb') as f:
+    if 'json' in detection_file:
+      dets = json.load(f)
+    else:
+      dets = pickle.load(f, encoding='latin1')
+  # import pdb; pdb.set_trace()
+  if args.apply_nms:
+    print('Applying NMS to all detections')
+    test_nms = 0.3
+    nms_dets = apply_nms(dets, test_nms)
+  else:
+    nms_dets = dets
+  print('Evaluating detections')
+  imdb.evaluate_detections(nms_dets)
+if __name__ == '__main__':
+  args = parse_args()
+  imdb_name = args.imdb_name
+  from_dets(imdb_name, args.detection_file, args)
--- a/src/tools/vis_pred.py
+++ b/src/tools/vis_pred.py
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import sys
+import cv2
+import numpy as np
+import pickle
+# Directory prefix joined (by plain string concatenation) with each image's
+# file name; the trailing slash is therefore required.
+IMG_PATH = '../../data/coco/val2017/'
+# Ground-truth annotation file loaded into the COCO API below.
+ANN_PATH = '../../data/coco/annotations/instances_val2017.json'
+# NOTE(review): not read anywhere in the visible part of this script.
+DEBUG = True
+def _coco_box_to_bbox(box):
+  bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                  dtype=np.int32)
+  return bbox
+# Category ids used by this script, in the order that defines the contiguous
+# 0..79 class index (the id sequence is not contiguous).
+_cat_ids = [
+  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 
+  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
+  24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 
+  37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
+  58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 
+  72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 
+  82, 84, 85, 86, 87, 88, 89, 90
+]
+num_classes = 80
+# 1-based contiguous index -> category id.
+_classes = {
+  ind + 1: cat_id for ind, cat_id in enumerate(_cat_ids)
+}
+# Category id -> 0-based contiguous index (used to index CAT_NAMES / COLORS).
+_to_order = {cat_id: ind for ind, cat_id in enumerate(_cat_ids)}
+# NOTE: this rebinds the name `coco` from the pycocotools module to a COCO
+# instance; every later use of `coco` in this file refers to the instance.
+coco = coco.COCO(ANN_PATH)
+# Category names in contiguous-index order, looked up through the COCO API.
+CAT_NAMES = [coco.loadCats([_classes[i + 1]])[0]['name'] \
+              for i in range(num_classes)]
+# One random colour per class; each channel is drawn from [0.4, 1.0) * 255
+# and cast to uint8. Colours differ between runs (no seed is set here).
+COLORS = [((np.random.random((3, )) * 0.6 + 0.4)*255).astype(np.uint8) \
+              for _ in range(num_classes)]
+def add_box(image, bbox, sc, cat_id):
+  cat_id = _to_order[cat_id]
+  cat_name = CAT_NAMES[cat_id]
+  cat_size  = cv2.getTextSize(cat_name + '0', cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
+  color = np.array(COLORS[cat_id]).astype(np.int32).tolist()
+  txt = '{}{:.0f}'.format(cat_name, sc * 10)
+  if bbox[1] - cat_size[1] - 2 < 0:
+    cv2.rectangle(image,
+                  (bbox[0], bbox[1] + 2),
+                  (bbox[0] + cat_size[0], bbox[1] + cat_size[1] + 2),
+                  color, -1)
+    cv2.putText(image, txt, 
+                (bbox[0], bbox[1] + cat_size[1] + 2), 
+                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), thickness=1)
+  else:
+    cv2.rectangle(image,
+                  (bbox[0], bbox[1] - cat_size[1] - 2),
+                  (bbox[0] + cat_size[0], bbox[1] - 2),
+                  color, -1)
+    cv2.putText(image, txt, 
+                (bbox[0], bbox[1] - 2), 
+                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), thickness=1)
+  cv2.rectangle(image,
+                (bbox[0], bbox[1]),
+                (bbox[2], bbox[3]),
+                color, 2)
+  return image
+if __name__ == '__main__':
+  dets = []
+  img_ids = coco.getImgIds()
+  num_images = len(img_ids)
+  for k in range(1, len(sys.argv)):
+    pred_path = sys.argv[k]
+    dets.append(coco.loadRes(pred_path))
+  # import pdb; pdb.set_trace()
+  for i, img_id in enumerate(img_ids):
+    img_info = coco.loadImgs(ids=[img_id])[0]
+    img_path = IMG_PATH + img_info['file_name']
+    img = cv2.imread(img_path)
+    gt_ids = coco.getAnnIds(imgIds=[img_id])
+    gts = coco.loadAnns(gt_ids)
+    gt_img = img.copy()
+    for j, pred in enumerate(gts):
+      bbox = _coco_box_to_bbox(pred['bbox'])
+      cat_id = pred['category_id']
+      gt_img = add_box(gt_img, bbox, 0, cat_id)
+    for k in range(len(dets)):
+      pred_ids = dets[k].getAnnIds(imgIds=[img_id])
+      preds = dets[k].loadAnns(pred_ids)
+      pred_img = img.copy()
+      for j, pred in enumerate(preds):
+        bbox = _coco_box_to_bbox(pred['bbox'])
+        sc = pred['score']
+        cat_id = pred['category_id']
+        if sc > 0.2:
+          pred_img = add_box(pred_img, bbox, sc, cat_id)
+      cv2.imshow('pred{}'.format(k), pred_img)
+      # cv2.imwrite('vis/{}_pred{}.png'.format(i, k), pred_img)
+    cv2.imshow('gt', gt_img)
+    # cv2.imwrite('vis/{}_gt.png'.format(i), gt_img)
+    cv2.waitKey()
+  # coco_eval.evaluate()
+  # coco_eval.accumulate()
+  # coco_eval.summarize()
--- a/src/tools/voc_eval_lib/LICENSE
+++ b/src/tools/voc_eval_lib/LICENSE
+MIT License
+Copyright (c) 2017 Xinlei Chen
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/src/tools/voc_eval_lib/Makefile
+++ b/src/tools/voc_eval_lib/Makefile
+# Declare both targets phony so a file named `all` or `clean` cannot mask them.
+.PHONY: all clean
+# Build the extension modules in place, then drop the temporary build tree.
+all:
+	python setup.py build_ext --inplace
+	rm -rf build
+# Remove compiled bytecode and built extension modules one directory down.
+clean:
+	rm -rf */*.pyc
+	rm -rf */*.so
--- a/src/tools/voc_eval_lib/__init__.py
+++ b/src/tools/voc_eval_lib/__init__.py
--- a/src/tools/voc_eval_lib/datasets/__init__.py
+++ b/src/tools/voc_eval_lib/datasets/__init__.py
--- a/src/tools/voc_eval_lib/datasets/bbox.pyx
+++ b/src/tools/voc_eval_lib/datasets/bbox.pyx
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Sergey Karayev
+# --------------------------------------------------------
+cimport cython
+import numpy as np
+cimport numpy as np
+# np.float was only an alias of the builtin float and was removed in
+# NumPy 1.24; float64 is the same double-precision type under a stable name.
+DTYPE = np.float64
+ctypedef np.float64_t DTYPE_t
+# Pairwise intersection-over-union between two sets of [x1, y1, x2, y2] boxes.
+def bbox_overlaps(
+        np.ndarray[DTYPE_t, ndim=2] boxes,
+        np.ndarray[DTYPE_t, ndim=2] query_boxes):
+    """
+    Parameters
+    ----------
+    boxes: (N, 4) ndarray of float
+    query_boxes: (K, 4) ndarray of float
+    Returns
+    -------
+    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
+    """
+    cdef unsigned int N = boxes.shape[0]
+    cdef unsigned int K = query_boxes.shape[0]
+    # Entries stay 0 for pairs that do not intersect.
+    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
+    cdef DTYPE_t iw, ih, box_area
+    cdef DTYPE_t ua
+    cdef unsigned int k, n
+    for k in range(K):
+        # Area of the k-th query box, using inclusive coordinates (+1).
+        box_area = (
+            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
+            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
+        )
+        for n in range(N):
+            # Width of the intersection; non-positive means no overlap in x.
+            iw = (
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]) + 1
+            )
+            if iw > 0:
+                # Height of the intersection; non-positive means no overlap in y.
+                ih = (
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]) + 1
+                )
+                if ih > 0:
+                    # Union area = area(box n) + area(query k) - intersection.
+                    ua = float(
+                        (boxes[n, 2] - boxes[n, 0] + 1) *
+                        (boxes[n, 3] - boxes[n, 1] + 1) +
+                        box_area - iw * ih
+                    )
+                    overlaps[n, k] = iw * ih / ua
+    return overlaps
--- a/src/tools/voc_eval_lib/datasets/ds_utils.py
+++ b/src/tools/voc_eval_lib/datasets/ds_utils.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+def unique_boxes(boxes, scale=1.0):
+  """Return indices of unique boxes."""
+  v = np.array([1, 1e3, 1e6, 1e9])
+  hashes = np.round(boxes * scale).dot(v)
+  _, index = np.unique(hashes, return_index=True)
+  return np.sort(index)
+def xywh_to_xyxy(boxes):
+  """Convert [x y w h] box format to [x1 y1 x2 y2] format."""
+  return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1))
+def xyxy_to_xywh(boxes):
+  """Convert [x1 y1 x2 y2] box format to [x y w h] format."""
+  return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1))
+def validate_boxes(boxes, width=0, height=0):
+  """Check that a set of boxes are valid."""
+  x1 = boxes[:, 0]
+  y1 = boxes[:, 1]
+  x2 = boxes[:, 2]
+  y2 = boxes[:, 3]
+  assert (x1 >= 0).all()
+  assert (y1 >= 0).all()
+  assert (x2 >= x1).all()
+  assert (y2 >= y1).all()
+  assert (x2 < width).all()
+  assert (y2 < height).all()
+def filter_small_boxes(boxes, min_size):
+  w = boxes[:, 2] - boxes[:, 0]
+  h = boxes[:, 3] - boxes[:, 1]
+  keep = np.where((w >= min_size) & (h > min_size))[0]
+  return keep
--- a/src/tools/voc_eval_lib/datasets/imdb.py
+++ b/src/tools/voc_eval_lib/datasets/imdb.py
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Xinlei Chen
+# Modified by Xingyi Zhou
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import os.path as osp
+import PIL
+# from utils.cython_bbox import bbox_overlaps
+import numpy as np
+import scipy.sparse
+from model.config import cfg
+def bbox_overlaps(box1, box2):
+  area1 = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
+  area2 = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
+  inter = max(min(box1[2], box2[2]) - max(box1[0], box2[0]) + 1, 0) * \
+          max(min(box1[3], box2[3]) - max(box1[1], box2[1]) + 1, 0)
+  iou = 1.0 * inter / (area1 + area2 - inter)
+  return iou
+class imdb(object):
+  """Image database."""
+  def __init__(self, name, classes=None):
+    self._name = name
+    self._num_classes = 0
+    if not classes:
+      self._classes = []
+    else:
+      self._classes = classes
+    self._image_index = []
+    self._obj_proposer = 'gt'
+    self._roidb = None
+    self._roidb_handler = self.default_roidb
+    # Use this dict for storing dataset specific config options
+    self.config = {}
+  @property
+  def name(self):
+    return self._name
+  @property
+  def num_classes(self):
+    return len(self._classes)
+  @property
+  def classes(self):
+    return self._classes
+  @property
+  def image_index(self):
+    return self._image_index
+  @property
+  def roidb_handler(self):
+    return self._roidb_handler
+  @roidb_handler.setter
+  def roidb_handler(self, val):
+    self._roidb_handler = val
+  def set_proposal_method(self, method):
+    method = eval('self.' + method + '_roidb')
+    self.roidb_handler = method
+  @property
+  def roidb(self):
+    # A roidb is a list of dictionaries, each with the following keys:
+    #   boxes
+    #   gt_overlaps
+    #   gt_classes
+    #   flipped
+    if self._roidb is not None:
+      return self._roidb
+    self._roidb = self.roidb_handler()
+    return self._roidb
+  @property
+  def cache_path(self):
+    cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
+    if not os.path.exists(cache_path):
+      os.makedirs(cache_path)
+    return cache_path
+  @property
+  def num_images(self):
+    return len(self.image_index)
+  def image_path_at(self, i):
+    raise NotImplementedError
+  def default_roidb(self):
+    raise NotImplementedError
+  def evaluate_detections(self, all_boxes, output_dir=None):
+    """
+    all_boxes is a list of length number-of-classes.
+    Each list element is a list of length number-of-images.
+    Each of those list elements is either an empty list []
+    or a numpy array of detection.
+    all_boxes[class][image] = [] or np.array of shape #dets x 5
+    """
+    raise NotImplementedError
+  def _get_widths(self):
+    return [PIL.Image.open(self.image_path_at(i)).size[0]
+            for i in range(self.num_images)]
+  def append_flipped_images(self):
+    num_images = self.num_images
+    widths = self._get_widths()
+    for i in range(num_images):
+      boxes = self.roidb[i]['boxes'].copy()
+      oldx1 = boxes[:, 0].copy()
+      oldx2 = boxes[:, 2].copy()
+      boxes[:, 0] = widths[i] - oldx2 - 1
+      boxes[:, 2] = widths[i] - oldx1 - 1
+      assert (boxes[:, 2] >= boxes[:, 0]).all()
+      entry = {'boxes': boxes,
+               'gt_overlaps': self.roidb[i]['gt_overlaps'],
+               'gt_classes': self.roidb[i]['gt_classes'],
+               'flipped': True}
+      self.roidb.append(entry)
+    self._image_index = self._image_index * 2
+  def evaluate_recall(self, candidate_boxes=None, thresholds=None,
+                      area='all', limit=None):
+    """Evaluate detection proposal recall metrics.
+    Returns:
+        results: dictionary of results with keys
+            'ar': average recall
+            'recalls': vector recalls at each IoU overlap threshold
+            'thresholds': vector of IoU overlap thresholds
+            'gt_overlaps': vector of all ground-truth overlaps
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3,
+             '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
+    area_ranges = [[0 ** 2, 1e5 ** 2],  # all
+                   [0 ** 2, 32 ** 2],  # small
+                   [32 ** 2, 96 ** 2],  # medium
+                   [96 ** 2, 1e5 ** 2],  # large
+                   [96 ** 2, 128 ** 2],  # 96-128
+                   [128 ** 2, 256 ** 2],  # 128-256
+                   [256 ** 2, 512 ** 2],  # 256-512
+                   [512 ** 2, 1e5 ** 2],  # 512-inf
+                   ]
+    assert area in areas, 'unknown area range: {}'.format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = np.zeros(0)
+    num_pos = 0
+    for i in range(self.num_images):
+      # Checking for max_overlaps == 1 avoids including crowd annotations
+      # (...pretty hacking :/)
+      max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
+      gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) &
+                         (max_gt_overlaps == 1))[0]
+      gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
+      gt_areas = self.roidb[i]['seg_areas'][gt_inds]
+      valid_gt_inds = np.where((gt_areas >= area_range[0]) &
+                               (gt_areas <= area_range[1]))[0]
+      gt_boxes = gt_boxes[valid_gt_inds, :]
+      num_pos += len(valid_gt_inds)
+      if candidate_boxes is None:
+        # If candidate_boxes is not supplied, the default is to use the
+        # non-ground-truth boxes from this roidb
+        non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
+        boxes = self.roidb[i]['boxes'][non_gt_inds, :]
+      else:
+        boxes = candidate_boxes[i]
+      if boxes.shape[0] == 0:
+        continue
+      if limit is not None and boxes.shape[0] > limit:
+        boxes = boxes[:limit, :]
+      overlaps = bbox_overlaps(boxes.astype(np.float),
+                               gt_boxes.astype(np.float))
+      _gt_overlaps = np.zeros((gt_boxes.shape[0]))
+      for j in range(gt_boxes.shape[0]):
+        # find which proposal box maximally covers each gt box
+        argmax_overlaps = overlaps.argmax(axis=0)
+        # and get the iou amount of coverage for each gt box
+        max_overlaps = overlaps.max(axis=0)
+        # find which gt box is 'best' covered (i.e. 'best' = most iou)
+        gt_ind = max_overlaps.argmax()
+        gt_ovr = max_overlaps.max()
+        assert (gt_ovr >= 0)
+        # find the proposal box that covers the best covered gt box
+        box_ind = argmax_overlaps[gt_ind]
+        # record the iou coverage of this gt box
+        _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+        assert (_gt_overlaps[j] == gt_ovr)
+        # mark the proposal box and the gt box as used
+        overlaps[box_ind, :] = -1
+        overlaps[:, gt_ind] = -1
+      # append recorded iou coverage level
+      gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))
+    gt_overlaps = np.sort(gt_overlaps)
+    if thresholds is None:
+      step = 0.05
+      thresholds = np.arange(0.5, 0.95 + 1e-5, step)
+    recalls = np.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+      recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds,
+            'gt_overlaps': gt_overlaps}
+  def create_roidb_from_box_list(self, box_list, gt_roidb):
+    assert len(box_list) == self.num_images, \
+      'Number of boxes must match number of ground-truth images'
+    roidb = []
+    for i in range(self.num_images):
+      boxes = box_list[i]
+      num_boxes = boxes.shape[0]
+      overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
+      if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
+        gt_boxes = gt_roidb[i]['boxes']
+        gt_classes = gt_roidb[i]['gt_classes']
+        gt_overlaps = bbox_overlaps(boxes.astype(np.float),
+                                    gt_boxes.astype(np.float))
+        argmaxes = gt_overlaps.argmax(axis=1)
+        maxes = gt_overlaps.max(axis=1)
+        I = np.where(maxes > 0)[0]
+        overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]
+      overlaps = scipy.sparse.csr_matrix(overlaps)
+      roidb.append({
+        'boxes': boxes,
+        'gt_classes': np.zeros((num_boxes,), dtype=np.int32),
+        'gt_overlaps': overlaps,
+        'flipped': False,
+        'seg_areas': np.zeros((num_boxes,), dtype=np.float32),
+      })
+    return roidb
+  @staticmethod
+  def merge_roidbs(a, b):
+    assert len(a) == len(b)
+    for i in range(len(a)):
+      a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
+      a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
+                                      b[i]['gt_classes']))
+      a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
+                                                 b[i]['gt_overlaps']])
+      a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'],
+                                     b[i]['seg_areas']))
+    return a
+  def competition_mode(self, on):
+    """Turn competition mode on or off."""
+    pass
--- a/src/tools/voc_eval_lib/datasets/pascal_voc.py
+++ b/src/tools/voc_eval_lib/datasets/pascal_voc.py
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Xinlei Chen
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+from datasets.imdb import imdb
+import datasets.ds_utils as ds_utils
+import xml.etree.ElementTree as ET
+import numpy as np
+import scipy.sparse
+import scipy.io as sio
+# import utils.cython_bbox
+import pickle
+import subprocess
+import uuid
+from .voc_eval import voc_eval
+from model.config import cfg
+class pascal_voc(imdb):
+  def __init__(self, image_set, year, use_diff=False):
+    name = 'voc_' + year + '_' + image_set
+    if use_diff:
+      name += '_diff'
+    imdb.__init__(self, name)
+    self._year = year
+    self._image_set = image_set
+    self._devkit_path = self._get_default_path()
+    self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
+    self._classes = ('__background__',  # always index 0
+                     'aeroplane', 'bicycle', 'bird', 'boat',
+                     'bottle', 'bus', 'car', 'cat', 'chair',
+                     'cow', 'diningtable', 'dog', 'horse',
+                     'motorbike', 'person', 'pottedplant',
+                     'sheep', 'sofa', 'train', 'tvmonitor')
+    self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))
+    self._image_ext = '.jpg'
+    self._image_index = self._load_image_set_index()
+    # Default to roidb handler
+    self._roidb_handler = self.gt_roidb
+    self._salt = str(uuid.uuid4())
+    self._comp_id = 'comp4'
+    # PASCAL specific config options
+    self.config = {'cleanup': True,
+                   'use_salt': True,
+                   'use_diff': use_diff,
+                   'matlab_eval': False,
+                   'rpn_file': None}
+    assert os.path.exists(self._devkit_path), \
+      'VOCdevkit path does not exist: {}'.format(self._devkit_path)
+    assert os.path.exists(self._data_path), \
+      'Path does not exist: {}'.format(self._data_path)
+  def image_path_at(self, i):
+    """
+    Return the absolute path to image i in the image sequence.
+    """
+    return self.image_path_from_index(self._image_index[i])
+  def image_path_from_index(self, index):
+    """
+    Construct an image path from the image's "index" identifier.
+    """
+    image_path = os.path.join(self._data_path, 'JPEGImages',
+                              index + self._image_ext)
+    assert os.path.exists(image_path), \
+      'Path does not exist: {}'.format(image_path)
+    return image_path
+  def _load_image_set_index(self):
+    """
+    Load the indexes listed in this dataset's image set file.
+    """
+    # Example path to image set file:
+    # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
+    image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
+                                  self._image_set + '.txt')
+    assert os.path.exists(image_set_file), \
+      'Path does not exist: {}'.format(image_set_file)
+    with open(image_set_file) as f:
+      image_index = [x.strip() for x in f.readlines()]
+    return image_index
+  def _get_default_path(self):
+    """
+    Return the default path where PASCAL VOC is expected to be installed.
+    """
+    return os.path.join(cfg.DATA_DIR, 'voc', 'VOCdevkit')
+  def gt_roidb(self):
+    """
+    Return the database of ground-truth regions of interest.
+    This function loads/saves from/to a cache file to speed up future calls.
+    """
+    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
+    if os.path.exists(cache_file):
+      with open(cache_file, 'rb') as fid:
+        try:
+          roidb = pickle.load(fid)
+        except:
+          roidb = pickle.load(fid, encoding='bytes')
+      print('{} gt roidb loaded from {}'.format(self.name, cache_file))
+      return roidb
+    gt_roidb = [self._load_pascal_annotation(index)
+                for index in self.image_index]
+    with open(cache_file, 'wb') as fid:
+      pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
+    print('wrote gt roidb to {}'.format(cache_file))
+    return gt_roidb
+  def rpn_roidb(self):
+    if int(self._year) == 2007 or self._image_set != 'test':
+      gt_roidb = self.gt_roidb()
+      rpn_roidb = self._load_rpn_roidb(gt_roidb)
+      roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
+    else:
+      roidb = self._load_rpn_roidb(None)
+    return roidb
+  def _load_rpn_roidb(self, gt_roidb):
+    filename = self.config['rpn_file']
+    print('loading {}'.format(filename))
+    assert os.path.exists(filename), \
+      'rpn data not found at: {}'.format(filename)
+    with open(filename, 'rb') as f:
+      box_list = pickle.load(f)
+    return self.create_roidb_from_box_list(box_list, gt_roidb)
+  def _load_pascal_annotation(self, index):
+    """
+    Load image and bounding boxes info from XML file in the PASCAL VOC
+    format.
+    """
+    filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
+    tree = ET.parse(filename)
+    objs = tree.findall('object')
+    if not self.config['use_diff']:
+      # Exclude the samples labeled as difficult
+      non_diff_objs = [
+        obj for obj in objs if int(obj.find('difficult').text) == 0]
+      # if len(non_diff_objs) != len(objs):
+      #     print 'Removed {} difficult objects'.format(
+      #         len(objs) - len(non_diff_objs))
+      objs = non_diff_objs
+    num_objs = len(objs)
+    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
+    gt_classes = np.zeros((num_objs), dtype=np.int32)
+    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
+    # "Seg" area for pascal is just the box area
+    seg_areas = np.zeros((num_objs), dtype=np.float32)
+    # Load object bounding boxes into a data frame.
+    for ix, obj in enumerate(objs):
+      bbox = obj.find('bndbox')
+      # Make pixel indexes 0-based
+      x1 = float(bbox.find('xmin').text) - 1
+      y1 = float(bbox.find('ymin').text) - 1
+      x2 = float(bbox.find('xmax').text) - 1
+      y2 = float(bbox.find('ymax').text) - 1
+      cls = self._class_to_ind[obj.find('name').text.lower().strip()]
+      boxes[ix, :] = [x1, y1, x2, y2]
+      gt_classes[ix] = cls
+      overlaps[ix, cls] = 1.0
+      seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
+    overlaps = scipy.sparse.csr_matrix(overlaps)
+    return {'boxes': boxes,
+            'gt_classes': gt_classes,
+            'gt_overlaps': overlaps,
+            'flipped': False,
+            'seg_areas': seg_areas}
+  def _get_comp_id(self):
+    comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
+               else self._comp_id)
+    return comp_id
+  def _get_voc_results_file_template(self):
+    # VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
+    filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
+    path = os.path.join(
+      self._devkit_path,
+      'results',
+      'VOC' + self._year,
+      'Main',
+      filename)
+    return path
+  def _write_voc_results_file(self, all_boxes):
+    for cls_ind, cls in enumerate(self.classes):
+      if cls == '__background__':
+        continue
+      # print('Writing {} VOC results file'.format(cls))
+      filename = self._get_voc_results_file_template().format(cls)
+      # print(filename)
+      with open(filename, 'wt') as f:
+        for im_ind, index in enumerate(self.image_index):
+          dets = np.array(all_boxes[cls_ind][im_ind])
+          if len(dets) == 0:
+            continue
+          # the VOCdevkit expects 1-based indices
+          for k in range(dets.shape[0]):
+            f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
+                    format(index, dets[k, -1],
+                           dets[k, 0] + 1, dets[k, 1] + 1,
+                           dets[k, 2] + 1, dets[k, 3] + 1))
+  def _do_python_eval(self, output_dir=None):
+    annopath = os.path.join(
+      self._devkit_path,
+      'VOC' + self._year,
+      'Annotations',
+      '{:s}.xml')
+    imagesetfile = os.path.join(
+      self._devkit_path,
+      'VOC' + self._year,
+      'ImageSets',
+      'Main',
+      self._image_set + '.txt')
+    cachedir = os.path.join(self._devkit_path, 'annotations_cache')
+    aps = []
+    # The PASCAL VOC metric changed in 2010
+    use_07_metric = True if int(self._year) < 2010 else False
+    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
+    if output_dir is not None and not os.path.isdir(output_dir):
+      os.mkdir(output_dir)
+    for i, cls in enumerate(self._classes):
+      if cls == '__background__':
+        continue
+      filename = self._get_voc_results_file_template().format(cls)
+      rec, prec, ap = voc_eval(
+        filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
+        use_07_metric=use_07_metric, use_diff=self.config['use_diff'])
+      aps += [ap]
+      print(('AP for {} = {:.4f}'.format(cls, ap)))
+      if output_dir is not None:
+        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
+          pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
+    print(('Mean AP = {:.4f}'.format(np.mean(aps))))
+    print('~~~~~~~~')
+    '''
+    print('Results:')
+    for ap in aps:
+      print(('{:.3f}'.format(ap)))
+    print(('{:.3f}'.format(np.mean(aps))))
+    print('~~~~~~~~')
+    print('')
+    print('--------------------------------------------------------------')
+    print('Results computed with the **unofficial** Python eval code.')
+    print('Results should be very close to the official MATLAB eval code.')
+    print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
+    print('-- Thanks, The Management')
+    print('--------------------------------------------------------------')
+    '''
+  def _do_matlab_eval(self, output_dir='output'):
+    print('-----------------------------------------------------')
+    print('Computing results with the official MATLAB eval code.')
+    print('-----------------------------------------------------')
+    path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets',
+                        'VOCdevkit-matlab-wrapper')
+    cmd = 'cd {} && '.format(path)
+    cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
+    cmd += '-r "dbstop if error; '
+    cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \
+      .format(self._devkit_path, self._get_comp_id(),
+              self._image_set, output_dir)
+    print(('Running:\n{}'.format(cmd)))
+    status = subprocess.call(cmd, shell=True)
+  def evaluate_detections(self, all_boxes, output_dir=None):
+    """Write the VOC result files for all_boxes and evaluate them.
+    Always runs the Python evaluation; additionally runs the MATLAB
+    evaluation when config['matlab_eval'] is set. When config['cleanup'] is
+    set, the per-class result files are deleted afterwards (the
+    '__background__' class has no file and is skipped).
+    """
+    self._write_voc_results_file(all_boxes)
+    self._do_python_eval(output_dir)
+    if self.config['matlab_eval']:
+      self._do_matlab_eval(output_dir)
+    if not self.config['cleanup']:
+      return
+    foreground = (name for name in self._classes if name != '__background__')
+    for name in foreground:
+      os.remove(self._get_voc_results_file_template().format(name))
+  def competition_mode(self, on):
+    """Switch competition mode on or off.
+    In competition mode both config['use_salt'] and config['cleanup'] are
+    turned off; otherwise both are turned on.
+    """
+    enabled = not on
+    self.config['use_salt'] = enabled
+    self.config['cleanup'] = enabled
+if __name__ == '__main__':
+  # Manual debugging entry point: build the VOC 2007 'trainval' dataset
+  # object, read its roidb attribute, then open an interactive IPython
+  # shell so the local names `d` and `res` can be inspected by hand.
+  from datasets.pascal_voc import pascal_voc
+  d = pascal_voc('trainval', '2007')
+  res = d.roidb
+  # IPython is imported here so it is only needed for this debugging path.
+  from IPython import embed;
+  embed()
--- a/src/tools/voc_eval_lib/datasets/voc_eval.py
+++ b/src/tools/voc_eval_lib/datasets/voc_eval.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import xml.etree.ElementTree as ET
+import os
+import pickle
+import numpy as np
+def parse_rec(filename):
+  """Read the object annotations out of one PASCAL VOC xml file.
+  Returns a list with one dict per 'object' element, holding the keys
+  'name', 'pose', 'truncated', 'difficult' and 'bbox', where 'bbox' is the
+  list [xmin, ymin, xmax, ymax] of ints read from the 'bndbox' child.
+  """
+  def _text(node, tag):
+    # Text content of the first child of `node` called `tag`.
+    return node.find(tag).text
+  records = []
+  for node in ET.parse(filename).findall('object'):
+    record = {
+      'name': _text(node, 'name'),
+      'pose': _text(node, 'pose'),
+      'truncated': int(_text(node, 'truncated')),
+      'difficult': int(_text(node, 'difficult')),
+    }
+    box = node.find('bndbox')
+    record['bbox'] = [int(_text(box, corner))
+                      for corner in ('xmin', 'ymin', 'xmax', 'ymax')]
+    records.append(record)
+  return records
+def voc_ap(rec, prec, use_07_metric=False):
+  """ ap = voc_ap(rec, prec, [use_07_metric])
+  Compute the VOC average precision from recall and precision arrays.
+  With use_07_metric the VOC07 11-point method is used: the best precision
+  at recall >= t is averaged over t = 0, 0.1, ..., 1.0. Otherwise
+  (default) the area under the monotone precision envelope is returned.
+  """
+  if not use_07_metric:
+    # pad both curves with sentinel values at the ends
+    recall = np.concatenate(([0.], rec, [1.]))
+    envelope = np.concatenate(([0.], prec, [0.]))
+    # precision envelope: running maximum taken from the right
+    envelope = np.maximum.accumulate(envelope[::-1])[::-1]
+    # positions where recall changes value
+    steps = np.where(recall[1:] != recall[:-1])[0]
+    # sum (\Delta recall) * prec
+    return np.sum((recall[steps + 1] - recall[steps]) * envelope[steps + 1])
+  # 11 point metric
+  total = 0.
+  for threshold in np.arange(0., 1.1, 0.1):
+    reached = rec >= threshold
+    best = np.max(prec[reached]) if np.sum(reached) != 0 else 0
+    total = total + best / 11.
+  return total
+def voc_eval(detpath,
+             annopath,
+             imagesetfile,
+             classname,
+             cachedir,
+             ovthresh=0.5,
+             use_07_metric=False,
+             use_diff=False):
+  """rec, prec, ap = voc_eval(detpath,
+                              annopath,
+                              imagesetfile,
+                              classname,
+                              cachedir,
+                              [ovthresh],
+                              [use_07_metric],
+                              [use_diff])
+  Top level function that does the PASCAL VOC evaluation.
+  detpath: Path to detections
+      detpath.format(classname) should produce the detection results file.
+      Each line holds an image id, a confidence and the box coordinates
+      (xmin ymin xmax ymax), separated by single spaces.
+  annopath: Path to annotations
+      annopath.format(imagename) should be the xml annotations file.
+  imagesetfile: Text file containing the list of images, one image per line.
+  classname: Category name
+  cachedir: Directory for caching the annotations (created if missing)
+  [ovthresh]: Overlap threshold (default = 0.5); a detection matches a
+      ground-truth box only when their IoU is strictly greater than this.
+  [use_07_metric]: Whether to use VOC07's 11 point AP computation
+      (default False)
+  [use_diff]: If True, objects flagged 'difficult' count as ordinary ground
+      truth. If False (default) they are left out of the positive count and
+      detections matching them are neither true nor false positives.
+  Returns (rec, prec, ap): cumulative recall and precision over the
+  detections sorted by descending confidence, and the average precision.
+  A detection whose image id is not listed in imagesetfile raises KeyError.
+  """
+  # first load gt
+  if not os.path.isdir(cachedir):
+    os.mkdir(cachedir)
+  # NOTE(review): imagesetfile is interpolated as given; when it is an
+  # absolute path os.path.join() discards cachedir and the cache lands next
+  # to the image set file. Kept as is so existing caches are still found.
+  cachefile = os.path.join(cachedir, '%s_annots.pkl' % imagesetfile)
+  # read list of images
+  with open(imagesetfile, 'r') as f:
+    lines = f.readlines()
+  imagenames = [x.strip() for x in lines]
+  if not os.path.isfile(cachefile):
+    # load annotations
+    recs = {}
+    for i, imagename in enumerate(imagenames):
+      recs[imagename] = parse_rec(annopath.format(imagename))
+      if i % 100 == 0:
+        print('Reading annotation for {:d}/{:d}'.format(
+          i + 1, len(imagenames)))
+    # save
+    print('Saving cached annotations to {:s}'.format(cachefile))
+    with open(cachefile, 'wb') as f:
+      pickle.dump(recs, f)
+  else:
+    # load (the cache is a pickle: only point cachedir at trusted files)
+    with open(cachefile, 'rb') as f:
+      try:
+        recs = pickle.load(f)
+      except Exception:
+        # Retry for caches the default decoding cannot read. The first
+        # attempt may have consumed part of the stream, so rewind first.
+        f.seek(0)
+        recs = pickle.load(f, encoding='bytes')
+  # extract gt objects for this class
+  class_recs = {}
+  npos = 0
+  for imagename in imagenames:
+    R = [obj for obj in recs[imagename] if obj['name'] == classname]
+    bbox = np.array([x['bbox'] for x in R])
+    # builtin bool: the np.bool alias was removed in NumPy 1.24
+    if use_diff:
+      difficult = np.zeros(len(R), dtype=bool)
+    else:
+      difficult = np.array([x['difficult'] for x in R]).astype(bool)
+    det = [False] * len(R)
+    npos = npos + int(np.sum(~difficult))
+    class_recs[imagename] = {'bbox': bbox,
+                             'difficult': difficult,
+                             'det': det}
+  # read dets
+  detfile = detpath.format(classname)
+  with open(detfile, 'r') as f:
+    lines = f.readlines()
+  splitlines = [x.strip().split(' ') for x in lines]
+  image_ids = [x[0] for x in splitlines]
+  confidence = np.array([float(x[1]) for x in splitlines])
+  BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+  nd = len(image_ids)
+  tp = np.zeros(nd)
+  fp = np.zeros(nd)
+  if BB.shape[0] > 0:
+    # sort by confidence, highest first
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+    # go down dets and mark TPs and FPs
+    for d in range(nd):
+      R = class_recs[image_ids[d]]
+      bb = BB[d, :].astype(float)
+      ovmax = -np.inf
+      BBGT = R['bbox'].astype(float)
+      if BBGT.size > 0:
+        # compute overlaps
+        # intersection (the +1 treats coordinates as inclusive pixels)
+        ixmin = np.maximum(BBGT[:, 0], bb[0])
+        iymin = np.maximum(BBGT[:, 1], bb[1])
+        ixmax = np.minimum(BBGT[:, 2], bb[2])
+        iymax = np.minimum(BBGT[:, 3], bb[3])
+        iw = np.maximum(ixmax - ixmin + 1., 0.)
+        ih = np.maximum(iymax - iymin + 1., 0.)
+        inters = iw * ih
+        # union
+        uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+               (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+               (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+        overlaps = inters / uni
+        ovmax = np.max(overlaps)
+        jmax = np.argmax(overlaps)
+      if ovmax > ovthresh:
+        # a match with a difficult object is ignored entirely
+        if not R['difficult'][jmax]:
+          if not R['det'][jmax]:
+            tp[d] = 1.
+            # each ground-truth box can be claimed only once
+            R['det'][jmax] = 1
+          else:
+            fp[d] = 1.
+      else:
+        fp[d] = 1.
+  # compute precision recall
+  fp = np.cumsum(fp)
+  tp = np.cumsum(tp)
+  rec = tp / float(npos)
+  # avoid divide by zero in case the first detection matches a difficult
+  # ground truth
+  prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+  ap = voc_ap(rec, prec, use_07_metric)
+  return rec, prec, ap
--- a/src/tools/voc_eval_lib/model/__init__.py
+++ b/src/tools/voc_eval_lib/model/__init__.py
--- a/src/tools/voc_eval_lib/model/bbox_transform.py
+++ b/src/tools/voc_eval_lib/model/bbox_transform.py
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+def bbox_transform(ex_rois, gt_rois):
+  """Compute regression targets that map ex_rois onto gt_rois.
+  Both inputs are indexed as [:, 0..3] = (x1, y1, x2, y2). The result has
+  one row per box with the columns (dx, dy, dw, dh): centre offsets
+  relative to the example box size and log ratios of widths and heights.
+  """
+  def _geometry(rois):
+    # Widths/heights use the inclusive-pixel (+1) convention.
+    widths = rois[:, 2] - rois[:, 0] + 1.0
+    heights = rois[:, 3] - rois[:, 1] + 1.0
+    ctr_x = rois[:, 0] + 0.5 * widths
+    ctr_y = rois[:, 1] + 0.5 * heights
+    return widths, heights, ctr_x, ctr_y
+  ex_w, ex_h, ex_cx, ex_cy = _geometry(ex_rois)
+  gt_w, gt_h, gt_cx, gt_cy = _geometry(gt_rois)
+  columns = (
+    (gt_cx - ex_cx) / ex_w,
+    (gt_cy - ex_cy) / ex_h,
+    np.log(gt_w / ex_w),
+    np.log(gt_h / ex_h),
+  )
+  return np.vstack(columns).transpose()
+def bbox_transform_inv(boxes, deltas):
+  """Apply regression deltas to boxes and return the predicted boxes.
+  deltas holds groups of four columns (dx, dy, dw, dh); the result has the
+  same shape and dtype as deltas, with each group replaced by
+  (x1, y1, x2, y2). With no boxes an empty (0, deltas.shape[1]) array is
+  returned.
+  """
+  if boxes.shape[0] == 0:
+    return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
+  boxes = boxes.astype(deltas.dtype, copy=False)
+  # source box geometry, as column vectors so they broadcast per group
+  src_w = boxes[:, 2] - boxes[:, 0] + 1.0
+  src_h = boxes[:, 3] - boxes[:, 1] + 1.0
+  src_cx = (boxes[:, 0] + 0.5 * src_w)[:, np.newaxis]
+  src_cy = (boxes[:, 1] + 0.5 * src_h)[:, np.newaxis]
+  src_w = src_w[:, np.newaxis]
+  src_h = src_h[:, np.newaxis]
+  # predicted centres and half extents
+  new_cx = deltas[:, 0::4] * src_w + src_cx
+  new_cy = deltas[:, 1::4] * src_h + src_cy
+  half_w = 0.5 * (np.exp(deltas[:, 2::4]) * src_w)
+  half_h = 0.5 * (np.exp(deltas[:, 3::4]) * src_h)
+  out = np.zeros(deltas.shape, dtype=deltas.dtype)
+  out[:, 0::4] = new_cx - half_w  # x1
+  out[:, 1::4] = new_cy - half_h  # y1
+  out[:, 2::4] = new_cx + half_w  # x2
+  out[:, 3::4] = new_cy + half_h  # y2
+  return out
+def clip_boxes(boxes, im_shape):
+  """
+  Clip boxes to image boundaries, in place, and return the same array.
+  Every x coordinate is limited to [0, im_shape[1] - 1] and every y
+  coordinate to [0, im_shape[0] - 1].
+  """
+  max_x = im_shape[1] - 1
+  max_y = im_shape[0] - 1
+  # columns repeat as x1, y1, x2, y2 in groups of four
+  for offset, upper in enumerate((max_x, max_y, max_x, max_y)):
+    column = boxes[:, offset::4]
+    boxes[:, offset::4] = np.maximum(np.minimum(column, upper), 0)
+  return boxes
--- a/src/tools/voc_eval_lib/model/config.py
+++ b/src/tools/voc_eval_lib/model/config.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import os.path as osp
+import numpy as np
+# `pip install easydict` if you don't have it
+from easydict import EasyDict as edict
+__C = edict()
+# Consumers can get config by:
+#   from model.config import cfg
+cfg = __C
+#
+# Training options
+#
+__C.TRAIN = edict()
+# Initial learning rate
+__C.TRAIN.LEARNING_RATE = 0.001
+# Momentum
+__C.TRAIN.MOMENTUM = 0.9
+# Weight decay, for regularization
+__C.TRAIN.WEIGHT_DECAY = 0.0001
+# Factor for reducing the learning rate
+__C.TRAIN.GAMMA = 0.1
+# Step size for reducing the learning rate; currently only one step is supported
+__C.TRAIN.STEPSIZE = [30000]
+# Iteration interval for showing the loss during training on the command line
+__C.TRAIN.DISPLAY = 10
+# Whether to double the learning rate for bias
+__C.TRAIN.DOUBLE_BIAS = True
+# Whether to initialize the weights with a truncated normal distribution
+__C.TRAIN.TRUNCATED = False
+# Whether to have weight decay on bias as well
+__C.TRAIN.BIAS_DECAY = False
+# Whether to add ground truth boxes to the pool when sampling regions
+__C.TRAIN.USE_GT = False
+# Whether to use aspect-ratio grouping of training images, introduced merely for saving
+# GPU memory
+__C.TRAIN.ASPECT_GROUPING = False
+# The number of snapshots kept, older ones are deleted to save space
+__C.TRAIN.SNAPSHOT_KEPT = 3
+# The time interval for saving tensorflow summaries
+__C.TRAIN.SUMMARY_INTERVAL = 180
+# Scale to use during training (can list multiple scales)
+# The scale is the pixel size of an image's shortest side
+__C.TRAIN.SCALES = (600,)
+# Max pixel size of the longest side of a scaled input image
+__C.TRAIN.MAX_SIZE = 1000
+# Images to use per minibatch
+__C.TRAIN.IMS_PER_BATCH = 1
+# Minibatch size (number of regions of interest [ROIs])
+__C.TRAIN.BATCH_SIZE = 128
+# Fraction of minibatch that is labeled foreground (i.e. class > 0)
+__C.TRAIN.FG_FRACTION = 0.25
+# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
+__C.TRAIN.FG_THRESH = 0.5
+# Overlap threshold for a ROI to be considered background (class = 0 if
+# overlap in [LO, HI))
+__C.TRAIN.BG_THRESH_HI = 0.5
+__C.TRAIN.BG_THRESH_LO = 0.1
+# Use horizontally-flipped images during training?
+__C.TRAIN.USE_FLIPPED = True
+# Train bounding-box regressors
+__C.TRAIN.BBOX_REG = True
+# Overlap required between a ROI and ground-truth box in order for that ROI to
+# be used as a bounding-box regression training example
+__C.TRAIN.BBOX_THRESH = 0.5
+# Iterations between snapshots
+__C.TRAIN.SNAPSHOT_ITERS = 5000
+# solver.prototxt specifies the snapshot path prefix, this adds an optional
+# infix to yield the path: prefix[_infix]_iters_XYZ.caffemodel
+# NOTE(review): the wording above is inherited from a Caffe-based version;
+# confirm how this prefix is used when snapshots are named.
+__C.TRAIN.SNAPSHOT_PREFIX = 'res101_faster_rcnn'
+# Normalize the targets (subtract empirical mean, divide by empirical stddev)
+__C.TRAIN.BBOX_NORMALIZE_TARGETS = True
+# Deprecated (inside weights)
+__C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# Normalize the targets using "precomputed" (or made up) means and stdevs
+# (BBOX_NORMALIZE_TARGETS must also be True)
+__C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True
+__C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0)
+__C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2)
+# Train using these proposals
+__C.TRAIN.PROPOSAL_METHOD = 'gt'
+# Make minibatches from images that have similar aspect ratios (i.e. both
+# tall and thin or both short and wide) in order to avoid wasting computation
+# on zero-padding.
+# NOTE(review): no setting follows this comment; it appears to describe
+# TRAIN.ASPECT_GROUPING, which is defined further up.
+# Use RPN to detect objects
+__C.TRAIN.HAS_RPN = True
+# IOU >= thresh: positive example
+__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
+# IOU < thresh: negative example
+__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
+# If an anchor satisfies both the positive and negative conditions, set it to negative
+__C.TRAIN.RPN_CLOBBER_POSITIVES = False
+# Max fraction of foreground examples (the value is a fraction, per the key name)
+__C.TRAIN.RPN_FG_FRACTION = 0.5
+# Total number of examples
+__C.TRAIN.RPN_BATCHSIZE = 256
+# NMS threshold used on RPN proposals
+__C.TRAIN.RPN_NMS_THRESH = 0.7
+# Number of top scoring boxes to keep before applying NMS to RPN proposals
+__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
+# Number of top scoring boxes to keep after applying NMS to RPN proposals
+__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
+# Deprecated (outside weights)
+# NOTE(review): the comment says "outside" but the key is named INSIDE_WEIGHTS.
+__C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# Give the positive RPN examples weight of p * 1 / {num positives}
+# and give negatives a weight of (1 - p)
+# Set to -1.0 to use uniform example weighting
+__C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
+# Whether to use all ground truth bounding boxes for training.
+# For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd''
+__C.TRAIN.USE_ALL_GT = True
+#
+# Testing options
+#
+__C.TEST = edict()
+# Scale to use during testing (can NOT list multiple scales)
+# The scale is the pixel size of an image's shortest side
+__C.TEST.SCALES = (600,)
+# Max pixel size of the longest side of a scaled input image
+__C.TEST.MAX_SIZE = 1000
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+__C.TEST.NMS = 0.3
+# Experimental: treat the (K+1) units in the cls_score layer as linear
+# predictors (trained, eg, with one-vs-rest SVMs).
+__C.TEST.SVM = False
+# Test using bounding-box regressors
+__C.TEST.BBOX_REG = True
+# Propose boxes
+__C.TEST.HAS_RPN = False
+# Test using these proposals
+__C.TEST.PROPOSAL_METHOD = 'gt'
+# NMS threshold used on RPN proposals
+__C.TEST.RPN_NMS_THRESH = 0.7
+# Number of top scoring boxes to keep before applying NMS to RPN proposals
+__C.TEST.RPN_PRE_NMS_TOP_N = 6000
+# Number of top scoring boxes to keep after applying NMS to RPN proposals
+__C.TEST.RPN_POST_NMS_TOP_N = 300
+# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
+# (setting currently disabled)
+# __C.TEST.RPN_MIN_SIZE = 16
+# Testing mode, defaults to 'nms'; 'top' is slower but better
+# See report for details
+__C.TEST.MODE = 'nms'
+# Only useful when TEST.MODE is 'top', specifies the number of top proposals to select
+__C.TEST.RPN_TOP_N = 5000
+#
+# ResNet options
+#
+__C.RESNET = edict()
+# Option to set if max-pooling is appended after crop_and_resize.
+# If true, the region will be resized to a square of 2xPOOLING_SIZE,
+# then 2x2 max-pooling is applied; otherwise the region will be directly
+# resized to a square of POOLING_SIZE
+__C.RESNET.MAX_POOL = False
+# Number of fixed blocks during training; by default the first of all 4 blocks is fixed
+# Range: 0 (none) to 3 (all)
+__C.RESNET.FIXED_BLOCKS = 1
+#
+# MobileNet options
+#
+__C.MOBILENET = edict()
+# Whether to regularize the depth-wise filters during training
+__C.MOBILENET.REGU_DEPTH = False
+# Number of fixed layers during training; by default the bottom 5 of 14 layers are fixed
+# Range: 0 (none) to 12 (all)
+__C.MOBILENET.FIXED_LAYERS = 5
+# Weight decay for the mobilenet weights
+__C.MOBILENET.WEIGHT_DECAY = 0.00004
+# Depth multiplier
+__C.MOBILENET.DEPTH_MULTIPLIER = 1.
+#
+# MISC
+#
+# Pixel mean values (BGR order) as a (1, 1, 3) array
+# We use the same pixel mean for all networks even though it's not exactly what
+# they were trained with
+__C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]])
+# For reproducibility
+__C.RNG_SEED = 3
+# Root directory of project (four levels above the directory holding this file)
+__C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..', '..', '..'))
+# Data directory
+__C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data'))
+# Name (or path to) the matlab executable
+__C.MATLAB = 'matlab'
+# Place outputs under an experiments directory
+__C.EXP_DIR = 'default'
+# Use GPU implementation of non-maximum suppression
+__C.USE_GPU_NMS = True
+# Use an end-to-end tensorflow model.
+# Note: models in E2E tensorflow mode have only been tested in feed-forward mode,
+#       but these models are exportable to other tensorflow instances as GraphDef files.
+__C.USE_E2E_TF = True
+# Default pooling mode, only 'crop' is available
+__C.POOLING_MODE = 'crop'
+# Size of the pooled region after RoI pooling
+__C.POOLING_SIZE = 7
+# Anchor scales for RPN
+__C.ANCHOR_SCALES = [8,16,32]
+# Anchor ratios for RPN
+__C.ANCHOR_RATIOS = [0.5,1,2]
+# Number of filters for the RPN layer
+__C.RPN_CHANNELS = 512
+def get_output_dir(imdb, weights_filename):
+  """Return the directory where experimental artifacts are placed.
+  The path is ROOT_DIR/output/EXP_DIR/imdb.name/weights_filename, with
+  'default' standing in when weights_filename is None. The directory is
+  created if it does not exist yet.
+  """
+  leaf = 'default' if weights_filename is None else weights_filename
+  base = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name))
+  target = osp.join(base, leaf)
+  if not os.path.exists(target):
+    os.makedirs(target)
+  return target
+def get_output_tb_dir(imdb, weights_filename):
+  """Return the directory where tensorflow summaries are placed.
+  The path is ROOT_DIR/tensorboard/EXP_DIR/imdb.name/weights_filename, with
+  'default' standing in when weights_filename is None. The directory is
+  created if it does not exist yet.
+  """
+  leaf = 'default' if weights_filename is None else weights_filename
+  base = osp.abspath(
+    osp.join(__C.ROOT_DIR, 'tensorboard', __C.EXP_DIR, imdb.name))
+  target = osp.join(base, leaf)
+  if not os.path.exists(target):
+    os.makedirs(target)
+  return target
+def _merge_a_into_b(a, b):
+  """Merge config dictionary a into config dictionary b, in place.
+  Options in b are overwritten whenever a also specifies them. Nothing
+  happens unless a is exactly an edict. Raises KeyError for a key that b
+  does not have and ValueError when the value types differ, except that a
+  value is converted when b holds a numpy array under that key.
+  """
+  if type(a) is not edict:
+    return
+  for key, incoming in a.items():
+    # a must specify keys that are in b
+    if key not in b:
+      raise KeyError('{} is not a valid config key'.format(key))
+    current = b[key]
+    # the types must match, too
+    if type(current) is not type(incoming):
+      if not isinstance(current, np.ndarray):
+        raise ValueError(('Type mismatch ({} vs. {}) '
+                          'for config key: {}').format(type(current),
+                                                       type(incoming), key))
+      incoming = np.array(incoming, dtype=current.dtype)
+    if type(incoming) is not edict:
+      b[key] = incoming
+      continue
+    # nested section: merge recursively, naming the key on failure
+    try:
+      _merge_a_into_b(a[key], current)
+    except:
+      print(('Error under config key: {}'.format(key)))
+      raise
+def cfg_from_file(filename):
+  """Load a YAML config file and merge it into the default options.
+  filename: path of the YAML file. Its keys must already exist in the
+  defaults; _merge_a_into_b raises KeyError or ValueError otherwise.
+  """
+  import yaml
+  with open(filename, 'r') as f:
+    # safe_load instead of yaml.load(f): calling load() without a Loader is
+    # deprecated since PyYAML 5.1 and a TypeError in PyYAML 6, and the
+    # default loader could construct arbitrary Python objects.
+    yaml_cfg = edict(yaml.safe_load(f))
+  _merge_a_into_b(yaml_cfg, __C)
+def cfg_from_list(cfg_list):
+  """Set config keys via list (e.g., from command line).
+  cfg_list: flat list [key1, value1, key2, value2, ...]. Keys are dotted
+  paths into the config (e.g. 'TEST.NMS'). Each value is parsed with
+  ast.literal_eval and kept as the raw string when it is not a Python
+  literal. Assertions fail when the list has odd length, a key does not
+  exist, or the parsed value's type differs from the current value's type.
+  """
+  from ast import literal_eval
+  assert len(cfg_list) % 2 == 0
+  for k, v in zip(cfg_list[0::2], cfg_list[1::2]):
+    key_list = k.split('.')
+    # walk down to the dict that holds the final key
+    d = __C
+    for subkey in key_list[:-1]:
+      assert subkey in d
+      d = d[subkey]
+    subkey = key_list[-1]
+    assert subkey in d
+    try:
+      value = literal_eval(v)
+    except (ValueError, TypeError, SyntaxError):
+      # handle the case when v is a string literal; only the errors that
+      # mean "not a Python literal" are caught, so unrelated failures
+      # (e.g. KeyboardInterrupt) are no longer swallowed
+      value = v
+    assert type(value) == type(d[subkey]), \
+      'type {} does not match original type {}'.format(
+        type(value), type(d[subkey]))
+    d[subkey] = value
--- a/src/tools/voc_eval_lib/model/nms_wrapper.py
+++ b/src/tools/voc_eval_lib/model/nms_wrapper.py
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from model.config import cfg
+from nms.gpu_nms import gpu_nms
+from nms.cpu_nms import cpu_nms
+def nms(dets, thresh, force_cpu=False):
+  """Run non-maximum suppression on dets with the CPU or GPU backend.
+  Returns an empty list when dets has no rows. The GPU implementation (on
+  device 0) is used when cfg.USE_GPU_NMS is set and force_cpu is false;
+  otherwise the CPU implementation is used.
+  """
+  if dets.shape[0] == 0:
+    return []
+  use_gpu = cfg.USE_GPU_NMS and not force_cpu
+  if not use_gpu:
+    return cpu_nms(dets, thresh)
+  return gpu_nms(dets, thresh, device_id=0)
--- a/src/tools/voc_eval_lib/model/test.py
+++ b/src/tools/voc_eval_lib/model/test.py
+# --------------------------------------------------------
+# Tensorflow Faster R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xinlei Chen
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import cv2
+import numpy as np
+try:
+  import cPickle as pickle
+except ImportError:
+  import pickle
+import os
+import math
+from utils.timer import Timer
+from utils.blob import im_list_to_blob
+from model.config import cfg, get_output_dir
+from model.bbox_transform import clip_boxes, bbox_transform_inv
+# from model.nms_wrapper import nms  # need to compile cython nms before import nms
+# nms is left unbound so this module imports without the compiled extension.
+# apply_nms() and test_net() in this module call nms(...) and fail with a
+# TypeError while it is None; enable the import above before using them.
+nms = None  # not needed in pascal evaluation
+def _get_image_blob(im):
+  """Converts an image into a network input.
+  Arguments:
+    im (ndarray): a color image in BGR order
+  Returns:
+    blob (ndarray): a data blob holding an image pyramid
+    im_scale_factors (ndarray): image scales (relative to im) used
+      in the image pyramid, one per entry of cfg.TEST.SCALES
+  """
+  # work on a float32 copy with the pixel means removed
+  centered = im.astype(np.float32, copy=True)
+  centered -= cfg.PIXEL_MEANS
+  short_side = np.min(centered.shape[0:2])
+  long_side = np.max(centered.shape[0:2])
+  pyramid = []
+  factors = []
+  for target_size in cfg.TEST.SCALES:
+    factor = float(target_size) / float(short_side)
+    # Prevent the biggest axis from being more than MAX_SIZE
+    if np.round(factor * long_side) > cfg.TEST.MAX_SIZE:
+      factor = float(cfg.TEST.MAX_SIZE) / float(long_side)
+    resized = cv2.resize(centered, None, None, fx=factor, fy=factor,
+                         interpolation=cv2.INTER_LINEAR)
+    factors.append(factor)
+    pyramid.append(resized)
+  # Create a blob to hold the input images
+  return im_list_to_blob(pyramid), np.array(factors)
+def _get_blobs(im):
+  """Convert an image into network inputs.
+  Returns a dict whose 'data' entry is the image blob, together with the
+  scale factors reported by _get_image_blob.
+  """
+  data, im_scale_factors = _get_image_blob(im)
+  return {'data': data}, im_scale_factors
+def _clip_boxes(boxes, im_shape):
+  """Clip boxes to image boundaries, in place, and return the same array.
+  x1/y1 are raised to at least 0; x2/y2 are lowered to at most
+  im_shape[1] - 1 and im_shape[0] - 1 respectively.
+  """
+  height, width = im_shape[0], im_shape[1]
+  # x1 >= 0 and y1 >= 0
+  for offset in (0, 1):
+    boxes[:, offset::4] = np.maximum(boxes[:, offset::4], 0)
+  # x2 < width
+  boxes[:, 2::4] = np.minimum(boxes[:, 2::4], width - 1)
+  # y2 < height
+  boxes[:, 3::4] = np.minimum(boxes[:, 3::4], height - 1)
+  return boxes
+def _rescale_boxes(boxes, inds, scales):
+  """Rescale boxes according to image rescaling.
+  Row i of boxes is divided, in place, by scales[int(inds[i])]; the same
+  array is returned.
+  """
+  for row in range(len(boxes)):
+    factor = scales[int(inds[row])]
+    boxes[row, :] = boxes[row, :] / factor
+  return boxes
+def im_detect(sess, net, im):
+  """Run the network on one image and return (scores, pred_boxes).
+  The image is converted to a blob, passed to net.test_image together with
+  an im_info triple built from the blob shape and the scale, and the
+  returned rois are mapped back to the original image scale. With
+  cfg.TEST.BBOX_REG the predicted deltas are applied and the boxes clipped
+  to the image; otherwise the rois are repeated once per score column.
+  """
+  blobs, im_scales = _get_blobs(im)
+  assert len(im_scales) == 1, "Only single-image batch implemented"
+  scale = im_scales[0]
+  data = blobs['data']
+  blobs['im_info'] = np.array([data.shape[1], data.shape[2], scale],
+                              dtype=np.float32)
+  _, scores, bbox_pred, rois = net.test_image(sess, data, blobs['im_info'])
+  # column 0 of rois is skipped; columns 1..4 hold the box coordinates
+  boxes = rois[:, 1:5] / scale
+  scores = np.reshape(scores, [scores.shape[0], -1])
+  bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
+  if not cfg.TEST.BBOX_REG:
+    # Simply repeat the boxes, once for each class
+    return scores, np.tile(boxes, (1, scores.shape[1]))
+  # Apply bounding-box regression deltas
+  pred_boxes = _clip_boxes(bbox_transform_inv(boxes, bbox_pred), im.shape)
+  return scores, pred_boxes
+def apply_nms(all_boxes, thresh):
+  """Apply non-maximum suppression to all predicted boxes output by the
+  test_net method.
+  all_boxes: all_boxes[cls][image] is a sequence of detections with the
+      columns (x1, y1, x2, y2, score); empty entries are allowed.
+  thresh: overlap threshold handed to nms().
+  Returns a list of the same [cls][image] layout. An entry is a float32
+  array of the kept detections, or an empty list when nothing was kept.
+  Boxes without positive width and height are dropped before NMS.
+  Requires the module-level `nms` to be a callable whenever at least one
+  valid box is present.
+  """
+  num_classes = len(all_boxes)
+  num_images = len(all_boxes[0])
+  nms_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
+  for cls_ind in range(num_classes):
+    for im_ind in range(num_images):
+      dets = np.array(all_boxes[cls_ind][im_ind], dtype=np.float32)
+      if len(dets) == 0:
+        continue
+      # keep only boxes with x2 > x1 and y2 > y1
+      valid = (dets[:, 2] > dets[:, 0]) & (dets[:, 3] > dets[:, 1])
+      dets = dets[valid, :]
+      # Test the row count: comparing an ndarray with [] never detects an
+      # empty array and raises on recent NumPy versions.
+      if dets.shape[0] == 0:
+        continue
+      keep = nms(dets, thresh)
+      if len(keep) == 0:
+        continue
+      nms_boxes[cls_ind][im_ind] = dets[keep, :].copy()
+  return nms_boxes
+def test_net(sess, net, imdb, weights_filename, max_per_image=100, thresh=0.):
+  """Test a Fast R-CNN network on an image database.
+  Runs im_detect on every image of imdb, keeps per class the detections
+  scoring above thresh, applies NMS with cfg.TEST.NMS, and limits each
+  image to max_per_image detections over all classes (no limit when
+  max_per_image <= 0). The detections are pickled to 'detections.pkl' in
+  the output directory and handed to imdb.evaluate_detections.
+  Requires the module-level `nms` to be a callable.
+  """
+  np.random.seed(cfg.RNG_SEED)
+  num_images = len(imdb.image_index)
+  # all detections are collected into:
+  #  all_boxes[cls][image] = N x 5 array of detections in
+  #  (x1, y1, x2, y2, score)
+  all_boxes = [[[] for _ in range(num_images)]
+         for _ in range(imdb.num_classes)]
+  output_dir = get_output_dir(imdb, weights_filename)
+  # timers
+  _t = {'im_detect' : Timer(), 'misc' : Timer()}
+  for i in range(num_images):
+    im = cv2.imread(imdb.image_path_at(i))
+    _t['im_detect'].tic()
+    scores, boxes = im_detect(sess, net, im)
+    _t['im_detect'].toc()
+    _t['misc'].tic()
+    # skip j = 0, because it's the background class
+    for j in range(1, imdb.num_classes):
+      inds = np.where(scores[:, j] > thresh)[0]
+      cls_scores = scores[inds, j]
+      cls_boxes = boxes[inds, j*4:(j+1)*4]
+      cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
+        .astype(np.float32, copy=False)
+      keep = nms(cls_dets, cfg.TEST.NMS)
+      cls_dets = cls_dets[keep, :]
+      all_boxes[j][i] = cls_dets
+    # Limit to max_per_image detections *over all classes*
+    if max_per_image > 0:
+      image_scores = np.hstack([all_boxes[j][i][:, -1]
+                    for j in range(1, imdb.num_classes)])
+      if len(image_scores) > max_per_image:
+        # score of the max_per_image-th best detection in this image
+        image_thresh = np.sort(image_scores)[-max_per_image]
+        for j in range(1, imdb.num_classes):
+          keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
+          all_boxes[j][i] = all_boxes[j][i][keep, :]
+    _t['misc'].toc()
+    print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \
+        .format(i + 1, num_images, _t['im_detect'].average_time,
+            _t['misc'].average_time))
+  det_file = os.path.join(output_dir, 'detections.pkl')
+  with open(det_file, 'wb') as f:
+    pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
+  print('Evaluating detections')
+  imdb.evaluate_detections(all_boxes, output_dir)
--- a/src/tools/voc_eval_lib/nms/.gitignore
+++ b/src/tools/voc_eval_lib/nms/.gitignore
--- a/src/tools/voc_eval_lib/nms/__init__.py
+++ b/src/tools/voc_eval_lib/nms/__init__.py
--- a/src/tools/voc_eval_lib/nms/cpu_nms.c
+++ b/src/tools/voc_eval_lib/nms/cpu_nms.c
--- a/src/tools/voc_eval_lib/nms/cpu_nms.pyx
+++ b/src/tools/voc_eval_lib/nms/cpu_nms.pyx
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+import numpy as np
+cimport numpy as np
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+    # Larger of the two float32 values; a is returned when they are equal.
+    if a >= b:
+        return a
+    return b
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+    # Smaller of the two float32 values; a is returned when they are equal.
+    if a <= b:
+        return a
+    return b
+def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    """Greedy non-maximum suppression on the CPU.
+    dets: float32 array with the columns (x1, y1, x2, y2, score).
+    thresh: a box is suppressed when its IoU with an already kept,
+        higher-scoring box is >= thresh.
+    Returns the list of kept row indices of dets, highest score first.
+    """
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+    # areas use the inclusive-pixel (+1) convention
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    # argsort() returns intp indices, so the buffer is typed intp_t; int_t
+    # (C long) does not match on platforms where long is 32-bit.
+    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]
+    cdef int ndets = dets.shape[0]
+    # np.intp replaces the np.int alias, which was removed in NumPy 1.24
+    cdef np.ndarray[np.intp_t, ndim=1] suppressed = \
+            np.zeros((ndets), dtype=np.intp)
+    # nominal indices
+    cdef int _i, _j
+    # sorted indices
+    cdef int i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        # compare against every lower-scoring box that is still alive
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:
+                suppressed[j] = 1
+    return keep