Unverified commit 0d5233a3, authored by Kai Chen, committed by GitHub

Make data pre-processing pipeline customizable (#935)

* define data pipelines

* update two config files

* minor fix for config files

* allow img_scale to be optional and update config

* add some docstrings

* add extra aug to transform

* bug fix for mask resizing

* fix cropping

* add faster rcnn example

* fix imports

* fix robustness testing

* add img_norm_cfg to img_meta

* fix the inference api with the new data pipeline

* fix proposal loading

* delete args of DefaultFormatBundle

* add more configs

* update configs

* bug fix

* add a brief doc

* update gt_labels in RandomCrop

* fix key error for new apis

* bug fix for masks of crowd bboxes

* add argument data_root

* minor fix

* update new hrnet configs

* update docs

* rename MultiscaleFlipAug to MultiScaleFlipAug

* add __repr__ for all transforms

* move DATA_PIPELINE.md to docs/

* fix image url
parent 7bb38af4
@@ -59,6 +59,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -66,36 +91,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=True),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
@@ -59,6 +59,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -66,36 +91,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=True),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
@@ -57,6 +57,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -64,35 +89,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# runner configs
@@ -57,6 +57,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -64,35 +89,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# runner configs
@@ -57,6 +57,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -64,35 +89,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# runner configs
@@ -59,6 +59,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -66,35 +91,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# runner configs
@@ -59,6 +59,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_label=False),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -66,35 +91,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# runner configs
@@ -107,6 +107,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -114,35 +139,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=True,
with_label=True),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=True,
with_label=True),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
@@ -124,6 +124,31 @@ dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
@@ -131,35 +156,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=True,
with_crowd=True,
with_label=True),
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=True,
with_crowd=True,
with_label=True),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
@@ -47,6 +47,43 @@ test_cfg = dict(
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='PhotoMetricDistortion',
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
dict(
type='Expand',
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
dict(
type='MinIoURandomCrop',
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3),
dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(300, 300),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=8,
workers_per_gpu=3,
@@ -57,51 +94,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True,
test_mode=False,
extra_aug=dict(
photo_metric_distortion=dict(
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
expand=dict(
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
random_crop=dict(
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)),
resize_keep_ratio=False)),
pipeline=train_pipeline)),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
optimizer_config = dict()
@@ -47,6 +47,43 @@ test_cfg = dict(
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='PhotoMetricDistortion',
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
dict(
type='Expand',
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
dict(
type='MinIoURandomCrop',
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3),
dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(300, 300),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=8,
workers_per_gpu=3,
@@ -57,51 +94,17 @@ data = dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True,
test_mode=False,
extra_aug=dict(
photo_metric_distortion=dict(
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
expand=dict(
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
random_crop=dict(
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)),
resize_keep_ratio=False)),
pipeline=train_pipeline)),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(512, 512),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)
optimizer_config = dict()
@@ -23,6 +23,7 @@ model = dict(
anchor_ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
target_means=(.0, .0, .0, .0),
target_stds=(0.1, 0.1, 0.2, 0.2)))
# model training and testing settings
cudnn_benchmark = True
train_cfg = dict(
assigner=dict(
@@ -42,11 +43,47 @@ test_cfg = dict(
min_bbox_size=0,
score_thr=0.02,
max_per_img=200)
# model training and testing settings
# dataset settings
dataset_type = 'WIDERFaceDataset'
data_root = 'data/WIDERFace/'
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='PhotoMetricDistortion',
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
dict(
type='Expand',
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
dict(
type='MinIoURandomCrop',
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3),
dict(type='Resize', img_scale=(300, 300), keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(300, 300),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
imgs_per_gpu=60,
workers_per_gpu=2,
@@ -55,57 +92,20 @@ data = dict(
times=2,
dataset=dict(
type=dataset_type,
ann_file=[
data_root + 'train.txt',
],
img_prefix=[data_root + 'WIDER_train/'],
img_scale=(300, 300),
min_size=17, # throw away very small faces to improve training,
# because 300x300 is too low resolution to detect them
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True,
test_mode=False,
extra_aug=dict(
photo_metric_distortion=dict(
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
expand=dict(
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 4)),
random_crop=dict(
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3)),
resize_keep_ratio=False)),
ann_file=data_root + 'train.txt',
img_prefix=data_root + 'WIDER_train/',
min_size=17,
pipeline=train_pipeline)),
val=dict(
type=dataset_type,
ann_file=data_root + '/val.txt',
ann_file=data_root + 'val.txt',
img_prefix=data_root + 'WIDER_val/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False),
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + '/val.txt',
ann_file=data_root + 'val.txt',
img_prefix=data_root + 'WIDER_val/',
img_scale=(300, 300),
img_norm_cfg=img_norm_cfg,
size_divisor=None,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True,
resize_keep_ratio=False))
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)
optimizer_config = dict()
@@ -122,7 +122,7 @@ log_config = dict(
interval=1,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
## Data preparation pipeline
The data preparation pipeline and the dataset are decoupled. Usually a dataset
defines how to process the annotations, while a data pipeline defines all the steps to prepare a data dict.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and outputs a dict for the next transform.
We present a typical pipeline in the following figure. The blue blocks are pipeline operations. As the pipeline proceeds, each operator can add new keys (marked green) to the result dict or update existing keys (marked orange).
![pipeline figure](../demo/data_pipeline.png)
The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
Here is a pipeline example for Faster R-CNN.
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
```
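Since every operation follows the same dict-in/dict-out contract, custom steps can be mixed into a pipeline as plain callables. Below is a minimal sketch; `MinWidthFilter` and its `min_width` parameter are hypothetical examples, not part of the codebase.
```python
from mmdet.datasets.pipelines import Compose


class MinWidthFilter(object):
    """Hypothetical transform: drops gt boxes narrower than `min_width`
    pixels. Takes a results dict and returns a results dict."""

    def __init__(self, min_width=4):
        self.min_width = min_width

    def __call__(self, results):
        bboxes = results['gt_bboxes']
        keep = (bboxes[:, 2] - bboxes[:, 0]) >= self.min_width
        results['gt_bboxes'] = bboxes[keep]
        results['gt_labels'] = results['gt_labels'][keep]
        return results  # handed to the next transform in the pipeline

    def __repr__(self):
        return self.__class__.__name__ + '(min_width={})'.format(
            self.min_width)


# Compose accepts config dicts, plain callables, or a mix of both.
pipeline = Compose([
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    MinWidthFilter(min_width=4),
])
```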
For each operation, we list the related dict fields that are added/updated/removed.
### Data loading
`LoadImageFromFile`
- add: img, img_shape, ori_shape
`LoadAnnotations`
- add: gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg, bbox_fields, mask_fields
`LoadProposals`
- add: proposals
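As a concrete illustration, after the two loading steps of the Faster R-CNN example above, the results dict might look roughly like this (all shapes and values are schematic placeholders, not output of real code):
```python
import numpy as np

# Schematic contents of the results dict after LoadImageFromFile and
# LoadAnnotations(with_bbox=True); values are made up for illustration.
results = dict(
    img=np.zeros((427, 640, 3), dtype=np.uint8),  # the loaded image
    img_shape=(427, 640, 3),  # current shape, updated by later transforms
    ori_shape=(427, 640, 3),  # original shape, never updated
    gt_bboxes=np.array([[10., 20., 110., 120.]], dtype=np.float32),
    gt_bboxes_ignore=np.zeros((0, 4), dtype=np.float32),
    gt_labels=np.array([1], dtype=np.int64),
    bbox_fields=['gt_bboxes_ignore', 'gt_bboxes'],
    mask_fields=[])
```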
### Pre-processing
`Resize`
- add: scale, scale_idx, pad_shape, scale_factor, keep_ratio
- update: img, img_shape, *bbox_fields, *mask_fields
`RandomFlip`
- add: flip
- update: img, *bbox_fields, *mask_fields
`Pad`
- add: pad_fixed_size, pad_size_divisor
- update: img, pad_shape, *mask_fields
`RandomCrop`
- update: img, pad_shape, gt_bboxes, gt_labels, gt_masks, *bbox_fields
`Normalize`
- add: img_norm_cfg
- update: img
`SegResizeFlipPadRescale`
- update: gt_semantic_seg
`PhotoMetricDistortion`
- update: img
`Expand`
- update: img, gt_bboxes
`MinIoURandomCrop`
- update: img, gt_bboxes, gt_labels
`Corrupt`
- update: img
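Because `Resize` owns the `scale` and `scale_factor` bookkeeping, multi-scale training now lives entirely in the pipeline config. A hedged sketch (`multiscale_mode` mirrors the old dataset argument of the same name):
```python
# One way to enable multi-scale training: 'value' picks one of the listed
# scales per image, while 'range' would sample between the two endpoints.
multiscale_resize = dict(
    type='Resize',
    img_scale=[(1333, 640), (1333, 800)],
    multiscale_mode='value',
    keep_ratio=True)
```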
### Formatting
`ToTensor`
- update: specified by `keys`.
`ImageToTensor`
- update: specified by `keys`.
`Transpose`
- update: specified by `keys`.
`ToDataContainer`
- update: specified by `fields`.
`DefaultFormatBundle`
- update: img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg
`Collect`
- add: img_meta (the keys of img_meta are specified by `meta_keys`)
- remove: all other keys except for those specified by `keys`
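For example, a typical `Collect` step for training might read as follows; the `meta_keys` shown here are, to the best of our knowledge, the defaults, and can be overridden if the model needs extra per-image information:
```python
collect = dict(
    type='Collect',
    keys=['img', 'gt_bboxes', 'gt_labels'],
    meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
               'scale_factor', 'flip', 'img_norm_cfg'))
```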
### Test time augmentation
`MultiScaleFlipAug`
- applies the wrapped `transforms` once per test scale (and per flip setting), collecting the outputs into parallel lists
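A hedged sketch of a heavier test-time augmentation setup, reusing the `img_norm_cfg` defined above and adding a second scale plus horizontal flipping:
```python
tta_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=[(1333, 800), (1666, 1000)],  # two test scales
        flip=True,  # also run each scale on the horizontally flipped image
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
```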
@@ -89,12 +89,10 @@ model = init_detector(config_file, checkpoint_file, device='cuda:0')
# test a single image and show the results
img = 'test.jpg' # or img = mmcv.imread(img), which will only load it once
result = inference_detector(model, img)
# visualize the results in a new window
show_result(img, result, model.CLASSES)
# test a list of images and write the results to image files
imgs = ['test1.jpg', 'test2.jpg']
for i, result in enumerate(inference_detector(model, imgs)):
show_result(imgs[i], result, model.CLASSES, out_file='result_{}.jpg'.format(i))
# or save the visualization results to image files
show_result(img, result, model.CLASSES, out_file='result.jpg')
# test a video and show the results
video = mmcv.VideoReader('video.mp4')
@@ -5,11 +5,11 @@ import mmcv
import numpy as np
import pycocotools.mask as maskUtils
import torch
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint
from mmdet.core import get_classes
from mmdet.datasets import to_tensor
from mmdet.datasets.transforms import ImageTransform
from mmdet.datasets.pipelines import Compose
from mmdet.models import build_detector
@@ -46,7 +46,16 @@ def init_detector(config, checkpoint=None, device='cuda:0'):
return model
def inference_detector(model, imgs):
class LoadImage(object):
def __call__(self, results):
img = mmcv.imread(results['img'])
results['img'] = img
results['ori_shape'] = img.shape
return results
def inference_detector(model, img):
"""Inference image(s) with the detector.
Args:
@@ -59,45 +68,19 @@ def inference_detector(model, imgs):
detection results directly.
"""
cfg = model.cfg
img_transform = ImageTransform(
size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
device = next(model.parameters()).device # model device
if not isinstance(imgs, list):
return _inference_single(model, imgs, img_transform, device)
else:
return _inference_generator(model, imgs, img_transform, device)
def _prepare_data(img, img_transform, cfg, device):
ori_shape = img.shape
img, img_shape, pad_shape, scale_factor = img_transform(
img,
scale=cfg.data.test.img_scale,
keep_ratio=cfg.data.test.get('resize_keep_ratio', True))
img = to_tensor(img).to(device).unsqueeze(0)
img_meta = [
dict(
ori_shape=ori_shape,
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=False)
]
return dict(img=[img], img_meta=[img_meta])
def _inference_single(model, img, img_transform, device):
img = mmcv.imread(img)
data = _prepare_data(img, img_transform, model.cfg, device)
# build the data pipeline
test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
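    # LoadImage stands in for the config's LoadImageFromFile step, so this
    # API accepts either an image file path or an already-loaded ndarray
    # (mmcv.imread returns ndarray inputs unchanged).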
test_pipeline = Compose(test_pipeline)
# prepare data
data = dict(img=img)
data = test_pipeline(data)
data = scatter(collate([data], samples_per_gpu=1), [device])[0]
# forward the model
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
return result
def _inference_generator(model, imgs, img_transform, device):
for img in imgs:
yield _inference_single(model, img, img_transform, device)
return result
# TODO: merge this method with the one in BaseDetector
@@ -78,12 +78,12 @@ class DistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
gt_bboxes = []
gt_labels = []
gt_ignore = [] if self.dataset.with_crowd else None
gt_ignore = []
for i in range(len(self.dataset)):
ann = self.dataset.get_ann_info(i)
bboxes = ann['bboxes']
labels = ann['labels']
if gt_ignore is not None:
if 'bboxes_ignore' in ann:
ignore = np.concatenate([
np.zeros(bboxes.shape[0], dtype=np.bool),
np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool)
@@ -93,6 +93,8 @@ class DistEvalmAPHook(DistEvalHook):
labels = np.concatenate([labels, ann['labels_ignore']])
gt_bboxes.append(bboxes)
gt_labels.append(labels)
if not gt_ignore:
gt_ignore = None
# If the dataset is VOC2007, then use 11 points mAP evaluation.
if hasattr(self.dataset, 'year') and self.dataset.year == 2007:
ds_name = 'voc07'
@@ -6,7 +6,6 @@ from .dataset_wrappers import ConcatDataset, RepeatDataset
from .extra_aug import ExtraAugmentation
from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
from .registry import DATASETS
from .utils import random_scale, show_ann, to_tensor
from .voc import VOCDataset
from .wider_face import WIDERFaceDataset
from .xml_style import XMLDataset
@@ -14,7 +13,6 @@ from .xml_style import XMLDataset
__all__ = [
'CustomDataset', 'XMLDataset', 'CocoDataset', 'VOCDataset',
'CityscapesDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'to_tensor', 'random_scale', 'show_ann',
'ConcatDataset', 'RepeatDataset', 'ExtraAugmentation', 'WIDERFaceDataset',
'DATASETS', 'build_dataset'
'build_dataloader', 'ConcatDataset', 'RepeatDataset', 'ExtraAugmentation',
'WIDERFaceDataset', 'DATASETS', 'build_dataset'
]
@@ -42,7 +42,7 @@ class CocoDataset(CustomDataset):
img_id = self.img_infos[idx]['id']
ann_ids = self.coco.getAnnIds(imgIds=[img_id])
ann_info = self.coco.loadAnns(ann_ids)
return self._parse_ann_info(ann_info, self.with_mask)
return self._parse_ann_info(self.img_infos[idx], ann_info)
def _filter_imgs(self, min_size=32):
"""Filter images too small or without ground truths."""
@@ -55,7 +55,7 @@
valid_inds.append(i)
return valid_inds
def _parse_ann_info(self, ann_info, with_mask=True):
def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox and mask annotation.
Args:
@@ -64,19 +64,14 @@
Returns:
dict: A dict containing the following keys: bboxes, bboxes_ignore,
labels, masks, mask_polys, poly_lens.
labels, masks, seg_map. "masks" are raw annotations and not
decoded into binary masks.
"""
gt_bboxes = []
gt_labels = []
gt_bboxes_ignore = []
# Two formats are provided.
# 1. mask: a binary map of the same size of the image.
# 2. polys: each mask consists of one or several polys, each poly is a
# list of float.
if with_mask:
gt_masks = []
gt_mask_polys = []
gt_poly_lens = []
gt_masks_ann = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
@@ -84,19 +79,13 @@
if ann['area'] <= 0 or w < 1 or h < 1:
continue
bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
if ann['iscrowd']:
if ann.get('iscrowd', False):
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
if with_mask:
gt_masks.append(self.coco.annToMask(ann))
mask_polys = [
p for p in ann['segmentation'] if len(p) >= 6
] # valid polygons have >= 3 points (6 coordinates)
poly_lens = [len(p) for p in mask_polys]
gt_mask_polys.append(mask_polys)
gt_poly_lens.extend(poly_lens)
gt_masks_ann.append(ann['segmentation'])
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
@@ -109,12 +98,13 @@
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
seg_map = img_info['filename'].replace('jpg', 'png')
ann = dict(
bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
bboxes=gt_bboxes,
labels=gt_labels,
bboxes_ignore=gt_bboxes_ignore,
masks=gt_masks_ann,
seg_map=seg_map)
if with_mask:
ann['masks'] = gt_masks
# poly format is not used in the current implementation
ann['mask_polys'] = gt_mask_polys
ann['poly_lens'] = gt_poly_lens
return ann
import os.path as osp
import warnings
import mmcv
import numpy as np
from imagecorruptions import corrupt
from mmcv.parallel import DataContainer as DC
from torch.utils.data import Dataset
from .extra_aug import ExtraAugmentation
from .pipelines import Compose
from .registry import DATASETS
from .transforms import (BboxTransform, ImageTransform, MaskTransform,
Numpy2Tensor, SegMapTransform)
from .utils import random_scale, to_tensor
@DATASETS.register_module
@@ -27,7 +21,7 @@ class CustomDataset(Dataset):
'ann': {
'bboxes': <np.ndarray> (n, 4),
'labels': <np.ndarray> (n, ),
'bboxes_ignore': <np.ndarray> (k, 4),
'bboxes_ignore': <np.ndarray> (k, 4), (optional field)
'labels_ignore': <np.ndarray> (k, ) (optional field)
}
},
@@ -41,33 +35,35 @@
def __init__(self,
ann_file,
img_prefix,
img_scale,
img_norm_cfg,
multiscale_mode='value',
size_divisor=None,
proposal_file=None,
num_max_proposals=1000,
flip_ratio=0,
with_mask=True,
with_crowd=True,
with_label=True,
with_semantic_seg=False,
pipeline,
data_root=None,
img_prefix=None,
seg_prefix=None,
seg_scale_factor=1,
extra_aug=None,
resize_keep_ratio=True,
corruption=None,
corruption_severity=1,
skip_img_without_anno=True,
proposal_file=None,
test_mode=False):
# prefix of images path
self.ann_file = ann_file
self.data_root = data_root
self.img_prefix = img_prefix
self.seg_prefix = seg_prefix
self.proposal_file = proposal_file
self.test_mode = test_mode
# join paths if data_root is specified
if self.data_root is not None:
if not osp.isabs(self.ann_file):
self.ann_file = osp.join(self.data_root, self.ann_file)
if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
self.img_prefix = osp.join(self.data_root, self.img_prefix)
if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
if not (self.proposal_file is None
or osp.isabs(self.proposal_file)):
self.proposal_file = osp.join(self.data_root,
self.proposal_file)
# load annotations (and proposals)
self.img_infos = self.load_annotations(ann_file)
if proposal_file is not None:
self.proposals = self.load_proposals(proposal_file)
self.img_infos = self.load_annotations(self.ann_file)
if self.proposal_file is not None:
self.proposals = self.load_proposals(self.proposal_file)
else:
self.proposals = None
# filter images with no annotation during training
@@ -76,67 +72,11 @@
self.img_infos = [self.img_infos[i] for i in valid_inds]
if self.proposals is not None:
self.proposals = [self.proposals[i] for i in valid_inds]
# (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
self.img_scales = img_scale if isinstance(img_scale,
list) else [img_scale]
assert mmcv.is_list_of(self.img_scales, tuple)
# normalization configs
self.img_norm_cfg = img_norm_cfg
# multi-scale mode (only applicable for multi-scale training)
self.multiscale_mode = multiscale_mode
assert multiscale_mode in ['value', 'range']
# max proposals per image
self.num_max_proposals = num_max_proposals
# flip ratio
self.flip_ratio = flip_ratio
assert flip_ratio >= 0 and flip_ratio <= 1
# padding border to ensure the image size can be divided by
# size_divisor (used for FPN)
self.size_divisor = size_divisor
# with mask or not (reserved field, takes no effect)
self.with_mask = with_mask
# some datasets provide bbox annotations as ignore/crowd/difficult,
# if `with_crowd` is True, then these info is returned.
self.with_crowd = with_crowd
# with label is False for RPN
self.with_label = with_label
# with semantic segmentation (stuff) annotation or not
self.with_seg = with_semantic_seg
# prefix of semantic segmentation map path
self.seg_prefix = seg_prefix
# rescale factor for segmentation maps
self.seg_scale_factor = seg_scale_factor
# in test mode or not
self.test_mode = test_mode
# set group flag for the sampler
if not self.test_mode:
self._set_group_flag()
# transforms
self.img_transform = ImageTransform(
size_divisor=self.size_divisor, **self.img_norm_cfg)
self.bbox_transform = BboxTransform()
self.mask_transform = MaskTransform()
self.seg_transform = SegMapTransform(self.size_divisor)
self.numpy2tensor = Numpy2Tensor()
# if use extra augmentation
if extra_aug is not None:
self.extra_aug = ExtraAugmentation(**extra_aug)
else:
self.extra_aug = None
# image rescale if keep ratio
self.resize_keep_ratio = resize_keep_ratio
self.skip_img_without_anno = skip_img_without_anno
# corruptions
self.corruption = corruption
self.corruption_severity = corruption_severity
# processing pipeline
self.pipeline = Compose(pipeline)
def __len__(self):
return len(self.img_infos)
@@ -150,6 +90,13 @@
def get_ann_info(self, idx):
return self.img_infos[idx]['ann']
def pre_pipeline(self, results):
results['img_prefix'] = self.img_prefix
results['seg_prefix'] = self.seg_prefix
results['proposal_file'] = self.proposal_file
results['bbox_fields'] = []
results['mask_fields'] = []
def _filter_imgs(self, min_size=32):
"""Filter images too small."""
valid_inds = []
@@ -186,164 +133,17 @@
def prepare_train_img(self, idx):
img_info = self.img_infos[idx]
# load image
img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
# corruption
if self.corruption is not None:
img = corrupt(
img,
severity=self.corruption_severity,
corruption_name=self.corruption)
# load proposals if necessary
if self.proposals is not None:
proposals = self.proposals[idx][:self.num_max_proposals]
# TODO: Handle empty proposals properly. Currently images with
# no proposals are just ignored, but they can be used for
# training in concept.
if len(proposals) == 0:
return None
if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
raise AssertionError(
'proposals should have shapes (n, 4) or (n, 5), '
'but found {}'.format(proposals.shape))
if proposals.shape[1] == 5:
scores = proposals[:, 4, None]
proposals = proposals[:, :4]
else:
scores = None
ann = self.get_ann_info(idx)
gt_bboxes = ann['bboxes']
gt_labels = ann['labels']
if self.with_crowd:
gt_bboxes_ignore = ann['bboxes_ignore']
# skip the image if there is no valid gt bbox
if len(gt_bboxes) == 0 and self.skip_img_without_anno:
warnings.warn('Skip the image "%s" that has no valid gt bbox' %
osp.join(self.img_prefix, img_info['filename']))
return None
# extra augmentation
if self.extra_aug is not None:
img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
gt_labels)
# apply transforms
flip = True if np.random.rand() < self.flip_ratio else False
# randomly sample a scale
img_scale = random_scale(self.img_scales, self.multiscale_mode)
img, img_shape, pad_shape, scale_factor = self.img_transform(
img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
img = img.copy()
if self.with_seg:
gt_seg = mmcv.imread(
osp.join(self.seg_prefix,
img_info['filename'].replace('jpg', 'png')),
flag='unchanged')
gt_seg = self.seg_transform(gt_seg.squeeze(), img_scale, flip)
gt_seg = mmcv.imrescale(
gt_seg, self.seg_scale_factor, interpolation='nearest')
gt_seg = gt_seg[None, ...]
ann_info = self.get_ann_info(idx)
results = dict(img_info=img_info, ann_info=ann_info)
if self.proposals is not None:
proposals = self.bbox_transform(proposals, img_shape, scale_factor,
flip)
proposals = np.hstack([proposals, scores
]) if scores is not None else proposals
gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
flip)
if self.with_crowd:
gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
scale_factor, flip)
if self.with_mask:
gt_masks = self.mask_transform(ann['masks'], pad_shape,
scale_factor, flip)
ori_shape = (img_info['height'], img_info['width'], 3)
img_meta = dict(
ori_shape=ori_shape,
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=flip)
data = dict(
img=DC(to_tensor(img), stack=True),
img_meta=DC(img_meta, cpu_only=True),
gt_bboxes=DC(to_tensor(gt_bboxes)))
if self.proposals is not None:
data['proposals'] = DC(to_tensor(proposals))
if self.with_label:
data['gt_labels'] = DC(to_tensor(gt_labels))
if self.with_crowd:
data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
if self.with_mask:
data['gt_masks'] = DC(gt_masks, cpu_only=True)
if self.with_seg:
data['gt_semantic_seg'] = DC(to_tensor(gt_seg), stack=True)
return data
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
def prepare_test_img(self, idx):
"""Prepare an image for testing (multi-scale and flipping)"""
img_info = self.img_infos[idx]
img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
# corruption
if self.corruption is not None:
img = corrupt(
img,
severity=self.corruption_severity,
corruption_name=self.corruption)
# load proposals if necessary
if self.proposals is not None:
proposal = self.proposals[idx][:self.num_max_proposals]
if not (proposal.shape[1] == 4 or proposal.shape[1] == 5):
raise AssertionError(
'proposals should have shapes (n, 4) or (n, 5), '
'but found {}'.format(proposal.shape))
else:
proposal = None
def prepare_single(img, scale, flip, proposal=None):
_img, img_shape, pad_shape, scale_factor = self.img_transform(
img, scale, flip, keep_ratio=self.resize_keep_ratio)
_img = to_tensor(_img)
_img_meta = dict(
ori_shape=(img_info['height'], img_info['width'], 3),
img_shape=img_shape,
pad_shape=pad_shape,
scale_factor=scale_factor,
flip=flip)
if proposal is not None:
if proposal.shape[1] == 5:
score = proposal[:, 4, None]
proposal = proposal[:, :4]
else:
score = None
_proposal = self.bbox_transform(proposal, img_shape,
scale_factor, flip)
_proposal = np.hstack([_proposal, score
]) if score is not None else _proposal
_proposal = to_tensor(_proposal)
else:
_proposal = None
return _img, _img_meta, _proposal
imgs = []
img_metas = []
proposals = []
for scale in self.img_scales:
_img, _img_meta, _proposal = prepare_single(
img, scale, False, proposal)
imgs.append(_img)
img_metas.append(DC(_img_meta, cpu_only=True))
proposals.append(_proposal)
if self.flip_ratio > 0:
_img, _img_meta, _proposal = prepare_single(
img, scale, True, proposal)
imgs.append(_img)
img_metas.append(DC(_img_meta, cpu_only=True))
proposals.append(_proposal)
data = dict(img=imgs, img_meta=img_metas)
results = dict(img_info=img_info)
if self.proposals is not None:
data['proposals'] = proposals
return data
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
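To see the refactored pieces work together, here is a hedged end-to-end sketch of the new dataset/pipeline contract; the paths are placeholders assuming a standard COCO layout:
```python
from mmdet.datasets import build_dataset

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
# data_root (added in this commit) is joined onto the relative paths below.
dataset = build_dataset(
    dict(
        type='CocoDataset',
        data_root='data/coco/',
        ann_file='annotations/instances_train2017.json',
        img_prefix='train2017/',
        pipeline=train_pipeline))
sample = dataset[0]  # pre_pipeline() seeds the dict, then the pipeline runs
```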