Commit 2aa165a7 authored by wofmanaf's avatar wofmanaf Committed by zhe chen
Browse files

crowd_human detection

parent 76b26f09
# Dataset settings for single-class ('person') CrowdHuman detection.
dataset_type = 'CrowdHumanDataset'
data_root = 'data/CrowdHuman/'
classes = ('person',)

# ImageNet normalization statistics; images are converted BGR -> RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training: single-scale resize plus random horizontal flips.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]

# Testing: one scale, no flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# Val and test both read the validation annotations (the real test set
# labels are not public).
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'classes': classes,
        'filter_empty_gt': True,
        'ann_file': data_root + 'annotations/annotation_train.json',
        'img_prefix': data_root + 'Images',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'classes': classes,
        'ann_file': data_root + 'annotations/annotation_val.json',
        'img_prefix': data_root + 'Images',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'classes': classes,
        'ann_file': data_root + 'annotations/annotation_val.json',
        'img_prefix': data_root + 'Images',
        'pipeline': test_pipeline,
    },
}

# COCO-style bbox mAP. NOTE(review): interval=100 means evaluation almost
# never fires during a normal schedule — confirm this is intentional.
evaluation = {'interval': 100, 'metric': 'bbox'}
# model settings
# Cascade R-CNN detector with a ResNet-50 + FPN backbone.
model = dict(
    type='CascadeRCNN',
    # ImageNet-pretrained ResNet-50; early layers frozen (frozen_stages=1)
    # and BN kept in eval mode during training (norm_eval=True).
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),  # feed all four stages to the FPN
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],  # ResNet-50 stage output widths
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),  # one stride per FPN output level
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    # Three cascaded bbox heads; regression target stds shrink per stage
    # (0.1 -> 0.05 -> 0.033) as proposals get progressively tighter.
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            # Stage 1 head.
            # NOTE(review): num_classes=80 is the COCO default even though
            # this config targets the single 'person' class of CrowdHuman —
            # presumably overridden by a derived config; verify.
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            # Stage 2 head (tighter regression stds).
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            # Stage 3 head (tightest regression stds).
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        # Per-stage RCNN sampling: IoU thresholds tighten 0.5 -> 0.6 -> 0.7,
        # mirroring the three cascade heads above.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                # NOTE(review): mask_size is configured but no mask head is
                # defined in this model — appears unused; confirm.
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            # NOTE(review): mask threshold set despite no mask head — confirm.
            mask_thr_binary=0.5)))
# CrowdHuman
## Introduction
Introduced by Shao et al. in [CrowdHuman: A Benchmark for Detecting Human in a Crowd](https://arxiv.org/pdf/1805.00123.pdf)
CrowdHuman is a benchmark dataset to better evaluate detectors in crowd scenarios. The CrowdHuman dataset is large, rich-annotated and contains high diversity. CrowdHuman contains 15000, 4370 and 5000 images for training, validation, and testing, respectively. There are a total of 470K human instances from train and validation subsets and 23 persons per image, with various kinds of occlusions in the dataset. Each human instance is annotated with a head bounding-box, human visible-region bounding-box and human full-body bounding-box. We hope our dataset will serve as a solid baseline and help promote future research in human detection tasks.
## Prepare the data
Download the original dataset from [CrowdHuman](https://www.crowdhuman.org/download.html), then convert the `.odgt` annotations to COCO-style JSON with `detection/tools/create_crowd_anno.py`.
- Data Tree of CrowdHuman should look like:
```bash
$ tree CrowdHuman
CrowdHuman
├── annotations
│ ├── annotation_train.json
│ ├── annotation_train.odgt
│ ├── annotation_val.json
│ ├── annotation_val.odgt
│ └── ...
└── Images
├── 1074488,79b360006b38332b.jpg
├── 1074488,79d54000c6f9d9e5.jpg
    └── ...
```
## Model Zoo
### Cascade Mask R-CNN + InternImage
| backbone | schd | box mAP | mask mAP | train speed | train time | #param | FLOPs | Config | Download |
| :------------: | :---------: |:-------:|:--------:|:-----------:|:-----------:|:------:|:-----:| :---: |:--------:|
| InternImage-XL | 3x | TBD | TBD | TBD | TBD | TBD | TBD | [config](./cascade_internimage_xl_fpn_3x_crowd_human.py) | TBD |
- Training speed is measured with A100 GPUs using current code and may be faster than the speed in logs.
- Some logs are our recent newly trained ones. There might be slight differences between the results in logs and our paper.
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Compose this config from the shared base files: model definition,
# CrowdHuman dataset, 3x training schedule, and the default runtime.
_base_ = [
    '../_base_/models/cascade_mask_rcnn_r50_fpn_crowdhuman.py',
    '../_base_/datasets/crowd_human.py',
    '../_base_/schedules/schedule_3x.py',
    '../_base_/default_runtime.py'
]
# InternImage-XL classification checkpoint used to initialize the backbone
# (per the file name, ImageNet-22K weights fine-tuned from 192 to 384 px).
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_xl_22k_192to384.pth'
# Override the base model: swap the ResNet backbone for InternImage-XL and
# redefine the cascade bbox heads for the single 'person' class.
model = dict(
    backbone=dict(
        _delete_=True,  # discard all inherited ResNet backbone settings
        type='InternImage',
        core_op='DCNv3',
        channels=192,
        depths=[5, 5, 24, 5],
        groups=[12, 24, 48, 96],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # presumably gradient checkpointing — confirm
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='FPN',
        in_channels=[192, 384, 768, 1536],  # InternImage-XL stage widths
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    # Three cascade stages, num_classes=1 ('person'); regression stds shrink
    # per stage (0.1 -> 0.05 -> 0.033) as in the standard cascade recipe.
    roi_head=dict(
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]))
# ImageNet normalization statistics; images are converted BGR -> RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Multi-scale training with AutoAugment: either resize directly (policy 1),
# or resize small, random-crop, then resize again (policy 2). Short sides
# run 480..800 in steps of 32 with the long side capped at 1333.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {
        'type': 'AutoAugment',
        'policies': [
            [
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in range(480, 801, 32)],
                    'multiscale_mode': 'value',
                    'keep_ratio': True,
                },
            ],
            [
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in (400, 500, 600)],
                    'multiscale_mode': 'value',
                    'keep_ratio': True,
                },
                {
                    'type': 'RandomCrop',
                    'crop_type': 'absolute_range',
                    'crop_size': (384, 600),
                    'allow_negative_crop': True,
                },
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in range(480, 801, 32)],
                    'multiscale_mode': 'value',
                    'override': True,
                    'keep_ratio': True,
                },
            ],
        ],
    },
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Per-GPU batch of 2; the full run uses 8 nodes for a total batch size of 128.
data = {
    'samples_per_gpu': 2,
    'train': {'pipeline': train_pipeline},
}

# AdamW with per-layer learning-rate decay over the InternImage-XL blocks;
# _delete_ drops the inherited optimizer settings entirely.
optimizer = {
    '_delete_': True,
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 39,
        'layer_decay_rate': 0.94,
        'depths': [5, 5, 24, 5],
    },
}
optimizer_config = {'grad_clip': None}
# Mixed precision can be enabled with:
# fp16 = dict(loss_scale=dict(init_scale=512))
evaluation = {'save_best': 'auto'}
# Keep only the 3 most recent checkpoints plus the last one.
checkpoint_config = {
    'interval': 1,
    'max_keep_ckpts': 3,
    'save_last': True,
}
...@@ -4,4 +4,5 @@ ...@@ -4,4 +4,5 @@
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
from .models import * # noqa: F401,F403 from .models import * # noqa: F401,F403
\ No newline at end of file from .datasets import *
\ No newline at end of file
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .crowd_human import CrowdHumanDataset
\ No newline at end of file
This diff is collapsed.
import argparse
import os
import pickle as pkl
import numpy as np
import random
from PIL import Image
import concurrent.futures
import json
import mmcv
def parse_args(argv=None):
    """Parse command-line options for the annotation converter.

    Args:
        argv (list[str] | None): Argument strings to parse; ``None`` falls
            back to ``sys.argv[1:]`` (argparse's default), so existing
            callers are unaffected.

    Returns:
        tuple: ``(dataset, dataset_split)`` as given on the command line.
    """
    parser = argparse.ArgumentParser(
        description='Generate MMDetection Annotations for Crowdhuman-like dataset')
    # required=True: previously a missing flag silently yielded None and the
    # script would go on to look up paths like 'data/None/'.
    parser.add_argument('--dataset', help='dataset name', type=str,
                        required=True)
    parser.add_argument('--dataset-split', help='dataset split, e.g. train, val',
                        type=str, required=True)
    args = parser.parse_args(argv)
    return args.dataset, args.dataset_split
def load_func(fpath):
    """Load a CrowdHuman ``.odgt`` annotation file (one JSON record per line).

    Args:
        fpath (str): Path to the ``.odgt`` file.

    Returns:
        list[dict]: One decoded record per non-blank line.

    Raises:
        FileNotFoundError: If ``fpath`` does not exist.
    """
    if not os.path.exists(fpath):
        # Explicit error instead of `assert`, which is stripped under -O.
        raise FileNotFoundError(fpath)
    with open(fpath, 'r', encoding='utf-8') as fid:
        # json.loads tolerates the trailing newline; skip blank lines so a
        # stray empty line no longer crashes the conversion.
        return [json.loads(line) for line in fid if line.strip()]
def decode_annotations(records, dataset_path):
    """Convert CrowdHuman ``.odgt`` records into a COCO-style annotation dict.

    Only the full-body box ('fbox') of each instance is kept. Boxes tagged
    with something other than 'person', or marked ignored via the 'extra'
    field, are emitted with ``iscrowd=1`` so detectors can skip them.

    Args:
        records (list[dict]): Records loaded from an ``.odgt`` file.
        dataset_path (str): Dataset root containing an ``Images/`` directory.

    Returns:
        dict: COCO-style dict with 'images', 'annotations' and 'categories'.

    Raises:
        FileNotFoundError: If a referenced image file is missing.
    """
    img_list = []
    ann_list = []
    ann_id = 1
    for idx, record in enumerate(records):
        img_id = record['ID']
        img_url = dataset_path + 'Images/' + img_id + '.jpg'
        if not os.path.exists(img_url):
            # Explicit error instead of `assert` (stripped under -O).
            raise FileNotFoundError(img_url)
        # Open only to read the size; the context manager closes the file
        # handle PIL would otherwise keep open lazily.
        with Image.open(img_url) as im:
            im_w, im_h = im.width, im.height
        img_list.append(dict(
            file_name=img_id + '.jpg',
            height=im_h,
            width=im_w,
            id=idx,
        ))
        for each_data in record['gtboxes']:
            x, y, w, h = each_data['fbox']
            if w <= 0 or h <= 0:
                # Skip degenerate boxes.
                continue
            # tag 1 = real person; -2 = non-person or explicitly ignored.
            tag = 1 if each_data['tag'] == 'person' else -2
            if each_data.get('extra', {}).get('ignore', 0) != 0:
                tag = -2
            ann_list.append(dict(
                area=w * h,
                iscrowd=1 if tag == -2 else 0,
                image_id=idx,
                bbox=[x, y, w, h],
                category_id=1,
                id=ann_id,
            ))
            ann_id += 1
    cate_list = [{'supercategory': 'none', 'id': 1, 'name': 'person'}]
    return dict(
        images=img_list,
        annotations=ann_list,
        categories=cate_list,
    )
if __name__ == "__main__":
    # Convert a CrowdHuman-style .odgt annotation file into COCO JSON,
    # writing it next to the source file under data/<dataset>/annotations/.
    dataset_name, dataset_split = parse_args()
    dataset_path = f'data/{dataset_name}/'
    odgt_path = f'{dataset_path}annotations/annotation_{dataset_split}.odgt'
    json_path = f'{dataset_path}annotations/annotation_{dataset_split}.json'

    records = load_func(odgt_path)
    print("Loading Annotations Done")

    coco_dict = decode_annotations(records, dataset_path)
    print(f"Parsing Bbox Number: {len(coco_dict['annotations'])}")
    mmcv.dump(coco_dict, json_path)
from .compute_APMR import compute_APMR
from .compute_JI import compute_JI_with_ignore
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment