Commit 2aa165a7 authored by wofmanaf's avatar wofmanaf Committed by zhe chen
Browse files

crowd_human detection

parent 76b26f09
# Dataset settings for single-class ('person') CrowdHuman detection.
dataset_type = 'CrowdHumanDataset'
data_root = 'data/CrowdHuman/'
classes = ('person',)

# ImageNet normalization statistics; images are converted BGR -> RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Training: single-scale resize plus random horizontal flips.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'Resize', 'img_scale': (1333, 800), 'keep_ratio': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]

# Testing: one scale, no flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (1333, 800),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]

# Val and test both read the validation annotations (the real test set
# labels are not public).
data = {
    'samples_per_gpu': 2,
    'workers_per_gpu': 2,
    'train': {
        'type': dataset_type,
        'classes': classes,
        'filter_empty_gt': True,
        'ann_file': data_root + 'annotations/annotation_train.json',
        'img_prefix': data_root + 'Images',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'classes': classes,
        'ann_file': data_root + 'annotations/annotation_val.json',
        'img_prefix': data_root + 'Images',
        'pipeline': test_pipeline,
    },
    'test': {
        'type': dataset_type,
        'classes': classes,
        'ann_file': data_root + 'annotations/annotation_val.json',
        'img_prefix': data_root + 'Images',
        'pipeline': test_pipeline,
    },
}

# COCO-style bbox mAP. NOTE(review): interval=100 means evaluation almost
# never fires during a normal schedule — confirm this is intentional.
evaluation = {'interval': 100, 'metric': 'bbox'}
# model settings
# Cascade R-CNN detector with a ResNet-50 + FPN backbone.
model = dict(
    type='CascadeRCNN',
    # ImageNet-pretrained ResNet-50; early layers frozen (frozen_stages=1)
    # and BN kept in eval mode during training (norm_eval=True).
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),  # feed all four stages to the FPN
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],  # ResNet-50 stage output widths
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),  # one stride per FPN output level
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    # Three cascaded bbox heads; regression target stds shrink per stage
    # (0.1 -> 0.05 -> 0.033) as proposals get progressively tighter.
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            # Stage 1 head.
            # NOTE(review): num_classes=80 is the COCO default even though
            # this config targets the single 'person' class of CrowdHuman —
            # presumably overridden by a derived config; verify.
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            # Stage 2 head (tighter regression stds).
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            # Stage 3 head (tightest regression stds).
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        # Per-stage RCNN sampling: IoU thresholds tighten 0.5 -> 0.6 -> 0.7,
        # mirroring the three cascade heads above.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                # NOTE(review): mask_size is configured but no mask head is
                # defined in this model — appears unused; confirm.
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            # NOTE(review): mask threshold set despite no mask head — confirm.
            mask_thr_binary=0.5)))
# CrowdHuman
## Introduction
Introduced by Shao et al. in [CrowdHuman: A Benchmark for Detecting Human in a Crowd](https://arxiv.org/pdf/1805.00123.pdf)
CrowdHuman is a benchmark dataset to better evaluate detectors in crowd scenarios. The CrowdHuman dataset is large, rich-annotated and contains high diversity. CrowdHuman contains 15000, 4370 and 5000 images for training, validation, and testing, respectively. There are a total of 470K human instances from train and validation subsets and 23 persons per image, with various kinds of occlusions in the dataset. Each human instance is annotated with a head bounding-box, human visible-region bounding-box and human full-body bounding-box. We hope our dataset will serve as a solid baseline and help promote future research in human detection tasks.
## Prepare the data
Download the original dataset from [CrowdHuman](https://www.crowdhuman.org/download.html), then convert the `.odgt` annotations to COCO-style JSON with `detection/tools/create_crowd_anno.py`.
- Data Tree of CrowdHuman should look like:
```bash
$ tree CrowdHuman
CrowdHuman
├── annotations
│ ├── annotation_train.json
│ ├── annotation_train.odgt
│ ├── annotation_val.json
│ ├── annotation_val.odgt
│ └── ...
└── Images
├── 1074488,79b360006b38332b.jpg
├── 1074488,79d54000c6f9d9e5.jpg
    └── ...
```
## Model Zoo
### Cascade Mask R-CNN + InternImage
| backbone | schd | box mAP | mask mAP | train speed | train time | #param | FLOPs | Config | Download |
| :------------: | :---------: |:-------:|:--------:|:-----------:|:-----------:|:------:|:-----:| :---: |:--------:|
| InternImage-XL | 3x | TBD | TBD | TBD | TBD | TBD | TBD | [config](./cascade_internimage_xl_fpn_3x_crowd_human.py) | TBD |
- Training speed is measured with A100 GPUs using current code and may be faster than the speed in logs.
- Some logs are our recent newly trained ones. There might be slight differences between the results in logs and our paper.
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Compose this config from the shared base files: model definition,
# CrowdHuman dataset, 3x training schedule, and the default runtime.
_base_ = [
    '../_base_/models/cascade_mask_rcnn_r50_fpn_crowdhuman.py',
    '../_base_/datasets/crowd_human.py',
    '../_base_/schedules/schedule_3x.py',
    '../_base_/default_runtime.py'
]
# InternImage-XL classification checkpoint used to initialize the backbone
# (per the file name, ImageNet-22K weights fine-tuned from 192 to 384 px).
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_xl_22k_192to384.pth'
# Override the base model: swap the ResNet backbone for InternImage-XL and
# redefine the cascade bbox heads for the single 'person' class.
model = dict(
    backbone=dict(
        _delete_=True,  # discard all inherited ResNet backbone settings
        type='InternImage',
        core_op='DCNv3',
        channels=192,
        depths=[5, 5, 24, 5],
        groups=[12, 24, 48, 96],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # presumably gradient checkpointing — confirm
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='FPN',
        in_channels=[192, 384, 768, 1536],  # InternImage-XL stage widths
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    # Three cascade stages, num_classes=1 ('person'); regression stds shrink
    # per stage (0.1 -> 0.05 -> 0.033) as in the standard cascade recipe.
    roi_head=dict(
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=1,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]))
# ImageNet normalization statistics; images are converted BGR -> RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Multi-scale training with AutoAugment: either resize directly (policy 1),
# or resize small, random-crop, then resize again (policy 2). Short sides
# run 480..800 in steps of 32 with the long side capped at 1333.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    {
        'type': 'AutoAugment',
        'policies': [
            [
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in range(480, 801, 32)],
                    'multiscale_mode': 'value',
                    'keep_ratio': True,
                },
            ],
            [
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in (400, 500, 600)],
                    'multiscale_mode': 'value',
                    'keep_ratio': True,
                },
                {
                    'type': 'RandomCrop',
                    'crop_type': 'absolute_range',
                    'crop_size': (384, 600),
                    'allow_negative_crop': True,
                },
                {
                    'type': 'Resize',
                    'img_scale': [(s, 1333) for s in range(480, 801, 32)],
                    'multiscale_mode': 'value',
                    'override': True,
                    'keep_ratio': True,
                },
            ],
        ],
    },
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Per-GPU batch of 2; the full run uses 8 nodes for a total batch size of 128.
data = {
    'samples_per_gpu': 2,
    'train': {'pipeline': train_pipeline},
}

# AdamW with per-layer learning-rate decay over the InternImage-XL blocks;
# _delete_ drops the inherited optimizer settings entirely.
optimizer = {
    '_delete_': True,
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 39,
        'layer_decay_rate': 0.94,
        'depths': [5, 5, 24, 5],
    },
}
optimizer_config = {'grad_clip': None}
# Mixed precision can be enabled with:
# fp16 = dict(loss_scale=dict(init_scale=512))
evaluation = {'save_best': 'auto'}
# Keep only the 3 most recent checkpoints plus the last one.
checkpoint_config = {
    'interval': 1,
    'max_keep_ckpts': 3,
    'save_last': True,
}
...@@ -4,4 +4,5 @@ ...@@ -4,4 +4,5 @@
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
from .models import * # noqa: F401,F403 from .models import * # noqa: F401,F403
\ No newline at end of file from .datasets import *
\ No newline at end of file
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .crowd_human import CrowdHumanDataset
\ No newline at end of file
This diff is collapsed.
import argparse
import os
import pickle as pkl
import numpy as np
import random
from PIL import Image
import concurrent.futures
import json
import mmcv
def parse_args(argv=None):
    """Parse command-line options for the annotation converter.

    Args:
        argv (list[str] | None): Argument strings to parse; ``None`` falls
            back to ``sys.argv[1:]`` (argparse's default), so existing
            callers are unaffected.

    Returns:
        tuple: ``(dataset, dataset_split)`` as given on the command line.
    """
    parser = argparse.ArgumentParser(
        description='Generate MMDetection Annotations for Crowdhuman-like dataset')
    # required=True: previously a missing flag silently yielded None and the
    # script would go on to look up paths like 'data/None/'.
    parser.add_argument('--dataset', help='dataset name', type=str,
                        required=True)
    parser.add_argument('--dataset-split', help='dataset split, e.g. train, val',
                        type=str, required=True)
    args = parser.parse_args(argv)
    return args.dataset, args.dataset_split
def load_func(fpath):
    """Load a CrowdHuman ``.odgt`` annotation file (one JSON record per line).

    Args:
        fpath (str): Path to the ``.odgt`` file.

    Returns:
        list[dict]: One decoded record per non-blank line.

    Raises:
        FileNotFoundError: If ``fpath`` does not exist.
    """
    if not os.path.exists(fpath):
        # Explicit error instead of `assert`, which is stripped under -O.
        raise FileNotFoundError(fpath)
    with open(fpath, 'r', encoding='utf-8') as fid:
        # json.loads tolerates the trailing newline; skip blank lines so a
        # stray empty line no longer crashes the conversion.
        return [json.loads(line) for line in fid if line.strip()]
def decode_annotations(records, dataset_path):
    """Convert CrowdHuman ``.odgt`` records into a COCO-style annotation dict.

    Only the full-body box ('fbox') of each instance is kept. Boxes tagged
    with something other than 'person', or marked ignored via the 'extra'
    field, are emitted with ``iscrowd=1`` so detectors can skip them.

    Args:
        records (list[dict]): Records loaded from an ``.odgt`` file.
        dataset_path (str): Dataset root containing an ``Images/`` directory.

    Returns:
        dict: COCO-style dict with 'images', 'annotations' and 'categories'.

    Raises:
        FileNotFoundError: If a referenced image file is missing.
    """
    img_list = []
    ann_list = []
    ann_id = 1
    for idx, record in enumerate(records):
        img_id = record['ID']
        img_url = dataset_path + 'Images/' + img_id + '.jpg'
        if not os.path.exists(img_url):
            # Explicit error instead of `assert` (stripped under -O).
            raise FileNotFoundError(img_url)
        # Open only to read the size; the context manager closes the file
        # handle PIL would otherwise keep open lazily.
        with Image.open(img_url) as im:
            im_w, im_h = im.width, im.height
        img_list.append(dict(
            file_name=img_id + '.jpg',
            height=im_h,
            width=im_w,
            id=idx,
        ))
        for each_data in record['gtboxes']:
            x, y, w, h = each_data['fbox']
            if w <= 0 or h <= 0:
                # Skip degenerate boxes.
                continue
            # tag 1 = real person; -2 = non-person or explicitly ignored.
            tag = 1 if each_data['tag'] == 'person' else -2
            if each_data.get('extra', {}).get('ignore', 0) != 0:
                tag = -2
            ann_list.append(dict(
                area=w * h,
                iscrowd=1 if tag == -2 else 0,
                image_id=idx,
                bbox=[x, y, w, h],
                category_id=1,
                id=ann_id,
            ))
            ann_id += 1
    cate_list = [{'supercategory': 'none', 'id': 1, 'name': 'person'}]
    return dict(
        images=img_list,
        annotations=ann_list,
        categories=cate_list,
    )
if __name__ == "__main__":
    # Convert a CrowdHuman-style .odgt annotation file into COCO JSON,
    # writing it next to the source file under data/<dataset>/annotations/.
    dataset_name, dataset_split = parse_args()
    dataset_path = f'data/{dataset_name}/'
    odgt_path = f'{dataset_path}annotations/annotation_{dataset_split}.odgt'
    json_path = f'{dataset_path}annotations/annotation_{dataset_split}.json'

    records = load_func(odgt_path)
    print("Loading Annotations Done")

    coco_dict = decode_annotations(records, dataset_path)
    print(f"Parsing Bbox Number: {len(coco_dict['annotations'])}")
    mmcv.dump(coco_dict, json_path)
from .compute_APMR import compute_APMR
from .compute_JI import compute_JI_with_ignore
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment