Commit 37c8cebc authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
Pipeline #318 failed with stages
in 0 seconds
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-3,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained='torchvision://resnet50',
backbone=dict(
type='MSPN',
unit_channels=256,
num_stages=3,
num_units=4,
num_blocks=[3, 4, 6, 3],
norm_cfg=dict(type='BN')),
keypoint_head=dict(
type='TopdownHeatmapMSMUHead',
out_shape=(64, 48),
unit_channels=256,
out_channels=channel_cfg['num_output_channels'],
num_stages=3,
num_units=4,
use_prm=False,
norm_cfg=dict(type='BN'),
loss_keypoint=([
dict(
type='JointsMSELoss', use_target_weight=True, loss_weight=0.25)
] * 3 + [
dict(
type='JointsOHKMMSELoss',
use_target_weight=True,
loss_weight=1.)
]) * 3),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='megvii',
shift_heatmap=False,
modulate_kernel=5))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
use_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
kernel=[(15, 15), (11, 11), (9, 9), (7, 7)] * 2 + [(11, 11), (9, 9),
(7, 7), (5, 5)],
encoding='Megvii'),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=4,
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-3,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(
type='RSN',
unit_channels=256,
num_stages=3,
num_units=4,
num_blocks=[3, 4, 6, 3],
num_steps=4,
norm_cfg=dict(type='BN')),
keypoint_head=dict(
type='TopdownHeatmapMSMUHead',
out_shape=(64, 48),
unit_channels=256,
out_channels=channel_cfg['num_output_channels'],
num_stages=3,
num_units=4,
use_prm=False,
norm_cfg=dict(type='BN'),
loss_keypoint=([
dict(
type='JointsMSELoss', use_target_weight=True, loss_weight=0.25)
] * 3 + [
dict(
type='JointsOHKMMSELoss',
use_target_weight=True,
loss_weight=1.)
]) * 3),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='megvii',
shift_heatmap=False,
modulate_kernel=5))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
use_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
kernel=[(15, 15), (11, 11), (9, 9), (7, 7)] * 2 + [(11, 11), (9, 9),
(7, 7), (5, 5)],
encoding='Megvii'),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=4,
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-3,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained='torchvision://resnet50',
backbone=dict(
type='MSPN',
unit_channels=256,
num_stages=4,
num_units=4,
num_blocks=[3, 4, 6, 3],
norm_cfg=dict(type='BN')),
keypoint_head=dict(
type='TopdownHeatmapMSMUHead',
out_shape=(64, 48),
unit_channels=256,
out_channels=channel_cfg['num_output_channels'],
num_stages=4,
num_units=4,
use_prm=False,
norm_cfg=dict(type='BN'),
loss_keypoint=([
dict(
type='JointsMSELoss', use_target_weight=True, loss_weight=0.25)
] * 3 + [
dict(
type='JointsOHKMMSELoss',
use_target_weight=True,
loss_weight=1.)
]) * 4),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='megvii',
shift_heatmap=False,
modulate_kernel=5))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
use_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
kernel=[(15, 15), (11, 11), (9, 9), (7, 7)] * 3 + [(11, 11), (9, 9),
(7, 7), (5, 5)],
encoding='Megvii'),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=4,
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="https://proceedings.neurips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf">AlexNet (NeurIPS'2012)</a></summary>
```bibtex
@inproceedings{krizhevsky2012imagenet,
title={Imagenet classification with deep convolutional neural networks},
author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
booktitle={Advances in neural information processing systems},
pages={1097--1105},
year={2012}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [pose_alexnet](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/alexnet_coco_256x192.py) | 256x192 | 0.397 | 0.758 | 0.381 | 0.478 | 0.822 | [ckpt](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192_20200727.log.json) |
Collections:
- Name: AlexNet
Paper:
Title: Imagenet classification with deep convolutional neural networks
URL: https://proceedings.neurips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/backbones/alexnet.md
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/alexnet_coco_256x192.py
In Collection: AlexNet
Metadata:
Architecture:
- AlexNet
Training Data: COCO
Name: topdown_heatmap_alexnet_coco_256x192
Results:
- Dataset: COCO
Metrics:
AP: 0.397
AP@0.5: 0.758
AP@0.75: 0.381
AR: 0.478
AR@0.5: 0.822
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/alexnet/alexnet_coco_256x192-a7b1fd15_20200727.pth
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(type='AlexNet', num_classes=-1),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=256,
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[40, 56],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=64,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2016/html/Wei_Convolutional_Pose_Machines_CVPR_2016_paper.html">CPM (CVPR'2016)</a></summary>
```bibtex
@inproceedings{wei2016convolutional,
title={Convolutional pose machines},
author={Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={4724--4732},
year={2016}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco_256x192.py) | 256x192 | 0.623 | 0.859 | 0.704 | 0.686 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192-aa4ba095_20200817.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192_20200817.log.json) |
| [cpm](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco_384x288.py) | 384x288 | 0.650 | 0.864 | 0.725 | 0.708 | 0.905 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_384x288-80feb4bc_20200821.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_384x288_20200821.log.json) |
Collections:
- Name: CPM
Paper:
Title: Convolutional pose machines
URL: http://openaccess.thecvf.com/content_cvpr_2016/html/Wei_Convolutional_Pose_Machines_CVPR_2016_paper.html
README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/backbones/cpm.md
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco_256x192.py
In Collection: CPM
Metadata:
Architecture: &id001
- CPM
Training Data: COCO
Name: topdown_heatmap_cpm_coco_256x192
Results:
- Dataset: COCO
Metrics:
AP: 0.623
AP@0.5: 0.859
AP@0.75: 0.704
AR: 0.686
AR@0.5: 0.903
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192-aa4ba095_20200817.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco_384x288.py
In Collection: CPM
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_cpm_coco_384x288
Results:
- Dataset: COCO
Metrics:
AP: 0.65
AP@0.5: 0.864
AP@0.75: 0.725
AR: 0.708
AR@0.5: 0.905
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_384x288-80feb4bc_20200821.pth
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(
type='CPM',
in_channels=3,
out_channels=channel_cfg['num_output_channels'],
feat_channels=128,
num_stages=6),
keypoint_head=dict(
type='TopdownHeatmapMultiStageHead',
in_channels=channel_cfg['num_output_channels'],
out_channels=channel_cfg['num_output_channels'],
num_stages=6,
num_deconv_layers=0,
extra=dict(final_conv_kernel=0, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[24, 32],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=64,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(
type='CPM',
in_channels=3,
out_channels=channel_cfg['num_output_channels'],
feat_channels=128,
num_stages=6),
keypoint_head=dict(
type='TopdownHeatmapMultiStageHead',
in_channels=channel_cfg['num_output_channels'],
out_channels=channel_cfg['num_output_channels'],
num_stages=6,
num_deconv_layers=0,
extra=dict(final_conv_kernel=0, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[288, 384],
heatmap_size=[36, 48],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=3),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(
type='HourglassNet',
num_stacks=1,
),
keypoint_head=dict(
type='TopdownHeatmapMultiStageHead',
in_channels=256,
out_channels=channel_cfg['num_output_channels'],
num_stages=1,
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained=None,
backbone=dict(
type='HourglassNet',
num_stacks=1,
),
keypoint_head=dict(
type='TopdownHeatmapMultiStageHead',
in_channels=256,
out_channels=channel_cfg['num_output_channels'],
num_stages=1,
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[384, 384],
heatmap_size=[96, 96],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=3),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-46484-8_29">Hourglass (ECCV'2016)</a></summary>
```bibtex
@inproceedings{newell2016stacked,
title={Stacked hourglass networks for human pose estimation},
author={Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
booktitle={European conference on computer vision},
pages={483--499},
year={2016},
organization={Springer}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [pose_hourglass_52](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hourglass52_coco_256x256.py) | 256x256 | 0.726 | 0.896 | 0.799 | 0.780 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256_20200709.log.json) |
| [pose_hourglass_52](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hourglass52_coco_384x384.py) | 384x384 | 0.746 | 0.900 | 0.813 | 0.797 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384_20200812.log.json) |
Collections:
- Name: Hourglass
Paper:
Title: Stacked hourglass networks for human pose estimation
URL: https://link.springer.com/chapter/10.1007/978-3-319-46484-8_29
README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/backbones/hourglass.md
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hourglass52_coco_256x256.py
In Collection: Hourglass
Metadata:
Architecture: &id001
- Hourglass
Training Data: COCO
Name: topdown_heatmap_hourglass52_coco_256x256
Results:
- Dataset: COCO
Metrics:
AP: 0.726
AP@0.5: 0.896
AP@0.75: 0.799
AR: 0.78
AR@0.5: 0.934
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hourglass52_coco_384x384.py
In Collection: Hourglass
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_hourglass52_coco_384x384
Results:
- Dataset: COCO
Metrics:
AP: 0.746
AP@0.5: 0.9
AP@0.75: 0.813
AR: 0.797
AR@0.5: 0.939
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='AdamW',
lr=5e-4,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrformer_base-32815020_20220226.pth',
backbone=dict(
type='HRFormer',
in_channels=3,
norm_cfg=norm_cfg,
extra=dict(
drop_path_rate=0.2,
with_rpe=True,
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(2, ),
num_channels=(64, ),
num_heads=[2],
mlp_ratios=[4]),
stage2=dict(
num_modules=1,
num_branches=2,
block='HRFORMERBLOCK',
num_blocks=(2, 2),
num_channels=(78, 156),
num_heads=[2, 4],
mlp_ratios=[4, 4],
window_sizes=[7, 7]),
stage3=dict(
num_modules=4,
num_branches=3,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2),
num_channels=(78, 156, 312),
num_heads=[2, 4, 8],
mlp_ratios=[4, 4, 4],
window_sizes=[7, 7, 7]),
stage4=dict(
num_modules=2,
num_branches=4,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2, 2),
num_channels=(78, 156, 312, 624),
num_heads=[2, 4, 8, 16],
mlp_ratios=[4, 4, 4, 4],
window_sizes=[7, 7, 7, 7]))),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=78,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_root = 'data/coco'
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file=f'{data_root}/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# fp16 settings
fp16 = dict(loss_scale='dynamic')
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='AdamW',
lr=5e-4,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrformer_base-32815020_20220226.pth',
backbone=dict(
type='HRFormer',
in_channels=3,
norm_cfg=norm_cfg,
extra=dict(
drop_path_rate=0.3,
with_rpe=True,
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(2, ),
num_channels=(64, ),
num_heads=[2],
mlp_ratios=[4]),
stage2=dict(
num_modules=1,
num_branches=2,
block='HRFORMERBLOCK',
num_blocks=(2, 2),
num_channels=(78, 156),
num_heads=[2, 4],
mlp_ratios=[4, 4],
window_sizes=[7, 7]),
stage3=dict(
num_modules=4,
num_branches=3,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2),
num_channels=(78, 156, 312),
num_heads=[2, 4, 8],
mlp_ratios=[4, 4, 4],
window_sizes=[7, 7, 7]),
stage4=dict(
num_modules=2,
num_branches=4,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2, 2),
num_channels=(78, 156, 312, 624),
num_heads=[2, 4, 8, 16],
mlp_ratios=[4, 4, 4, 4],
window_sizes=[7, 7, 7, 7]))),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=78,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=17))
data_root = 'data/coco'
data_cfg = dict(
image_size=[288, 384],
heatmap_size=[72, 96],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file=f'{data_root}/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=3),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# fp16 settings
fp16 = dict(loss_scale='dynamic')
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://proceedings.neurips.cc/paper/2021/hash/3bbfdde8842a5c44a0323518eec97cbe-Abstract.html">HRFormer (NIPS'2021)</a></summary>
```bibtex
@article{yuan2021hrformer,
title={HRFormer: High-Resolution Vision Transformer for Dense Predict},
author={Yuan, Yuhui and Fu, Rao and Huang, Lang and Lin, Weihong and Zhang, Chao and Chen, Xilin and Wang, Jingdong},
journal={Advances in Neural Information Processing Systems},
volume={34},
year={2021}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [pose_hrformer_small](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_small_coco_256x192.py) | 256x192 | 0.738 | 0.904 | 0.811 | 0.792 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192_20220316.log.json) |
| [pose_hrformer_small](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_small_coco_384x288.py) | 384x288 | 0.757 | 0.905 | 0.824 | 0.807 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288_20220316.log.json) |
| [pose_hrformer_base](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_base_coco_256x192.py) | 256x192 | 0.753 | 0.907 | 0.826 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) |
| [pose_hrformer_base](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_base_coco_384x288.py) | 384x288 | 0.774 | 0.909 | 0.842 | 0.823 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192_20220316.log.json) |
Collections:
- Name: HRFormer
Paper:
Title: 'HRFormer: High-Resolution Vision Transformer for Dense Predict'
URL: https://proceedings.neurips.cc/paper/2021/hash/3bbfdde8842a5c44a0323518eec97cbe-Abstract.html
README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/backbones/hrformer.md
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_small_coco_256x192.py
In Collection: HRFormer
Metadata:
Architecture: &id001
- HRFormer
Training Data: COCO
Name: topdown_heatmap_hrformer_small_coco_256x192
Results:
- Dataset: COCO
Metrics:
AP: 0.738
AP@0.5: 0.904
AP@0.75: 0.811
AR: 0.792
AR@0.5: 0.941
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_256x192-5310d898_20220316.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_small_coco_384x288.py
In Collection: HRFormer
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_hrformer_small_coco_384x288
Results:
- Dataset: COCO
Metrics:
AP: 0.757
AP@0.5: 0.905
AP@0.75: 0.824
AR: 0.807
AR@0.5: 0.941
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_small_coco_384x288-98d237ed_20220316.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_base_coco_256x192.py
In Collection: HRFormer
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_hrformer_base_coco_256x192
Results:
- Dataset: COCO
Metrics:
AP: 0.753
AP@0.5: 0.907
AP@0.75: 0.826
AR: 0.807
AR@0.5: 0.943
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_256x192-6f5f1169_20220316.pth
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_base_coco_384x288.py
In Collection: HRFormer
Metadata:
Architecture: *id001
Training Data: COCO
Name: topdown_heatmap_hrformer_base_coco_384x288
Results:
- Dataset: COCO
Metrics:
AP: 0.774
AP@0.5: 0.909
AP@0.75: 0.842
AR: 0.823
AR@0.5: 0.945
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/top_down/hrformer/hrformer_base_coco_384x288-ecf0758d_20220316.pth
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='AdamW',
lr=5e-4,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrformer_small-09516375_20220226.pth',
backbone=dict(
type='HRFormer',
in_channels=3,
norm_cfg=norm_cfg,
extra=dict(
drop_path_rate=0.1,
with_rpe=True,
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(2, ),
num_channels=(64, ),
num_heads=[2],
num_mlp_ratios=[4]),
stage2=dict(
num_modules=1,
num_branches=2,
block='HRFORMERBLOCK',
num_blocks=(2, 2),
num_channels=(32, 64),
num_heads=[1, 2],
mlp_ratios=[4, 4],
window_sizes=[7, 7]),
stage3=dict(
num_modules=4,
num_branches=3,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2),
num_channels=(32, 64, 128),
num_heads=[1, 2, 4],
mlp_ratios=[4, 4, 4],
window_sizes=[7, 7, 7]),
stage4=dict(
num_modules=2,
num_branches=4,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2, 2),
num_channels=(32, 64, 128, 256),
num_heads=[1, 2, 4, 8],
mlp_ratios=[4, 4, 4, 4],
window_sizes=[7, 7, 7, 7]))),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_root = 'data/coco'
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file=f'{data_root}/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# fp16 settings
fp16 = dict(loss_scale='dynamic')
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='AdamW',
lr=5e-4,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={'relative_position_bias_table': dict(decay_mult=0.)}))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50, hooks=[
dict(type='TextLoggerHook'),
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrformer_small-09516375_20220226.pth',
backbone=dict(
type='HRFormer',
in_channels=3,
norm_cfg=norm_cfg,
extra=dict(
drop_path_rate=0.1,
with_rpe=True,
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(2, ),
num_channels=(64, ),
num_heads=[2],
num_mlp_ratios=[4]),
stage2=dict(
num_modules=1,
num_branches=2,
block='HRFORMERBLOCK',
num_blocks=(2, 2),
num_channels=(32, 64),
num_heads=[1, 2],
mlp_ratios=[4, 4],
window_sizes=[7, 7]),
stage3=dict(
num_modules=4,
num_branches=3,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2),
num_channels=(32, 64, 128),
num_heads=[1, 2, 4],
mlp_ratios=[4, 4, 4],
window_sizes=[7, 7, 7]),
stage4=dict(
num_modules=2,
num_branches=4,
block='HRFORMERBLOCK',
num_blocks=(2, 2, 2, 2),
num_channels=(32, 64, 128, 256),
num_heads=[1, 2, 4, 8],
mlp_ratios=[4, 4, 4, 4],
window_sizes=[7, 7, 7, 7]))),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_root = 'data/coco'
data_cfg = dict(
image_size=[288, 384],
heatmap_size=[72, 96],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file=f'{data_root}/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=3),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data = dict(
samples_per_gpu=32,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# fp16 settings
fp16 = dict(loss_scale='dynamic')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment