Collections:
- Name: AE
Paper:
Title: "Associative embedding: End-to-end learning for joint detection and grouping"
URL: https://arxiv.org/abs/1611.05424
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/associative_embedding.md
Models:
- Config: configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py
In Collection: AE
Metadata:
Architecture:
- AE
- HRNet
Training Data: COCO
Name: ae_hrnet-w32_8xb24-300e_coco-512x512
Results:
- Dataset: COCO
Metrics:
AP: 0.656
AP@0.5: 0.864
AP@0.75: 0.719
AR: 0.711
AR@0.5: 0.893
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=140, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=140,
milestones=[90, 120],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=160)
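# (when auto-scale LR is enabled, the optimizer LR is multiplied by
#  actual_total_batch_size / base_batch_size; this config trains with
#  8 GPUs x 20 samples each = 160, matching base_batch_size)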
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# codec settings
codec = dict(
type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
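# the coupled and decoupled heatmaps are predicted at 1/4 of the input
# resolution (512 -> 128)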
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w32-36af842e.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='CIDHead',
in_channels=480,
num_keypoints=17,
gfd_channels=32,
coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
contrastive_loss=dict(
type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
decoder=codec,
),
train_cfg=dict(max_train_instances=200),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
shift_heatmap=False,
align_corners=False))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'bottomup'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='BottomupGetHeatmapMask'),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=64,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=20,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json',
nms_thr=0.8,
score_mode='keypoint',
)
test_evaluator = val_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=140, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=140,
milestones=[90, 120],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=160)
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# codec settings
codec = dict(
type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128))
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(48, 96)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(48, 96, 192)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(48, 96, 192, 384),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w48-8ef0771d.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='CIDHead',
in_channels=720,
num_keypoints=17,
gfd_channels=48,
coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0),
decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0),
contrastive_loss=dict(
type='InfoNCELoss', temperature=0.05, loss_weight=1.0),
decoder=codec,
),
train_cfg=dict(max_train_instances=200),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
shift_heatmap=False,
align_corners=False))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'bottomup'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='BottomupGetHeatmapMask'),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=64,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=20,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json',
nms_thr=0.8,
score_mode='keypoint',
)
test_evaluator = val_evaluator
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://openaccess.thecvf.com/content/CVPR2022/html/Wang_Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_CVPR_2022_paper.html">CID (CVPR'2022)</a></summary>
```bibtex
@InProceedings{Wang_2022_CVPR,
author = {Wang, Dongkai and Zhang, Shiliang},
title = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2022},
pages = {11060-11068}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 without multi-scale test
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py) | 512x512 | 0.704 | 0.894 | 0.775 | 0.753 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_20230207.json) |
| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py) | 512x512 | 0.715 | 0.900 | 0.782 | 0.765 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_20230207.json) |
Collections:
- Name: CID
Paper:
Title: Contextual Instance Decoupling for Robust Multi-Person Pose Estimation
URL: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_CVPR_2022_paper.html
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/cid.md
Models:
- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py
In Collection: CID
Metadata:
Architecture: &id001
- CID
- HRNet
Training Data: COCO
Name: cid_hrnet-w32_8xb20-140e_coco-512x512
Results:
- Dataset: COCO
Metrics:
AP: 0.704
AP@0.5: 0.894
AP@0.75: 0.775
AR: 0.753
AR@0.5: 0.928
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth
- Config: configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py
In Collection: CID
Metadata:
Architecture: *id001
Training Data: COCO
Name: cid_hrnet-w48_8xb20-140e_coco-512x512
Results:
- Dataset: COCO
Metrics:
AP: 0.715
AP@0.5: 0.9
AP@0.75: 0.782
AR: 0.765
AR@0.5: 0.935
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth
# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression (DEKR)
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2104.02300">DEKR (CVPR'2021)</a></summary>
```bibtex
@inproceedings{geng2021bottom,
title={Bottom-up human pose estimation via disentangled keypoint regression},
author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={14676--14686},
year={2021}
}
```
</details>
DEKR is a popular 2D bottom-up pose estimation approach that simultaneously detects all the instances and regresses the offsets from each instance center to its joints.
To predict the offsets more accurately, the offsets of different joints are regressed by separate branches equipped with deformable convolutional layers, so that convolution kernels with adaptive shapes extract features dedicated to the corresponding joint (see the sketch below).
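The core idea can be illustrated with a minimal, self-contained sketch (an illustration, not mmpose's actual `DEKRHead`): one offset-regression branch per keypoint, each with its own deformable convolution so the sampling grid can adapt to that joint. The channel sizes and branch layout are assumptions for demonstration.
```python
import torch
from torch import nn
from torchvision.ops import DeformConv2d


class PerKeypointOffsetBranches(nn.Module):
    """One offset branch per keypoint; each branch's deformable conv
    learns its own sampling locations for that joint."""

    def __init__(self, in_channels=480, branch_channels=15, num_keypoints=17):
        super().__init__()
        self.branches = nn.ModuleList(
            nn.ModuleDict(dict(
                # regresses the 2 * 3 * 3 = 18 sampling offsets of a 3x3 kernel
                offset=nn.Conv2d(in_channels, 18, 3, padding=1),
                deform=DeformConv2d(in_channels, branch_channels, 3, padding=1),
                out=nn.Conv2d(branch_channels, 2, 1),  # (dx, dy) per pixel
            )) for _ in range(num_keypoints))

    def forward(self, x):
        outs = []
        for branch in self.branches:
            offsets = branch['offset'](x)         # joint-specific sampling grid
            feats = branch['deform'](x, offsets)  # joint-specific features
            outs.append(branch['out'](feats))     # displacements for this joint
        return torch.cat(outs, dim=1)             # (B, 2 * num_keypoints, H, W)


feats = torch.randn(1, 480, 128, 128)
print(PerKeypointOffsetBranches()(feats).shape)  # torch.Size([1, 34, 128, 128])
```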
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=140, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=140,
milestones=[90, 120],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=80)
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# codec settings
codec = dict(
type='SPR',
input_size=(512, 512),
heatmap_size=(128, 128),
sigma=(4, 2),
minimal_diagonal_length=32**0.5,
generate_keypoint_heatmaps=True,
decode_max_instances=30)
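# sigma holds two Gaussian sigmas: one for the instance-center (root) heatmap
# and one for the keypoint heatmaps generated alongside it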
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w32-36af842e.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='DEKRHead',
in_channels=480,
num_keypoints=17,
heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
displacement_loss=dict(
type='SoftWeightSmoothL1Loss',
use_target_weight=True,
supervise_empty=False,
beta=1 / 9,
loss_weight=0.002,
),
decoder=codec,
# This rescore net is adapted from the official repo.
# If you are not using the original COCO dataset for training,
# please make sure to remove the `rescore_cfg` item
rescore_cfg=dict(
in_channels=74,
norm_indexes=(5, 6),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/kpt_rescore_coco-33d58c5c.pth')),
),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
nms_dist_thr=0.05,
shift_heatmap=True,
align_corners=False))
# enable DDP training when rescore net is used
find_unused_parameters = True
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'bottomup'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='BottomupGetHeatmapMask'),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=32,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=10,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json',
nms_mode='none',
score_mode='keypoint',
)
test_evaluator = val_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=140, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=140,
milestones=[90, 120],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=80)
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# codec settings
codec = dict(
type='SPR',
input_size=(640, 640),
heatmap_size=(160, 160),
sigma=(4, 2),
minimal_diagonal_length=32**0.5,
generate_keypoint_heatmaps=True,
decode_max_instances=30)
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(48, 96)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(48, 96, 192)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(48, 96, 192, 384),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w48-8ef0771d.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='DEKRHead',
in_channels=720,
num_keypoints=17,
num_heatmap_filters=48,
heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
displacement_loss=dict(
type='SoftWeightSmoothL1Loss',
use_target_weight=True,
supervise_empty=False,
beta=1 / 9,
loss_weight=0.002,
),
decoder=codec,
# This rescore net is adapted from the official repo.
# If you are not using the original COCO dataset for training,
# please make sure to remove the `rescore_cfg` item
rescore_cfg=dict(
in_channels=74,
norm_indexes=(5, 6),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/kpt_rescore_coco-33d58c5c.pth')),
),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
nms_dist_thr=0.05,
shift_heatmap=True,
align_corners=False))
# enable DDP training when rescore net is used
find_unused_parameters = True
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'bottomup'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='BottomupGetHeatmapMask'),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=32,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=10,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/person_keypoints_val2017.json',
nms_mode='none',
score_mode='keypoint',
)
test_evaluator = val_evaluator
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2104.02300">DEKR (CVPR'2021)</a></summary>
```bibtex
@inproceedings{geng2021bottom,
title={Bottom-up human pose estimation via disentangled keypoint regression},
author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={14676--14686},
year={2021}
}
```
</details>
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html">HRNet (CVPR'2019)</a></summary>
```bibtex
@inproceedings{sun2019deep,
title={Deep high-resolution representation learning for human pose estimation},
author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={5693--5703},
year={2019}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 without multi-scale test
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [HRNet-w32](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py) | 512x512 | 0.686 | 0.868 | 0.750 | 0.735 | 0.898 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_20221228.json) |
| [HRNet-w48](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py) | 640x640 | 0.714 | 0.883 | 0.777 | 0.762 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_20230124.json) |
Collections:
- Name: DEKR
Paper:
Title: Bottom-up human pose estimation via disentangled keypoint regression
URL: https://arxiv.org/abs/2104.02300
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dekr.md
Models:
- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py
In Collection: DEKR
Metadata:
Architecture: &id001
- DEKR
- HRNet
Training Data: COCO
Name: dekr_hrnet-w32_8xb10-140e_coco-512x512
Results:
- Dataset: COCO
Metrics:
AP: 0.686
AP@0.5: 0.868
AP@0.75: 0.750
AR: 0.735
AR@0.5: 0.898
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth
- Config: configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py
In Collection: DEKR
Metadata:
Architecture: *id001
Training Data: COCO
Name: dekr_hrnet-w48_8xb10-140e_coco-640x640
Results:
- Dataset: COCO
Metrics:
AP: 0.714
AP@0.5: 0.883
AP@0.75: 0.777
AR: 0.762
AR@0.5: 0.915
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=300, val_interval=20)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=300,
milestones=[200, 260],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=80)
# hooks
default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
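# 'crowdpose/AP' is the metric key produced by the evaluator defined below
# (prefix='crowdpose')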
# codec settings
codec = dict(
type='SPR',
input_size=(512, 512),
heatmap_size=(128, 128),
sigma=(4, 2),
minimal_diagonal_length=32**0.5,
generate_keypoint_heatmaps=True,
decode_max_instances=30)
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w32-36af842e.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='DEKRHead',
in_channels=480,
num_keypoints=14,
heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
displacement_loss=dict(
type='SoftWeightSmoothL1Loss',
use_target_weight=True,
supervise_empty=False,
beta=1 / 9,
loss_weight=0.004,
),
decoder=codec,
# This rescore net is adapted from the official repo.
# If you are not using the original CrowdPose dataset for training,
# please make sure to remove the `rescore_cfg` item
rescore_cfg=dict(
in_channels=59,
norm_indexes=(0, 1),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')),
),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
nms_dist_thr=0.05,
shift_heatmap=True,
align_corners=False))
# enable DDP training when rescore net is used
find_unused_parameters = True
# base dataset settings
dataset_type = 'CrowdPoseDataset'
data_mode = 'bottomup'
data_root = 'data/crowdpose/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=32,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=10,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='images/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
nms_mode='none',
score_mode='keypoint',
use_area=False,
iou_type='keypoints_crowd',
prefix='crowdpose')
test_evaluator = val_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=300, val_interval=20)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=300,
milestones=[200, 260],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=40)
# hooks
default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater'))
# codec settings
codec = dict(
type='SPR',
input_size=(640, 640),
heatmap_size=(160, 160),
sigma=(4, 2),
minimal_diagonal_length=32**0.5,
generate_keypoint_heatmaps=True,
decode_max_instances=30)
# model settings
model = dict(
type='BottomupPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(48, 96)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(48, 96, 192)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(48, 96, 192, 384),
multiscale_output=True)),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w48-8ef0771d.pth'),
),
neck=dict(
type='FeatureMapProcessor',
concat=True,
),
head=dict(
type='DEKRHead',
in_channels=720,
num_keypoints=14,
num_heatmap_filters=48,
heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True),
displacement_loss=dict(
type='SoftWeightSmoothL1Loss',
use_target_weight=True,
supervise_empty=False,
beta=1 / 9,
loss_weight=0.004,
),
decoder=codec,
# This rescore net is adapted from the official repo.
# If you are not using the original CrowdPose dataset for training,
# please make sure to remove the `rescore_cfg` item
rescore_cfg=dict(
in_channels=59,
norm_indexes=(0, 1),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')),
),
test_cfg=dict(
multiscale_test=False,
flip_test=True,
nms_dist_thr=0.05,
shift_heatmap=True,
align_corners=False))
# enable DDP training when rescore net is used
find_unused_parameters = True
# base dataset settings
dataset_type = 'CrowdPoseDataset'
data_mode = 'bottomup'
data_root = 'data/crowdpose/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='BottomupRandomAffine', input_size=codec['input_size']),
dict(type='RandomFlip', direction='horizontal'),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize',
input_size=codec['input_size'],
size_factor=32,
resize_mode='expand'),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=5,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='images/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'annotations/mmpose_crowdpose_test.json',
nms_mode='none',
score_mode='keypoint',
use_area=False,
iou_type='keypoints_crowd',
prefix='crowdpose')
test_evaluator = val_evaluator
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2104.02300">DEKR (CVPR'2021)</a></summary>
```bibtex
@inproceedings{geng2021bottom,
title={Bottom-up human pose estimation via disentangled keypoint regression},
author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={14676--14686},
year={2021}
}
```
</details>
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_CVPR_2019/html/Sun_Deep_High-Resolution_Representation_Learning_for_Human_Pose_Estimation_CVPR_2019_paper.html">HRNet (CVPR'2019)</a></summary>
```bibtex
@inproceedings{sun2019deep,
title={Deep high-resolution representation learning for human pose estimation},
author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={5693--5703},
year={2019}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_CVPR_2019/html/Li_CrowdPose_Efficient_Crowded_Scenes_Pose_Estimation_and_a_New_Benchmark_CVPR_2019_paper.html">CrowdPose (CVPR'2019)</a></summary>
```bibtex
@article{li2018crowdpose,
title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
journal={arXiv preprint arXiv:1812.00324},
year={2018}
}
```
</details>
Results on CrowdPose test without multi-scale test
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AP (E) | AP (M) | AP (H) | ckpt | log |
| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
| [HRNet-w32](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py) | 512x512 | 0.663 | 0.857 | 0.714 | 0.740 | 0.671 | 0.576 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_20221228.json) |
| [HRNet-w48](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py) | 640x640 | 0.679 | 0.869 | 0.731 | 0.753 | 0.688 | 0.593 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_20230128.json) |
Models:
- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py
In Collection: DEKR
Metadata:
Architecture: &id001
- DEKR
- HRNet
Training Data: CrowdPose
Name: dekr_hrnet-w32_8xb10-300e_crowdpose-512x512
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.663
AP@0.5: 0.857
AP@0.75: 0.714
AP (E): 0.74
AP (M): 0.671
AP (H): 0.576
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512_147bae97-20221228.pth
- Config: configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py
In Collection: DEKR
Metadata:
Architecture: *id001
Training Data: CrowdPose
Name: dekr_hrnet-w48_8xb5-300e_crowdpose-640x640
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.679
AP@0.5: 0.869
AP@0.75: 0.731
AP (E): 0.753
AP (M): 0.688
AP (H): 0.593
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/pdf/2302.01593.pdf">ED-Pose (ICLR'2023)</a></summary>
```bibtex
@inproceedings{
yang2023explicit,
title={Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation},
author={Jie Yang and Ailing Zeng and Shilong Liu and Feng Li and Ruimao Zhang and Lei Zhang},
booktitle={International Conference on Learning Representations},
year={2023},
url={https://openreview.net/forum?id=s4WVupnJjmX}
}
```
</details>
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html">ResNet (CVPR'2016)</a></summary>
```bibtex
@inproceedings{he2016deep,
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={770--778},
year={2016}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017.
| Arch | BackBone | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :-------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------------------: | :-------------------------------------------: |
| [edpose_res50_coco](/configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py) | ResNet-50 | 0.716 | 0.897 | 0.783 | 0.793 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.json) |
The checkpoint is converted from the official repo. Training ED-Pose is not supported yet; it will be added in future updates.
The above config follows [Pure Python style](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta). Please install `mmengine>=0.8.2` to use this config.
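Until training is supported, a hedged usage sketch for running the converted checkpoint through the high-level `MMPoseInferencer`; the `edpose` alias comes from the model metafile below, and `person.jpg` is a placeholder input path:
```python
# assumes mmpose 1.x is installed and the 'edpose' alias is registered
from mmpose.apis import MMPoseInferencer

inferencer = MMPoseInferencer(pose2d='edpose')  # resolves config + weights
results = list(inferencer('person.jpg', show=False))  # consume the generator
```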
Collections:
- Name: ED-Pose
Paper:
Title: Explicit Box Detection Unifies End-to-End Multi-Person Pose Estimation
URL: https://arxiv.org/pdf/2302.01593.pdf
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/edpose.md
Models:
- Config: configs/body_2d_keypoint/edpose/coco/edpose_res50_8xb2-50e_coco-800x1333.py
In Collection: ED-Pose
Alias: edpose
Metadata:
Architecture: &id001
- ED-Pose
- ResNet
Training Data: COCO
Name: edpose_res50_8xb2-50e_coco-800x1333
Results:
- Dataset: COCO
Metrics:
AP: 0.716
AP@0.5: 0.897
AP@0.75: 0.783
AR: 0.793
AR@0.5: 0.943
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/edpose/coco/edpose_res50_coco_3rdparty.pth
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import read_base
with read_base():
from mmpose.configs._base_.default_runtime import * # noqa
from mmcv.transforms import RandomChoice, RandomChoiceResize
from mmengine.dataset import DefaultSampler
from mmengine.model import PretrainedInit
from mmengine.optim import LinearLR, MultiStepLR
from torch.nn import GroupNorm
from torch.optim import Adam
from mmpose.codecs import EDPoseLabel
from mmpose.datasets import (BottomupRandomChoiceResize, BottomupRandomCrop,
CocoDataset, LoadImage, PackPoseInputs,
RandomFlip)
from mmpose.evaluation import CocoMetric
from mmpose.models import (BottomupPoseEstimator, ChannelMapper, EDPoseHead,
PoseDataPreprocessor, ResNet)
from mmpose.models.utils import FrozenBatchNorm2d
# runtime
train_cfg.update(max_epochs=50, val_interval=10) # noqa
# optimizer
optim_wrapper = dict(optimizer=dict(
type=Adam,
lr=1e-3,
))
# learning policy
param_scheduler = [
dict(type=LinearLR, begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type=MultiStepLR,
begin=0,
end=140,
milestones=[33, 45],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=80)
# hooks
default_hooks.update( # noqa
checkpoint=dict(save_best='coco/AP', rule='greater'))
# codec settings
codec = dict(type=EDPoseLabel, num_select=50, num_keypoints=17)
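# num_select: number of top-scoring queries kept when decoding predictions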
# model settings
model = dict(
type=BottomupPoseEstimator,
data_preprocessor=dict(
type=PoseDataPreprocessor,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=1),
backbone=dict(
type=ResNet,
depth=50,
num_stages=4,
out_indices=(1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type=FrozenBatchNorm2d, requires_grad=False),
norm_eval=True,
style='pytorch',
init_cfg=dict(
type=PretrainedInit, checkpoint='torchvision://resnet50')),
neck=dict(
type=ChannelMapper,
in_channels=[512, 1024, 2048],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type=GroupNorm, num_groups=32),
num_outs=4),
head=dict(
type=EDPoseHead,
num_queries=900,
num_feature_levels=4,
num_keypoints=17,
as_two_stage=True,
encoder=dict(
num_layers=6,
layer_cfg=dict( # DeformableDetrTransformerEncoderLayer
self_attn_cfg=dict( # MultiScaleDeformableAttention
embed_dims=256,
num_heads=8,
num_levels=4,
num_points=4,
batch_first=True),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
ffn_drop=0.0))),
decoder=dict(
num_layers=6,
embed_dims=256,
layer_cfg=dict( # DeformableDetrTransformerDecoderLayer
self_attn_cfg=dict( # MultiheadAttention
embed_dims=256,
num_heads=8,
batch_first=True),
cross_attn_cfg=dict( # MultiScaleDeformableAttention
embed_dims=256,
batch_first=True),
ffn_cfg=dict(
embed_dims=256, feedforward_channels=2048, ffn_drop=0.1)),
query_dim=4,
num_feature_levels=4,
num_group=100,
num_dn=100,
num_box_decoder_layers=2,
return_intermediate=True),
out_head=dict(num_classes=2),
positional_encoding=dict(
num_pos_feats=128,
temperatureH=20,
temperatureW=20,
normalize=True),
denosing_cfg=dict(
dn_box_noise_scale=0.4,
dn_label_noise_ratio=0.5,
dn_labelbook_size=100,
dn_attn_mask_type_list=['match2dn', 'dn2dn', 'group2group']),
data_decoder=codec),
    test_cfg=dict(multiscale_test=False, flip_test=False, num_select=50),
train_cfg=dict())
# enable DDP training when rescore net is used
find_unused_parameters = True
# base dataset settings
dataset_type = CocoDataset
data_mode = 'bottomup'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type=LoadImage),
dict(type=RandomFlip, direction='horizontal'),
dict(
type=RandomChoice,
transforms=[
[
dict(
type=RandomChoiceResize,
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
(736, 1333), (768, 1333), (800, 1333)],
keep_ratio=True)
],
[
dict(
type=BottomupRandomChoiceResize,
                    # the aspect ratio of all images in the train dataset is < 7;
                    # follow the original implementation
scales=[(400, 4200), (500, 4200), (600, 4200)],
keep_ratio=True),
dict(
type=BottomupRandomCrop,
crop_type='absolute_range',
crop_size=(384, 600),
allow_negative_crop=True),
dict(
type=BottomupRandomChoiceResize,
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
(736, 1333), (768, 1333), (800, 1333)],
keep_ratio=True)
]
]),
dict(type=PackPoseInputs),
]
val_pipeline = [
dict(type=LoadImage),
dict(
type=BottomupRandomChoiceResize,
scales=[(800, 1333)],
keep_ratio=True,
backend='pillow'),
dict(
type=PackPoseInputs,
meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape',
'img_shape', 'input_size', 'input_center', 'input_scale',
'flip', 'flip_direction', 'flip_indices', 'raw_ann_info',
'skeleton_links'))
]
# data loaders
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type=DefaultSampler, shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=1,
num_workers=8,
persistent_workers=True,
drop_last=False,
sampler=dict(type=DefaultSampler, shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type=CocoMetric,
nms_mode='none',
score_mode='keypoint',
)
test_evaluator = val_evaluator
# Top-down integral-regression-based pose estimation
Top-down methods divide the task into two stages: object detection, followed by single-object pose estimation given the object bounding boxes. At the second stage, integral-regression-based methods use a simple integral operation that relates and unifies the heatmap representation and joint regression in a differentiable way, obtaining the keypoint coordinates from the features extracted within the bounding-box area, following the paradigm introduced in [Integral Human Pose Regression](https://arxiv.org/abs/1711.08229). The integral operation is sketched below.
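As a minimal illustration (plain PyTorch, not mmpose's codec implementation), the integral operation is a soft-argmax: softmax-normalize each heatmap into a distribution, then take the expectation of the pixel-coordinate grid under it, which keeps the whole mapping differentiable.
```python
import torch


def soft_argmax_2d(heatmaps: torch.Tensor) -> torch.Tensor:
    """Integral regression over heatmaps: (B, K, H, W) -> (B, K, 2) pixel coords."""
    b, k, h, w = heatmaps.shape
    # softmax over each H x W map turns it into a probability distribution
    probs = heatmaps.flatten(2).softmax(dim=-1).reshape(b, k, h, w)
    ys = torch.arange(h, dtype=probs.dtype).view(1, 1, h, 1)
    xs = torch.arange(w, dtype=probs.dtype).view(1, 1, 1, w)
    # expected (x, y) location under the distribution -- differentiable,
    # unlike a hard argmax over the heatmap
    x = (probs * xs).sum(dim=(2, 3))
    y = (probs * ys).sum(dim=(2, 3))
    return torch.stack([x, y], dim=-1)


coords = soft_argmax_2d(torch.randn(2, 17, 64, 64))
print(coords.shape)  # torch.Size([2, 17, 2])
```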
## Results and Models
### COCO Dataset
Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
| Model | Input Size | AP | AR | Details and Download |
| :------------------: | :--------: | :---: | :---: | :---------------------------------------------------: |
| ResNet-50+Debias-IPR | 256x256 | 0.675 | 0.765 | [resnet_debias_coco.md](./coco/resnet_debias_coco.md) |
| ResNet-50+DSNT | 256x256 | 0.674 | 0.764 | [resnet_dsnt_coco.md](./coco/resnet_dsnt_coco.md) |
| ResNet-50+IPR | 256x256 | 0.633 | 0.730 | [resnet_ipr_coco.md](./coco/resnet_ipr_coco.md) |
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=210, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=5e-4,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=train_cfg['max_epochs'],
milestones=[170, 200],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=512)
# codec settings
codec = dict(
type='IntegralRegressionLabel',
input_size=(256, 256),
heatmap_size=(64, 64),
sigma=2.0,
normalize=True)
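# targets pair coordinate labels with Gaussian heatmaps (sigma=2) rendered at
# 1/4 of the input resolution (256 -> 64)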
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='ResNet',
depth=50,
),
head=dict(
type='DSNTHead',
in_channels=2048,
in_featuremap_size=(8, 8),
num_joints=17,
loss=dict(
type='MultipleLossWrapper',
losses=[
dict(type='SmoothL1Loss', use_target_weight=True),
dict(type='KeypointMSELoss', use_target_weight=True)
]),
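        # the wrapped losses supervise the integral-regressed coordinates
        # (SmoothL1) and the intermediate heatmaps (MSE), respectively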
decoder=codec),
test_cfg=dict(
flip_test=True,
shift_coords=True,
shift_heatmap=True,
),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(type='RandomBBoxTransform'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
test_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
# data loaders
train_dataloader = dict(
batch_size=64,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=32,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=test_pipeline,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
test_evaluator = val_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=210, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=5e-4,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=train_cfg['max_epochs'],
milestones=[170, 200],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=512)
# codec settings
codec = dict(
type='IntegralRegressionLabel',
input_size=(256, 256),
heatmap_size=(64, 64),
sigma=2.0,
normalize=True)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='ResNet',
depth=50,
),
head=dict(
type='DSNTHead',
in_channels=2048,
in_featuremap_size=(8, 8),
num_joints=17,
debias=True,
beta=10.,
loss=dict(
type='MultipleLossWrapper',
losses=[
dict(type='SmoothL1Loss', use_target_weight=True),
dict(type='JSDiscretLoss', use_target_weight=True)
]),
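        # here the heatmap term is a Jensen-Shannon divergence against the
        # discrete Gaussian target, matching the debiased variant (debias=True)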
decoder=codec),
test_cfg=dict(
flip_test=True,
shift_coords=True,
shift_heatmap=True,
),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(type='RandomBBoxTransform'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
test_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
# data loaders
train_dataloader = dict(
batch_size=64,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=32,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=test_pipeline,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
test_evaluator = val_evaluator