_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=210, val_interval=10)
# optimizer
optim_wrapper = dict(optimizer=dict(
type='Adam',
lr=5e-4,
))
# learning policy
param_scheduler = [
dict(
type='LinearLR', begin=0, end=500, start_factor=0.001,
by_epoch=False), # warm-up
dict(
type='MultiStepLR',
begin=0,
end=train_cfg['max_epochs'],
milestones=[170, 200],
gamma=0.1,
by_epoch=True)
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=512)
# codec settings
codec = dict(
type='IntegralRegressionLabel',
input_size=(256, 256),
heatmap_size=(64, 64),
sigma=2.0,
normalize=True)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
type='ResNet',
depth=50,
),
head=dict(
type='DSNTHead',
in_channels=2048,
in_featuremap_size=(8, 8),
num_joints=17,
loss=dict(
type='MultipleLossWrapper',
losses=[
dict(type='SmoothL1Loss', use_target_weight=True),
dict(type='JSDiscretLoss', use_target_weight=True)
]),
decoder=codec),
test_cfg=dict(
flip_test=True,
shift_coords=True,
shift_heatmap=True,
),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/'
'pretrain_models/td-hm_res50_8xb64-210e_coco-256x192.pth'))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/coco/'
# pipelines
train_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(type='RandomBBoxTransform'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
test_pipeline = [
dict(type='LoadImage'),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
# data loaders
train_dataloader = dict(
batch_size=64,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_train2017.json',
data_prefix=dict(img='train2017/'),
pipeline=train_pipeline,
))
val_dataloader = dict(
batch_size=32,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=test_pipeline,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater'))
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=f'{data_root}annotations/person_keypoints_val2017.json')
test_evaluator = val_evaluator
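The `IntegralRegressionLabel` codec and `DSNTHead` above decode keypoints with a differentiable soft-argmax: each heatmap is normalized into a probability map and the keypoint is taken as its expected coordinate, which is why the config pairs a coordinate loss (`SmoothL1Loss`) with a distribution loss (`JSDiscretLoss`). A minimal sketch of the idea (illustrative only, not MMPose's implementation):

```python
import torch

def soft_argmax(heatmaps: torch.Tensor) -> torch.Tensor:
    """Integral regression decode: (N, K, H, W) heatmaps -> (N, K, 2) coords."""
    n, k, h, w = heatmaps.shape
    # normalize each heatmap into a probability distribution
    probs = heatmaps.flatten(2).softmax(dim=-1).reshape(n, k, h, w)
    xs = torch.arange(w, dtype=probs.dtype)
    ys = torch.arange(h, dtype=probs.dtype)
    # expected coordinate = bin index weighted by its probability
    x = (probs.sum(dim=2) * xs).sum(dim=-1)  # marginalize over rows -> E[x]
    y = (probs.sum(dim=3) * ys).sum(dim=-1)  # marginalize over cols -> E[y]
    return torch.stack([x, y], dim=-1)
```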
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf">Debias IPR (ICCV'2021)</a></summary>
```bibtex
@inproceedings{gu2021removing,
title={Removing the Bias of Integral Pose Regression},
author={Gu, Kerui and Yang, Linlin and Yao, Angela},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={11067--11076},
year={2021}
}
```
</details>
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html">ResNet (CVPR'2016)</a></summary>
```bibtex
@inproceedings{he2016deep,
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={770--778},
year={2016}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [debias-ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py) | 256x256 | 0.675 | 0.872 | 0.740 | 0.765 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.log.json) |
Collections:
- Name: DebiasIPR
Paper:
Title: Removing the Bias of Integral Pose Regression
URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/debias_ipr.md
Models:
- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py
In Collection: DebiasIPR
Metadata:
Architecture: &id001
- Debias
- ResNet
Training Data: COCO
Name: ipr_res50_debias-8xb64-210e_coco-256x256
Results:
- Dataset: COCO
Metrics:
AP: 0.675
AP@0.5: 0.872
AP@0.75: 0.74
AR: 0.765
AR@0.5: 0.928
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256-055a7699_20220913.pth
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/1801.07372v2">DSNT (2018)</a></summary>
```bibtex
@article{nibali2018numerical,
title={Numerical Coordinate Regression with Convolutional Neural Networks},
author={Nibali, Aiden and He, Zhen and Morgan, Stuart and Prendergast, Luke},
journal={arXiv preprint arXiv:1801.07372},
year={2018}
}
```
</details>
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html">ResNet (CVPR'2016)</a></summary>
```bibtex
@inproceedings{he2016deep,
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={770--778},
year={2016}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [ipr_resnet_50_dsnt](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py) | 256x256 | 0.674 | 0.870 | 0.744 | 0.764 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.log.json) |
Collections:
- Name: DSNT
Paper:
Title: Numerical Coordinate Regression with Convolutional Neural Networks
URL: https://arxiv.org/abs/1801.07372v2
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/dsnt.md
Models:
- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py
In Collection: DSNT
Metadata:
Architecture: &id001
- DSNT
- ResNet
Training Data: COCO
Name: ipr_res50_dsnt-8xb64-210e_coco-256x256
Results:
- Dataset: COCO
Metrics:
AP: 0.674
AP@0.5: 0.87
AP@0.75: 0.744
AR: 0.764
AR@0.5: 0.928
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256-441eedc0_20220913.pth
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/1711.08229">IPR (ECCV'2018)</a></summary>
```bibtex
@inproceedings{sun2018integral,
title={Integral human pose regression},
author={Sun, Xiao and Xiao, Bin and Wei, Fangyin and Liang, Shuang and Wei, Yichen},
booktitle={Proceedings of the European conference on computer vision (ECCV)},
pages={529--545},
year={2018}
}
```
</details>
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html">ResNet (CVPR'2016)</a></summary>
```bibtex
@inproceedings{he2016deep,
title={Deep residual learning for image recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={770--778},
year={2016}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017 with a detector having a human AP of 56.4 on COCO val2017
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [ipr_resnet_50](/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py) | 256x256 | 0.633 | 0.860 | 0.703 | 0.730 | 0.919 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.log.json) |
Collections:
- Name: IPR
Paper:
Title: Integral human pose regression
URL: https://arxiv.org/abs/1711.08229
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/ipr.md
Models:
- Config: configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py
In Collection: IPR
Metadata:
Architecture: &id001
- IPR
- ResNet
Training Data: COCO
Name: ipr_res50_8xb64-210e_coco-256x256
Results:
- Dataset: COCO
Metrics:
AP: 0.633
AP@0.5: 0.86
AP@0.75: 0.703
AR: 0.73
AR@0.5: 0.919
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256-a3898a33_20220913.pth
# RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2312.07526">RTMO</a></summary>
```bibtex
@misc{lu2023rtmo,
title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
year={2023},
eprint={2312.07526},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
</details>
RTMO is a one-stage pose estimation model that seamlessly integrates coordinate classification into the YOLO architecture. It introduces a Dynamic Coordinate Classifier (DCC) module that handles keypoint localization through dual 1D heatmaps. The DCC employs dynamic bin allocation, localizing the coordinate bins to each predicted bounding box to improve efficiency. It also uses learnable bin representations based on positional encodings, enabling computation of bin-keypoint similarity for precise localization.
RTMO is trained end-to-end using a multi-task loss, with losses for bounding box regression, keypoint heatmap classification via a novel MLE loss, keypoint coordinate proxy regression, and keypoint visibility classification. The MLE loss models annotation uncertainty and balances optimization between easy and hard samples.
During inference, RTMO employs grid-based dense predictions to simultaneously output human detection boxes and poses in a single pass. It selectively decodes heatmaps only for high-scoring grids after NMS, minimizing computational cost.
Compared to prior one-stage methods that regress keypoint coordinates directly, RTMO achieves higher accuracy through coordinate classification while retaining real-time speeds. It also outperforms lightweight top-down approaches for images with many people, as the latter have inference times that scale linearly with the number of human instances.
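The sketch below illustrates the coordinate-classification idea described above: bins are allocated dynamically across each predicted box, and the keypoint is the expectation of the per-axis bin distribution. All names and shapes here are illustrative assumptions, not the MMPose API:

```python
import torch

def dcc_decode(sim_x, sim_y, bboxes):
    """sim_x: (N, K, Bx), sim_y: (N, K, By) bin-keypoint similarities;
    bboxes: (N, 4) as (x1, y1, x2, y2). Returns (N, K, 2) keypoints."""
    bx, by = sim_x.shape[-1], sim_y.shape[-1]
    x1, y1, x2, y2 = bboxes.unbind(-1)
    # dynamic bin allocation: spread the coordinate bins across each box
    bins_x = x1[:, None] + (x2 - x1)[:, None] * torch.linspace(0, 1, bx)
    bins_y = y1[:, None] + (y2 - y1)[:, None] * torch.linspace(0, 1, by)
    # per-axis 1D classification, then the expected bin center per keypoint
    x = (sim_x.softmax(-1) * bins_x[:, None, :]).sum(-1)
    y = (sim_y.softmax(-1) * bins_y[:, None, :]).sum(-1)
    return torch.stack([x, y], dim=-1)
```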
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
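# Overall shape of the schedule: quadratic warm-up (epochs 0-5), cosine decay
# to 2e-4 by epoch 280, a one-epoch bump back to 5e-4, a second cosine decay
# to 2e-4 by epoch 580, then a flat tail for the final 20 epochs.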
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
# data settings
data_mode = 'bottomup'
data_root = 'data/'
# mapping
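# Each pair below is (source_index, target_index): it maps a keypoint index in
# the source dataset onto its slot in the 17-keypoint COCO layout, and is
# consumed by the KeypointConverter transforms further down.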
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=17,
mapping=[(i, i) for i in range(17)])
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
train_dataset = dict(
type='CombinedDataset',
metainfo=dict(from_file=metafile),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
test_mode=False,
pipeline=train_pipeline_stage1)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset)
# val datasets
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
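# YOLOXPoseModeSwitchHook turns off the strong Mosaic/MixUp augmentation and
# fine-tunes on COCO alone with train_pipeline_stage2 for the final 20 epochs;
# RTMOModeSwitchHook updates head attributes (e.g. loss weights) at epoch 280.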
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_dataset=dataset_coco,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 1.0
deepen_factor = 1.0
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
'_20211126_140236-d3bd2b23.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[256, 512, 1024],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=512,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=512,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=512,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-2,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
# data settings
data_mode = 'bottomup'
data_root = 'data/'
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=17,
mapping=[(i, i) for i in range(17)])
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
train_dataset = dict(
type='CombinedDataset',
metainfo=dict(from_file=metafile),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
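    # relative sampling ratios applied to each dataset listed above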
sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
test_mode=False,
pipeline=train_pipeline_stage1)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset)
# val datasets
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_dataset=dataset_coco,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
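    # EMAHook maintains an exponential moving average of the model weights;
    # the averaged weights are used for validation and stored in checkpoints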
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.75
deepen_factor = 0.67
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/v1/'
'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[192, 384, 768],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=384,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=384,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=384,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-2,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
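# dynamic_intervals=[(580, 1)] switches validation from every 20 epochs to
# every epoch once training reaches epoch 580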
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
# data settings
data_mode = 'bottomup'
data_root = 'data/'
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=17,
mapping=[(i, i) for i in range(17)])
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
train_dataset = dict(
type='CombinedDataset',
metainfo=dict(from_file=metafile),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
test_mode=False,
pipeline=train_pipeline_stage1)
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset)
# val datasets
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_dataset=dataset_coco,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.5
deepen_factor = 0.33
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
'20211121_095711-4592a793.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[128, 256, 512],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=256,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile),
use_keypoints_for_center=True),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=256,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1.0,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (416, 416)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(416, 416),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(416, 416),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(416, 416),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
# data settings
data_mode = 'bottomup'
data_root = 'data/'
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=17,
mapping=[(i, i) for i in range(17)])
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
train_dataset = dict(
type='CombinedDataset',
metainfo=dict(from_file=metafile),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
test_mode=False,
pipeline=train_pipeline_stage1)
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset)
# val datasets
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_dataset=dataset_coco,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.375
deepen_factor = 0.33
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
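            # multi-scale training: each batch is resized to a random size in
            # [320, 640] divisible by 32, synchronized across GPUs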
dict(
type='BatchSyncRandomResize',
random_size_range=(320, 640),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_'
'20211124_171234-b4047906.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[96, 192, 384],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=192,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=192,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile),
use_keypoints_for_center=True),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=192,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1.0,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2312.07526">RTMO</a></summary>
```bibtex
@misc{lu2023rtmo,
title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
year={2023},
eprint={2312.07526},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/1711.06475">AI Challenger (ArXiv'2017)</a></summary>
```bibtex
@article{wu2017ai,
title={Ai challenger: A large-scale dataset for going deeper in image understanding},
author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others},
journal={arXiv preprint arXiv:1711.06475},
year={2017}
}
```
</details>
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_CVPR_2019/html/Li_CrowdPose_Efficient_Crowded_Scenes_Pose_Estimation_and_a_New_Benchmark_CVPR_2019_paper.html">CrowdPose (CVPR'2019)</a></summary>
```bibtex
@article{li2018crowdpose,
title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
journal={arXiv preprint arXiv:1812.00324},
year={2018}
}
```
</details>
<details>
<summary align="right"><a href="https://www.cv-foundation.org/openaccess/content_iccv_2013/html/Jhuang_Towards_Understanding_Action_2013_ICCV_paper.html">JHMDB (ICCV'2013)</a></summary>
```bibtex
@inproceedings{Jhuang:ICCV:2013,
title = {Towards understanding action recognition},
author = {H. Jhuang and J. Gall and S. Zuffi and C. Schmid and M. J. Black},
booktitle = {International Conf. on Computer Vision (ICCV)},
month = Dec,
pages = {3192-3199},
year = {2013}
}
```
</details>
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2014/html/Andriluka_2D_Human_Pose_2014_CVPR_paper.html">MPII (CVPR'2014)</a></summary>
```bibtex
@inproceedings{andriluka14cvpr,
author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Bernt Schiele},
title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2014},
month = {June}
}
```
</details>
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_cvpr_2018/html/Andriluka_PoseTrack_A_Benchmark_CVPR_2018_paper.html">PoseTrack18 (CVPR'2018)</a></summary>
```bibtex
@inproceedings{andriluka2018posetrack,
title={Posetrack: A benchmark for human pose estimation and tracking},
author={Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={5167--5176},
year={2018}
}
```
</details>
<details>
<summary align="right"><a href="https://arxiv.org/abs/2004.00945">Halpe (CVPR'2020)</a></summary>
```bibtex
@inproceedings{li2020pastanet,
title={PaStaNet: Toward Human Activity Knowledge Engine},
author={Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu and Ma, Ze and Chen, Mingyang and Lu, Cewu},
booktitle={CVPR},
year={2020}
}
```
</details>
Results on COCO val2017
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log | onnx |
| :--------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :--------------------------------: | :-------------------------------: | :--------------------------------: |
| [RTMO-t](/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py) | 416x416 | 0.574 | 0.803 | 0.613 | 0.611 | 0.836 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416_20231219.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.zip) |
| [RTMO-s](/configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py) | 640x640 | 0.686 | 0.879 | 0.744 | 0.723 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.zip) |
| [RTMO-m](/configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py) | 640x640 | 0.726 | 0.899 | 0.790 | 0.763 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.zip) |
| [RTMO-l](/configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py) | 640x640 | 0.748 | 0.911 | 0.813 | 0.786 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640_20231211.json) | [onnx](https://download.openmmlab.com/mmpose/v1/projects/rtmo/onnx_sdk/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.zip) |
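To try a checkpoint from the table, the high-level MMPose API can run bottom-up inference directly from a config/checkpoint pair. Below is a minimal sketch, assuming a recent MMPose 1.x installation run from the repository root; `demo.jpg` is a placeholder image path.

```python
# Minimal RTMO inference sketch; the config path and checkpoint URL are
# taken from the table above, and 'demo.jpg' is a placeholder image.
from mmpose.apis import inference_bottomup, init_model

config = 'configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py'
checkpoint = ('https://download.openmmlab.com/mmpose/v1/projects/rtmo/'
              'rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth')

model = init_model(config, checkpoint, device='cpu')
results = inference_bottomup(model, 'demo.jpg')  # one PoseDataSample per image
keypoints = results[0].pred_instances.keypoints  # (num_people, 17, 2)
```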
Models:
- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416x416.py
In Collection: RTMO
Metadata:
Architecture: &id001
- RTMO
Training Data: &id002
- AI Challenger
- COCO
- CrowdPose
- MPII
- sub-JHMDB
- Halpe
- PoseTrack18
Name: rtmo-t_8xb32-600e_body7-416x416
Results:
- Dataset: COCO
Metrics:
AP: 0.574
AP@0.5: 0.803
AP@0.75: 0.613
AR: 0.611
AR@0.5: 0.836
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-t_8xb32-600e_body7-416x416-f48f75cb_20231219.pth
- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-s_8xb32-600e_body7-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
Training Data: *id002
Name: rtmo-s_8xb32-600e_body7-640x640
Results:
- Dataset: COCO
Metrics:
AP: 0.686
AP@0.5: 0.879
AP@0.75: 0.744
AR: 0.723
AR@0.5: 0.908
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_body7-640x640-dac2bf74_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-m_16xb16-600e_body7-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
Training Data: *id002
Name: rtmo-m_16xb16-600e_body7-640x640
Results:
- Dataset: COCO
Metrics:
AP: 0.726
AP@0.5: 0.899
AP@0.75: 0.790
AR: 0.763
AR@0.5: 0.926
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_body7-640x640-39e78cc4_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/body7/rtmo-l_16xb16-600e_body7-640x640.py
In Collection: RTMO
Alias: rtmo
Metadata:
Architecture: *id001
Training Data: *id002
Name: rtmo-l_16xb16-600e_body7-640x640
Results:
- Dataset: COCO
Metrics:
AP: 0.748
AP@0.5: 0.911
AP@0.75: 0.813
AR: 0.786
AR@0.5: 0.939
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_body7-640x640-b37118ce_20231211.pth
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
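        # train the transformer encoder in the neck at 5% of the base LR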
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
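# LR schedule: 5-epoch quadratic warm-up, cosine decay from 4e-3 to 2e-4 by
# epoch 280, a one-epoch constant step lifting the LR back to 5e-4 as the
# second training stage begins, a second cosine decay to 2e-4 by epoch 580,
# then a flat tail for the final 20 epochs.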
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
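# stage-1 pipeline: strong YOLOX-style augmentation (Mosaic, random affine,
# MixUp, HSV jitter); YOLOXPoseModeSwitchHook below swaps in the lighter
# stage-2 pipeline for the final 20 epochs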
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_coco)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
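# YOLOXPoseModeSwitchHook disables Mosaic/MixUp by switching to the stage-2
# pipeline for the last 20 epochs; RTMOModeSwitchHook enables the coordinate
# classifier's proxy targets and re-weights the losses at epoch 280;
# SyncNormHook synchronizes norm statistics across GPUs; EMAHook evaluates
# with an exponential moving average of the weights.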
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 1.0
deepen_factor = 1.0
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
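        # a = sqrt(5), matching PyTorch's default kaiming_uniform_ for convs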
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
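        # YOLOX-style multi-scale training: each batch is resized to a
        # random multiple of 32 within [480, 800], synchronized across GPUs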
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
'_20211126_140236-d3bd2b23.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[256, 512, 1024],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=512,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=512,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=512,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-2,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_coco)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.75
deepen_factor = 0.67
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/v1/'
'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[192, 384, 768],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=384,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=384,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=384,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-2,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=600, val_interval=20, dynamic_intervals=[(580, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=280,
end=280,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=280, end=281),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=281,
T_max=300,
end=580,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=580, end=600),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_coco)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
score_mode='bbox',
nms_mode='none',
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=20,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
280: {
'proxy_target_cc': True,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.5
deepen_factor = 0.33
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
'20211121_095711-4592a793.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[128, 256, 512],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=17,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=256,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile),
use_keypoints_for_center=True),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=256,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1.0,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2312.07526">RTMO (ArXiv'2023)</a></summary>
```bibtex
@misc{lu2023rtmo,
title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
year={2023},
eprint={2312.07526},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
Results on COCO val2017
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: |
| [RTMO-s](/configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py) | 640x640 | 0.677 | 0.878 | 0.737 | 0.715 | 0.908 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640_20231211.json) |
| [RTMO-m](/configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py) | 640x640 | 0.709 | 0.890 | 0.778 | 0.747 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640_20231211.json) |
| [RTMO-l](/configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py) | 640x640 | 0.724 | 0.899 | 0.788 | 0.762 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640_20231211.json) |
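Any of these rows can be re-evaluated on val2017 straight from its config. Below is a minimal sketch using MMEngine's `Runner`, assuming the data layout configured above (COCO under `data/coco/`); `work_dirs/rtmo_eval` is an arbitrary output directory.

```python
# Minimal evaluation sketch; the config path and checkpoint URL come from
# the table above, and the work_dir is arbitrary.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py')
cfg.load_from = ('https://download.openmmlab.com/mmpose/v1/projects/rtmo/'
                 'rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth')
cfg.work_dir = 'work_dirs/rtmo_eval'

runner = Runner.from_cfg(cfg)
runner.test()  # runs the CocoMetric evaluation on COCO val2017
```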
Collections:
- Name: RTMO
Paper:
Title: 'RTMO: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation'
URL: https://arxiv.org/abs/2312.07526
README: https://github.com/open-mmlab/mmpose/blob/main/docs/src/papers/algorithms/rtmo.md
Models:
- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-s_8xb32-600e_coco-640x640.py
In Collection: RTMO
Metadata:
Architecture: &id001
- RTMO
    Training Data: COCO
  Name: rtmo-s_8xb32-600e_coco-640x640
  Results:
  - Dataset: COCO
Metrics:
      AP: 0.677
AP@0.5: 0.878
AP@0.75: 0.737
AR: 0.715
AR@0.5: 0.908
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-m_16xb16-600e_coco-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
    Training Data: COCO
  Name: rtmo-m_16xb16-600e_coco-640x640
  Results:
  - Dataset: COCO
Metrics:
AP: 0.709
AP@0.5: 0.890
AP@0.75: 0.778
AR: 0.747
AR@0.5: 0.920
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-600e_coco-640x640-6f4e0306_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/coco/rtmo-l_16xb16-600e_coco-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
    Training Data: COCO
  Name: rtmo-l_16xb16-600e_coco-640x640
  Results:
  - Dataset: COCO
Metrics:
AP: 0.724
AP@0.5: 0.899
AP@0.75: 0.788
AR: 0.762
AR@0.5: 0.927
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-600e_coco-640x640-516a421f_20231211.pth
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=350,
end=349,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=350,
T_max=320,
end=670,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/crowdpose.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.1,
rotate_factor=10,
scale_factor=(0.75, 1.0),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.8, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
# data settings
data_mode = 'bottomup'
data_root = 'data/'
# mapping
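# each list below is consumed by KeypointConverter as (source_index,
# target_index) pairs that re-order a dataset's keypoints into the
# 14-keypoint CrowdPose layout; source keypoints without a pair are dropped
# and unmapped target keypoints are marked invisible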
aic_crowdpose = [(3, 0), (0, 1), (4, 2), (1, 3), (5, 4), (2, 5),
(9, 6), (6, 7), (10, 8), (7, 9), (11, 10), (8, 11), (12, 12),
(13, 13)]
coco_crowdpose = [
(5, 0),
(6, 1),
(7, 2),
(8, 3),
(9, 4),
(10, 5),
(11, 6),
(12, 7),
(13, 8),
(14, 9),
(15, 10),
(16, 11),
]
mpii_crowdpose = [
(13, 0),
(12, 1),
(14, 2),
(11, 3),
(15, 4),
(10, 5),
(3, 6),
(2, 7),
(4, 8),
(1, 9),
(5, 10),
(0, 11),
(9, 12),
(7, 13),
]
jhmdb_crowdpose = [(4, 0), (3, 1), (8, 2), (7, 3), (12, 4), (11, 5), (6, 6),
(5, 7), (10, 8), (9, 9), (14, 10), (13, 11), (2, 12),
(0, 13)]
halpe_crowdpose = [
(5, 0),
(6, 1),
(7, 2),
(8, 3),
(9, 4),
(10, 5),
(11, 6),
(12, 7),
(13, 8),
(14, 9),
(15, 10),
(16, 11),
]
posetrack_crowdpose = [
(5, 0),
(6, 1),
(7, 2),
(8, 3),
(9, 4),
(10, 5),
(11, 6),
(12, 7),
(13, 8),
(14, 9),
(15, 10),
(16, 11),
(2, 12),
(1, 13),
]
# train datasets
dataset_coco = dict(
type='CocoDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=14, mapping=coco_crowdpose)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=14, mapping=aic_crowdpose)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=14,
mapping=[(i, i) for i in range(14)])
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=14, mapping=mpii_crowdpose)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=14,
mapping=jhmdb_crowdpose)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=14,
mapping=halpe_crowdpose)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=14,
mapping=posetrack_crowdpose)
],
)
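# CombinedDataset merges the seven sources under the CrowdPose keypoint
# metainfo; sample_ratio_factor sub-samples the auxiliary datasets (AIC,
# MPII, JHMDB, Halpe, PoseTrack18) relative to COCO and CrowdPose each epoch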
train_dataset_stage1 = dict(
type='CombinedDataset',
metainfo=dict(from_file=metafile),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
sample_ratio_factor=[1, 0.3, 1, 0.3, 0.3, 0.4, 0.3],
test_mode=False,
pipeline=train_pipeline_stage1)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset_stage1)
# val datasets
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
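# CrowdPose evaluation uses the crowd-aware keypoint metric
# ('keypoints_crowd', no instance areas), which also reports AP on the
# easy/medium/hard splits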
val_evaluator = dict(
type='CocoMetric',
score_mode='bbox',
nms_mode='none',
iou_type='keypoints_crowd',
prefix='crowdpose',
use_area=False,
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=30,
new_train_dataset=dataset_crowdpose,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
350: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 1.0
deepen_factor = 1.0
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
'_20211126_140236-d3bd2b23.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[256, 512, 1024],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=512,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=14,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=512,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=512,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-3,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))