Commit ca8a762a authored by chenzk (v1.0)
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
auto_scale_lr = dict(base_batch_size=256)
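# Note (hedged): when auto LR scaling is enabled (e.g. via `--auto-scale-lr` in the
# MMPose train script), MMEngine rescales the optimizer lr linearly by
# (actual total batch size / base_batch_size); with the reference setup of
# 16 GPUs x 16 images per GPU = 256, the lr below stays at 0.004.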
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=350,
end=349,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=350,
T_max=320,
end=670,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
]
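# Schedule overview (reading the stages above): quadratic warmup for the first 5
# epochs, cosine decay from lr=4e-3 down to 2e-4 until epoch 349, a one-epoch
# ConstantLR bump back to 5e-4, a second cosine decay to 2e-4 by epoch 670, and
# a constant lr over the final 30 epochs while the stage-2 pipeline is active.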
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/crowdpose.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.2,
rotate_factor=30,
scale_factor=(0.5, 1.5),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.6, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_crowdpose)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
score_mode='bbox',
nms_mode='none',
iou_type='keypoints_crowd',
prefix='crowdpose',
use_area=False,
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=30,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
350: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
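# Hook summary: YOLOXPoseModeSwitchHook swaps in train_pipeline_stage2 (no
# Mosaic/MixUp) for the last 30 epochs; RTMOModeSwitchHook turns on
# proxy_target_cc and rescales the loss weights at epoch 350; SyncNormHook
# synchronizes normalization statistics across GPUs, and EMAHook maintains an
# exponential-momentum moving average of the model weights during training.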
# model
widen_factor = 1.0
deepen_factor = 1.0
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco'
'_20211126_140236-d3bd2b23.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[256, 512, 1024],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=512,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=14,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=512,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=512,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-3,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=350,
end=349,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=350,
T_max=320,
end=670,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/crowdpose.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.2,
rotate_factor=30,
scale_factor=(0.5, 1.5),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.6, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_crowdpose)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
score_mode='bbox',
nms_mode='none',
iou_type='keypoints_crowd',
prefix='crowdpose',
use_area=False,
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=30,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
350: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.75
deepen_factor = 0.67
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmpose/v1/'
'pretrained_models/yolox_m_8x8_300e_coco_20230829.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[192, 384, 768],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=384,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=14,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=384,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=384,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-3,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
_base_ = ['../../../_base_/default_runtime.py']
# runtime
train_cfg = dict(max_epochs=700, val_interval=50, dynamic_intervals=[(670, 1)])
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))
optim_wrapper = dict(
type='OptimWrapper',
constructor='ForceDefaultOptimWrapperConstructor',
optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0,
bias_decay_mult=0,
bypass_duplicate=True,
force_default_settings=True,
custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
clip_grad=dict(max_norm=0.1, norm_type=2))
param_scheduler = [
dict(
type='QuadraticWarmupLR',
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=5,
T_max=350,
end=349,
by_epoch=True,
convert_to_iter_based=True),
# this scheduler is used to increase the lr from 2e-4 to 5e-4
dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
dict(
type='CosineAnnealingLR',
eta_min=0.0002,
begin=350,
T_max=320,
end=670,
by_epoch=True,
convert_to_iter_based=True),
dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
]
# data
input_size = (640, 640)
metafile = 'configs/_base_/datasets/crowdpose.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)
train_pipeline_stage1 = [
dict(type='LoadImage', backend_args=None),
dict(
type='Mosaic',
img_scale=(640, 640),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_factor=0.2,
rotate_factor=30,
scale_factor=(0.5, 1.5),
pad_val=114,
distribution='uniform',
transform_mode='perspective',
bbox_keep_corner=False,
clip_border=True,
),
dict(
type='YOLOXMixUp',
img_scale=(640, 640),
ratio_range=(0.6, 1.6),
pad_val=114.0,
pre_transform=[dict(type='LoadImage', backend_args=None)]),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
train_pipeline_stage2 = [
dict(type='LoadImage'),
dict(
type='BottomupRandomAffine',
input_size=(640, 640),
shift_prob=0,
rotate_prob=0,
scale_prob=0,
scale_type='long',
pad_val=(114, 114, 114),
bbox_keep_corner=False,
clip_border=True,
),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip'),
dict(type='BottomupGetHeatmapMask', get_invalid=True),
dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs'),
]
data_mode = 'bottomup'
data_root = 'data/'
# train datasets
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=train_pipeline_stage1,
)
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dataset_crowdpose)
val_pipeline = [
dict(type='LoadImage'),
dict(
type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
dict(
type='PackPoseInputs',
meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
'input_size', 'input_center', 'input_scale'))
]
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
pin_memory=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = val_dataloader
# evaluators
val_evaluator = dict(
type='CocoMetric',
score_mode='bbox',
nms_mode='none',
iou_type='keypoints_crowd',
prefix='crowdpose',
use_area=False,
)
test_evaluator = val_evaluator
# hooks
custom_hooks = [
dict(
type='YOLOXPoseModeSwitchHook',
num_last_epochs=30,
new_train_pipeline=train_pipeline_stage2,
priority=48),
dict(
type='RTMOModeSwitchHook',
epoch_attributes={
350: {
'proxy_target_cc': True,
'overlaps_power': 1.0,
'loss_cls.loss_weight': 2.0,
'loss_mle.loss_weight': 5.0,
'loss_oks.loss_weight': 10.0
},
},
priority=48),
dict(type='SyncNormHook', priority=48),
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
strict_load=False,
priority=49),
]
# model
widen_factor = 0.5
deepen_factor = 0.33
model = dict(
type='BottomupPoseEstimator',
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=2.23606797749979,
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu'),
data_preprocessor=dict(
type='PoseDataPreprocessor',
pad_size_divisor=32,
mean=[0, 0, 0],
std=[1, 1, 1],
batch_augments=[
dict(
type='BatchSyncRandomResize',
random_size_range=(480, 800),
size_divisor=32,
interval=1),
]),
backbone=dict(
type='CSPDarknet',
deepen_factor=deepen_factor,
widen_factor=widen_factor,
out_indices=(2, 3, 4),
spp_kernal_sizes=(5, 9, 13),
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=dict(
type='Pretrained',
checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
'20211121_095711-4592a793.pth',
prefix='backbone.',
)),
neck=dict(
type='HybridEncoder',
in_channels=[128, 256, 512],
deepen_factor=deepen_factor,
widen_factor=widen_factor,
hidden_dim=256,
output_indices=[1, 2],
encoder_cfg=dict(
self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=1024,
ffn_drop=0.0,
act_cfg=dict(type='GELU'))),
projector=dict(
type='ChannelMapper',
in_channels=[256, 256],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='BN'),
num_outs=2)),
head=dict(
type='RTMOHead',
num_keypoints=14,
featmap_strides=(16, 32),
head_module_cfg=dict(
num_classes=1,
in_channels=256,
cls_feat_channels=256,
channels_per_group=36,
pose_vec_channels=256,
widen_factor=widen_factor,
stacked_convs=2,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')),
assigner=dict(
type='SimOTAAssigner',
dynamic_k_indicator='oks',
oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
prior_generator=dict(
type='MlvlPointGenerator',
centralize_points=True,
strides=[16, 32]),
dcc_cfg=dict(
in_channels=256,
feat_channels=128,
num_bins=(192, 256),
spe_channels=128,
gau_cfg=dict(
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
pos_enc='add')),
overlaps_power=0.5,
loss_cls=dict(
type='VariFocalLoss',
reduction='sum',
use_target_weight=True,
loss_weight=1.0),
loss_bbox=dict(
type='IoULoss',
mode='square',
eps=1e-16,
reduction='sum',
loss_weight=5.0),
loss_oks=dict(
type='OKSLoss',
reduction='none',
metainfo=metafile,
loss_weight=30.0),
loss_vis=dict(
type='BCELoss',
use_target_weight=True,
reduction='mean',
loss_weight=1.0),
loss_mle=dict(
type='MLECCLoss',
use_target_weight=True,
loss_weight=1e-3,
),
loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
),
test_cfg=dict(
input_size=input_size,
score_thr=0.1,
nms_thr=0.65,
))
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2312.07526">RTMO</a></summary>
```bibtex
@misc{lu2023rtmo,
title={{RTMO}: Towards High-Performance One-Stage Real-Time Multi-Person Pose Estimation},
author={Peng Lu and Tao Jiang and Yining Li and Xiangtai Li and Kai Chen and Wenming Yang},
year={2023},
eprint={2312.07526},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="http://openaccess.thecvf.com/content_CVPR_2019/html/Li_CrowdPose_Efficient_Crowded_Scenes_Pose_Estimation_and_a_New_Benchmark_CVPR_2019_paper.html">CrowdPose (CVPR'2019)</a></summary>
```bibtex
@article{li2018crowdpose,
title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark},
author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu},
journal={arXiv preprint arXiv:1812.00324},
year={2018}
}
```
</details>
Results on CrowdPose test dataset
| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AP (E) | AP (M) | AP (H) | ckpt | log |
| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: |
| [RTMO-s](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py) | 640x640 | 0.673 | 0.882 | 0.729 | 0.737 | 0.682 | 0.591 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640_20231211.json) |
| [RTMO-m](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.711 | 0.897 | 0.771 | 0.774 | 0.719 | 0.634 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-m_16xb16-700e_crowdpose-640x640_20231211.json) |
| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py) | 640x640 | 0.732 | 0.907 | 0.793 | 0.792 | 0.741 | 0.653 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640_20231211.json) |
| [RTMO-l](/configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py)\* | 640x640 | 0.838 | 0.947 | 0.893 | 0.888 | 0.847 | 0.772 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640_20231219.json) |
\* indicates the model is trained using a combined dataset composed of AI Challenger, COCO, CrowdPose, Halpe, MPII, PoseTrack18 and sub-JHMDB.
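The configs listed above follow the standard MMPose/MMEngine workflow. As a minimal sketch (assuming an MMPose checkout with the data layout used by these configs), a config can be loaded and run programmatically through MMEngine's `Runner`; the work directory below is a hypothetical choice:

```python
from mmengine.config import Config
from mmengine.runner import Runner

# Load one of the CrowdPose configs above (path relative to the MMPose repo root).
cfg = Config.fromfile(
    'configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py')
cfg.work_dir = 'work_dirs/rtmo-s_crowdpose'  # hypothetical output directory

runner = Runner.from_cfg(cfg)
runner.train()  # or runner.test() with cfg.load_from set to a checkpoint for evaluation
```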
Models:
- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-s_8xb32-700e_crowdpose-640x640.py
In Collection: RTMO
Metadata:
Architecture: &id001
- RTMO
Training Data: CrowdPose
Name: rtmo-s_8xb32-700e_crowdpose-640x640
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.673
AP@0.5: 0.882
AP@0.75: 0.729
AP (E): 0.737
AP (M): 0.682
AP (H): 0.591
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-700e_crowdpose-640x640-79f81c0d_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-m_16xb16-700e_crowdpose-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
Training Data: CrowdPose
Name: rtmo-m_16xb16-700e_crowdpose-640x640
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.711
AP@0.5: 0.897
AP@0.75: 0.771
AP (E): 0.774
AP (M): 0.719
AP (H): 0.634
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rrtmo-m_16xb16-700e_crowdpose-640x640-0eaf670d_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_crowdpose-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
Training Data: CrowdPose
Name: rtmo-l_16xb16-700e_crowdpose-640x640
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.732
AP@0.5: 0.907
AP@0.75: 0.793
AP (E): 0.792
AP (M): 0.741
AP (H): 0.653
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_crowdpose-640x640-1008211f_20231211.pth
- Config: configs/body_2d_keypoint/rtmo/crowdpose/rtmo-l_16xb16-700e_body7-crowdpose-640x640.py
In Collection: RTMO
Metadata:
Architecture: *id001
Training Data: CrowdPose
Name: rtmo-l_16xb16-700e_body7-crowdpose-640x640
Results:
- Dataset: CrowdPose
Metrics:
AP: 0.838
AP@0.5: 0.947
AP@0.75: 0.893
AP (E): 0.888
AP (M): 0.847
AP (H): 0.772
Task: Body 2D Keypoint
Weights: https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-l_16xb16-700e_body7-crowdpose-640x640-5bafdc11_20231219.pth
# RTMPose
Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet their application in the industrial community still suffers from heavy model parameters and high latency.
In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose.
Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries.
To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deployment on mobile devices.
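For a quick look at the models below, MMPose ships a high-level inferencer. The snippet is only a sketch: the alias passed to `pose2d` and the image path are illustrative, and a config file path plus `pose2d_weights` checkpoint URL can be used instead.

```python
from mmpose.apis import MMPoseInferencer

# 'rtmpose-m' is an illustrative model alias and 'demo.jpg' a local image path;
# substitute any RTMPose config path and checkpoint as needed.
inferencer = MMPoseInferencer(pose2d='rtmpose-m')
result = next(inferencer('demo.jpg'))  # yields predictions for one image
```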
## Results and Models
### COCO Dataset
Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset
| Model | Input Size | AP | AR | Details and Download |
| :----------------: | :--------: | :---: | :---: | :---------------------------------------: |
| RTMPose-t | 256x192 | 0.682 | 0.736 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-s | 256x192 | 0.716 | 0.768 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-m | 256x192 | 0.746 | 0.795 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-l | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-t-aic-coco | 256x192 | 0.685 | 0.738 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-s-aic-coco | 256x192 | 0.722 | 0.772 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-m-aic-coco | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-l-aic-coco | 256x192 | 0.765 | 0.813 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-m-aic-coco | 384x288 | 0.770 | 0.816 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
| RTMPose-l-aic-coco | 384x288 | 0.773 | 0.819 | [rtmpose_coco.md](./coco/rtmpose_coco.md) |
### MPII Dataset
| Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download |
| :-------: | :--------: | :------: | :------: | :---------------------------------------: |
| RTMPose-m | 256x256 | 0.907 | 0.348 | [rtmpose_mpii.md](./mpii/rtmpose_mpii.md) |
### CrowdPose Dataset
Results on CrowdPose test with a [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector
| Model | Input Size | AP | AR | Details and Download |
| :-------: | :--------: | :---: | :---: | :------------------------------------------------------: |
| RTMPose-m | 256x192 | 0.706 | 0.788 | [rtmpose_crowdpose.md](./crowdpose/rtmpose_crowdpose.md) |
### Human-Art Dataset
Results on Human-Art validation dataset with a detector having human AP of 56.2 on the Human-Art validation dataset
| Model | Input Size | AP | AR | Details and Download |
| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
| RTMPose-s | 256x192 | 0.311 | 0.381 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
| RTMPose-m | 256x192 | 0.355 | 0.417 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
| RTMPose-l | 256x192 | 0.378 | 0.442 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
Results on Human-Art validation dataset with ground-truth bounding boxes
| Model | Input Size | AP | AR | Details and Download |
| :-------: | :--------: | :---: | :---: | :---------------------------------------------------: |
| RTMPose-s | 256x192 | 0.698 | 0.732 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
| RTMPose-m | 256x192 | 0.728 | 0.759 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
| RTMPose-l | 256x192 | 0.753 | 0.783 | [rtmpose_humanart.md](./humanart/rtmpose_humanart.md) |
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(192, 256),
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
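# With input_size=(192, 256) and simcc_split_ratio=2.0, the SimCC head
# classifies each keypoint over 192 * 2 = 384 horizontal and 256 * 2 = 512
# vertical bins; sigma sets the width of the Gaussian label smoothing per axis.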
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=1.,
widen_factor=1.,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-l_udp-body7_210e-256x192-5e9558ef_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=1024,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
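# Each pair below is (source-dataset keypoint index, COCO keypoint index); the
# KeypointConverter transforms in the per-dataset pipelines use these tables to
# remap annotations from every source dataset onto the 17-keypoint COCO skeleton.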
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
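# PipelineSwitchHook swaps train_pipeline for train_pipeline_stage2 at epoch
# 400 (max_epochs - stage2_num_epochs), dropping PhotometricDistortion and
# lowering the CoarseDropout probability for the final 20 epochs; EMAHook keeps
# an exponential moving average of the weights throughout training.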
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE'),
]
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(288, 384),
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=1.,
widen_factor=1.,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-l_udp-body7_210e-384x288-b15bc30d_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=1024,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE'),
]
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (192, 256)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 512
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=1.,
widen_factor=1.,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=1024,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
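# Pairs are (source keypoint index, Halpe-26 keypoint index); every source
# dataset is remapped onto the 26-keypoint Halpe skeleton (COCO body points
# plus head/neck/hip and six foot keypoints) used by this config.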
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=5,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
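# EMAHook keeps an exponential-moving-average copy of the weights
# (ExpMomentumEMA, momentum=0.0002) that is swapped in for validation and
# checkpointing; PipelineSwitchHook replaces the training pipeline with the
# milder train_pipeline_stage2 for the last stage2_num_epochs epochs.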
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
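# PCKAccuracy with thr=0.1 counts a prediction as correct when its distance to
# the ground truth is within 10% of the normalization size (bounding-box based
# by default); AUC summarizes PCK accuracy over a sweep of thresholds.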
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (288, 384)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 512
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
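# Weight decay is disabled for normalization layers and bias terms
# (norm_decay_mult=0, bias_decay_mult=0), and gradients are clipped so their
# global L2 norm does not exceed 35.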
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
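# Rough schedule for this config: linear warmup over the first 1000 iterations
# up to base_lr (4e-3), a flat phase until epoch max_epochs // 2 = 350, then
# cosine annealing lr(t) = eta_min + 0.5 * (base_lr - eta_min) * (1 + cos(pi * t / T_max))
# down to eta_min = base_lr * 0.05 = 2e-4 by epoch 700.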
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
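# When launched with --auto-scale-lr, the runner scales the learning rate
# linearly by (actual total batch size / base_batch_size); e.g. 2 GPUs at
# train_batch_size=512 each give 1024 / 1024 = 1.0, leaving lr at 4e-3.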
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
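# SimCCLabel turns each keypoint coordinate into 1-D classification targets:
# with input_size=(288, 384) (width, height) and simcc_split_ratio=2.0 the head
# predicts 288 * 2 = 576 bins along x and 384 * 2 = 768 bins along y per
# keypoint; sigma sets the width of the Gaussian label smoothing on those bins.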
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=1.,
widen_factor=1.,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=1024,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
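# Head geometry for this input size: the CSPNeXt backbone downsamples by 32, so
# RTMCCHead sees a 9 x 12 feature map (in_featuremap_size = (288 // 32, 384 // 32))
# and emits one SimCC x/y distribution per Halpe keypoint (out_channels = 26).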
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(192, 256),
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-m_udp-body7_210e-256x192-e0c9327b_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=768,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
    test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
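# Validation follows the usual COCO top-down protocol: person boxes are read
# from the pre-computed detector results in bbox_file instead of ground-truth
# boxes, matching the detector-box setting typically reported for COCO AP.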
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE')
]
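# Note the split: validation runs CocoMetric (COCO AP on val2017 with detector
# boxes), while the test stage evaluates PCK / AUC / EPE over the combined
# multi-dataset loader defined above.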
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(288, 384),
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-m_udp-body7_210e-384x288-b9bc2b57_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=768,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.0,
drop_path=0.0,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
    test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE')
]
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (192, 256)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 512
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=768,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
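# The data_preprocessor normalizes with the standard ImageNet RGB mean/std after
# BGR -> RGB conversion, and init_cfg with prefix='backbone.' loads only the
# backbone weights from the body7-pretrained RTMPose-m checkpoint, so the RTMCC
# head starts from its default initialization.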
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (288, 384)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 512
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.67,
widen_factor=0.75,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=768,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
# backend_args = dict(backend='local')
backend_args = dict(
    backend='petrel',
    path_mapping=dict({
        # the original listed the same key twice; mapping both the bare and the
        # './'-prefixed data_root (an assumed convention) keeps the intent while
        # avoiding the duplicate dict key
        f'./{data_root}': 's3://openmmlab/datasets/',
        f'{data_root}': 's3://openmmlab/datasets/'
    }))
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (192, 256)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 1024
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.33,
widen_factor=0.5,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=512,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.6, 1.4],
rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(192, 256),
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.33,
widen_factor=0.5,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-s_udp-body7_210e-256x192-8c9ccbdb_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=512,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.75, 1.25],
rotate_factor=60),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE')
]
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (192, 256)
# runtime
max_epochs = 700
stage2_num_epochs = 30
base_lr = 4e-3
train_batch_size = 1024
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.167,
widen_factor=0.375,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=384,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.6, 1.4],
rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
# dict(
# type='EMAHook',
# ema_type='ExpMomentumEMA',
# momentum=0.0002,
# update_buffers=True,
# priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
_base_ = ['../../../_base_/default_runtime.py']
# runtime
max_epochs = 420
stage2_num_epochs = 20
base_lr = 4e-3
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
# use cosine lr from 210 to 420 epoch
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=(192, 256),
sigma=(4.9, 5.66),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=0.167,
widen_factor=0.375,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-tiny_udp-body7_210e-256x192-a3775292_20230504.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=384,
out_channels=17,
input_size=codec['input_size'],
in_featuremap_size=tuple([s // 32 for s in codec['input_size']]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.75, 1.25],
rotate_factor=60),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='mmdet.YOLOXHSVRandomAug'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(type='GenerateTarget', encoder=codec),
dict(type='PackPoseInputs')
]
# mapping
aic_coco = [
(0, 6),
(1, 8),
(2, 10),
(3, 5),
(4, 7),
(5, 9),
(6, 12),
(7, 14),
(8, 16),
(9, 11),
(10, 13),
(11, 15),
]
crowdpose_coco = [
(0, 5),
(1, 6),
(2, 7),
(3, 8),
(4, 9),
(5, 10),
(6, 11),
(7, 12),
(8, 13),
(9, 14),
(10, 15),
(11, 16),
]
mpii_coco = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_coco = [
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
ochuman_coco = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
posetrack_coco = [
(0, 0),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_train2017.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
# data loaders
train_dataloader = dict(
batch_size=256,
num_workers=10,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=aic_coco)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=crowdpose_coco)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=mpii_coco)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=jhmdb_coco)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=halpe_coco)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(type='KeypointConverter', num_keypoints=17, mapping=ochuman_coco)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter', num_keypoints=17, mapping=posetrack_coco)
],
)
val_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/person_keypoints_val2017.json',
bbox_file=f'{data_root}coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
data_prefix=dict(img='detection/coco/val2017/'),
test_mode=True,
pipeline=val_pipeline,
))
test_dataloader = dict(
batch_size=64,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
# hooks
default_hooks = dict(
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1))
# default_hooks = dict(
# checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
# dict(
# type='EMAHook',
# ema_type='ExpMomentumEMA',
# momentum=0.0002,
# update_buffers=True,
# priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json')
test_evaluator = [
dict(type='PCKAccuracy', thr=0.1),
dict(type='AUC'),
dict(type='EPE')
]
_base_ = ['../../../_base_/default_runtime.py']
# common setting
num_keypoints = 26
input_size = (288, 384)
# runtime
max_epochs = 700
stage2_num_epochs = 20
base_lr = 4e-3
train_batch_size = 256
val_batch_size = 64
train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
clip_grad=dict(max_norm=35, norm_type=2),
paramwise_cfg=dict(
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0e-5,
by_epoch=False,
begin=0,
end=1000),
dict(
type='CosineAnnealingLR',
eta_min=base_lr * 0.05,
begin=max_epochs // 2,
end=max_epochs,
T_max=max_epochs // 2,
by_epoch=True,
convert_to_iter_based=True),
]
# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=1024)
# codec settings
codec = dict(
type='SimCCLabel',
input_size=input_size,
sigma=(6., 6.93),
simcc_split_ratio=2.0,
normalize=False,
use_dark=False)
# model settings
model = dict(
type='TopdownPoseEstimator',
data_preprocessor=dict(
type='PoseDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(
_scope_='mmdet',
type='CSPNeXt',
arch='P5',
expand_ratio=0.5,
deepen_factor=1.33,
widen_factor=1.25,
out_indices=(4, ),
channel_attention=True,
norm_cfg=dict(type='SyncBN'),
act_cfg=dict(type='SiLU'),
init_cfg=dict(
type='Pretrained',
prefix='backbone.',
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
'rtmposev1/cspnext-x_udp-body7_210e-384x288-d28b58e6_20230529.pth' # noqa
)),
head=dict(
type='RTMCCHead',
in_channels=1280,
out_channels=num_keypoints,
input_size=input_size,
in_featuremap_size=tuple([s // 32 for s in input_size]),
simcc_split_ratio=codec['simcc_split_ratio'],
final_layer_kernel_size=7,
gau_cfg=dict(
hidden_dims=256,
s=128,
expansion_factor=2,
dropout_rate=0.,
drop_path=0.,
act_fn='SiLU',
use_rel_bias=False,
pos_enc=False),
loss=dict(
type='KLDiscretLoss',
use_target_weight=True,
beta=10.,
label_softmax=True),
decoder=codec),
test_cfg=dict(flip_test=True))
# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = 'data/'
backend_args = dict(backend='local')
# pipelines
train_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform', scale_factor=[0.5, 1.5], rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PhotometricDistortion'),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=1.0),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
val_pipeline = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(type='PackPoseInputs')
]
train_pipeline_stage2 = [
dict(type='LoadImage', backend_args=backend_args),
dict(type='GetBBoxCenterScale'),
dict(type='RandomFlip', direction='horizontal'),
dict(type='RandomHalfBody'),
dict(
type='RandomBBoxTransform',
shift_factor=0.,
scale_factor=[0.5, 1.5],
rotate_factor=90),
dict(type='TopdownAffine', input_size=codec['input_size']),
dict(
type='Albumentation',
transforms=[
dict(type='Blur', p=0.1),
dict(type='MedianBlur', p=0.1),
dict(
type='CoarseDropout',
max_holes=1,
max_height=0.4,
max_width=0.4,
min_holes=1,
min_height=0.2,
min_width=0.2,
p=0.5),
]),
dict(
type='GenerateTarget',
encoder=codec,
use_dataset_keypoint_weights=True),
dict(type='PackPoseInputs')
]
# mapping
coco_halpe26 = [(i, i) for i in range(17)] + [(17, 20), (18, 22), (19, 24),
(20, 21), (21, 23), (22, 25)]
aic_halpe26 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
(5, 9), (6, 12), (7, 14), (8, 16), (9, 11), (10, 13), (11, 15),
(12, 17), (13, 18)]
crowdpose_halpe26 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11),
(7, 12), (8, 13), (9, 14), (10, 15), (11, 16), (12, 17),
(13, 18)]
mpii_halpe26 = [
(0, 16),
(1, 14),
(2, 12),
(3, 11),
(4, 13),
(5, 15),
(8, 18),
(9, 17),
(10, 10),
(11, 8),
(12, 6),
(13, 5),
(14, 7),
(15, 9),
]
jhmdb_halpe26 = [
(0, 18),
(2, 17),
(3, 6),
(4, 5),
(5, 12),
(6, 11),
(7, 8),
(8, 7),
(9, 14),
(10, 13),
(11, 10),
(12, 9),
(13, 16),
(14, 15),
]
halpe_halpe26 = [(i, i) for i in range(26)]
ochuman_halpe26 = [(i, i) for i in range(17)]
posetrack_halpe26 = [
(0, 0),
(2, 17),
(3, 3),
(4, 4),
(5, 5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(10, 10),
(11, 11),
(12, 12),
(13, 13),
(14, 14),
(15, 15),
(16, 16),
]
# train datasets
dataset_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
data_prefix=dict(img='detection/coco/train2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
dataset_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_train.json',
data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint'
'_train_20170902/keypoint_train_images_20170902/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
dataset_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
dataset_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_train.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
dataset_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_train.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
dataset_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_train_v1.json',
data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
dataset_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_train.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
# data loaders
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=10,
pin_memory=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
dataset_coco,
dataset_aic,
dataset_crowdpose,
dataset_mpii,
dataset_jhmdb,
dataset_halpe,
dataset_posetrack,
],
pipeline=train_pipeline,
test_mode=False,
))
# val datasets
val_coco = dict(
type=dataset_type,
data_root=data_root,
data_mode=data_mode,
ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=coco_halpe26)
],
)
val_aic = dict(
type='AicDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='aic/annotations/aic_val.json',
data_prefix=dict(
img='pose/ai_challenge/ai_challenger_keypoint'
'_validation_20170911/keypoint_validation_images_20170911/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=aic_halpe26)
],
)
val_crowdpose = dict(
type='CrowdPoseDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='crowdpose/annotations/mmpose_crowdpose_test.json',
data_prefix=dict(img='pose/CrowdPose/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=crowdpose_halpe26)
],
)
val_mpii = dict(
type='MpiiDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='mpii/annotations/mpii_val.json',
data_prefix=dict(img='pose/MPI/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=mpii_halpe26)
],
)
val_jhmdb = dict(
type='JhmdbDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='jhmdb/annotations/Sub1_test.json',
data_prefix=dict(img='pose/JHMDB/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=jhmdb_halpe26)
],
)
val_halpe = dict(
type='HalpeDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='halpe/annotations/halpe_val_v1.json',
data_prefix=dict(img='detection/coco/val2017/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=halpe_halpe26)
],
)
val_ochuman = dict(
type='OCHumanDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='ochuman/annotations/'
'ochuman_coco_format_val_range_0.00_1.00.json',
data_prefix=dict(img='pose/OCHuman/images/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=ochuman_halpe26)
],
)
val_posetrack = dict(
type='PoseTrack18Dataset',
data_root=data_root,
data_mode=data_mode,
ann_file='posetrack18/annotations/posetrack18_val.json',
data_prefix=dict(img='pose/PoseChallenge2018/'),
pipeline=[
dict(
type='KeypointConverter',
num_keypoints=num_keypoints,
mapping=posetrack_halpe26)
],
)
val_dataloader = dict(
batch_size=val_batch_size,
num_workers=10,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
dataset=dict(
type='CombinedDataset',
metainfo=dict(from_file='configs/_base_/datasets/halpe26.py'),
datasets=[
val_coco,
val_aic,
val_crowdpose,
val_mpii,
val_jhmdb,
val_halpe,
val_ochuman,
val_posetrack,
],
pipeline=val_pipeline,
test_mode=True,
))
test_dataloader = val_dataloader
# hooks
default_hooks = dict(
checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1))
custom_hooks = [
dict(
type='EMAHook',
ema_type='ExpMomentumEMA',
momentum=0.0002,
update_buffers=True,
priority=49),
dict(
type='mmdet.PipelineSwitchHook',
switch_epoch=max_epochs - stage2_num_epochs,
switch_pipeline=train_pipeline_stage2)
]
# evaluators
test_evaluator = [dict(type='PCKAccuracy', thr=0.1), dict(type='AUC')]
val_evaluator = test_evaluator
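# PCK@0.1 and AUC above are computed jointly over the combined validation set;
# checkpointing keys on the AUC metric via `save_best='AUC'` above.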
<!-- [ALGORITHM] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-030-58580-8_27">RTMPose (arXiv'2023)</a></summary>
```bibtex
@misc{https://doi.org/10.48550/arxiv.2303.07399,
doi = {10.48550/ARXIV.2303.07399},
url = {https://arxiv.org/abs/2303.07399},
author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose},
publisher = {arXiv},
year = {2023},
copyright = {Creative Commons Attribution 4.0 International}
}
```
</details>
<!-- [BACKBONE] -->
<details>
<summary align="right"><a href="https://arxiv.org/abs/2212.07784">RTMDet (arXiv'2022)</a></summary>
```bibtex
@misc{lyu2022rtmdet,
title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
year={2022},
eprint={2212.07784},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
</details>
<!-- [DATASET] -->
<details>
<summary align="right"><a href="https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48">COCO (ECCV'2014)</a></summary>
```bibtex
@inproceedings{lin2014microsoft,
title={Microsoft coco: Common objects in context},
author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
booktitle={European conference on computer vision},
pages={740--755},
year={2014},
organization={Springer}
}
```
</details>
- Results on COCO val2017 with a detector having human AP of 56.4 on the COCO val2017 dataset.
- `*` denotes a model trained on 7 public datasets:
- [AI Challenger](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#aic)
- [MS COCO](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#coco)
- [CrowdPose](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#crowdpose)
- [MPII](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#mpii)
- [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#sub-jhmdb-dataset)
- [Halpe](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html#halpe)
- [PoseTrack18](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#posetrack18)
- `Body8` denotes evaluation on the 7 datasets above combined with [OCHuman](https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_body_keypoint.html#ochuman).
| Config | Input Size | AP<sup><br>(COCO) | PCK@0.1<sup><br>(Body8) | AUC<sup><br>(Body8) | EPE<sup><br>(Body8) | Params(M) | FLOPS(G) | Download |
| :--------------------------------------------: | :--------: | :---------------: | :---------------------: | :-----------------: | :-----------------: | :-------: | :------: | :-----------------------------------------------: |
| [RTMPose-t\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-t_8xb256-420e_body8-256x192.py) | 256x192 | 65.9 | 91.44 | 63.18 | 19.45 | 3.34 | 0.36 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-t_simcc-body7_pt-body7_420e-256x192-026a1439_20230504.pth) |
| [RTMPose-s\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-s_8xb256-420e_body8-256x192.py) | 256x192 | 69.7 | 92.45 | 65.15 | 17.85 | 5.47 | 0.68 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-s_simcc-body7_pt-body7_420e-256x192-acd4a1ef_20230504.pth) |
| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-256x192.py) | 256x192 | 74.9 | 94.25 | 68.59 | 15.12 | 13.59 | 1.93 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_20230504.pth) |
| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-256x192.py) | 256x192 | 76.7 | 95.08 | 70.14 | 13.79 | 27.66 | 4.16 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-256x192-4dba18fc_20230504.pth) |
| [RTMPose-m\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-m_8xb256-420e_body8-384x288.py) | 384x288 | 76.6 | 94.64 | 70.38 | 13.98 | 13.72 | 4.33 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth) |
| [RTMPose-l\*](/configs/body_2d_keypoint/rtmpose/body8/rtmpose-l_8xb256-420e_body8-384x288.py) | 384x288 | 78.3 | 95.36 | 71.58 | 13.08 | 27.79 | 9.35 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-body7_pt-body7_420e-384x288-3f5a1437_20230504.pth) |
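The checkpoints listed above can be loaded directly for inference. Below is a minimal sketch, assuming mmpose >= 1.0 and its high-level `MMPoseInferencer` API; the config path and checkpoint URL are copied from the RTMPose-m (256x192) row of the table, and `person.jpg` is a placeholder input image:

```python
# Minimal inference sketch (assumes mmpose >= 1.0 is installed; the config and
# weight paths are the RTMPose-m 256x192 entries from the table above, and
# 'person.jpg' is a placeholder input image).
from mmpose.apis import MMPoseInferencer

inferencer = MMPoseInferencer(
    pose2d='configs/body_2d_keypoint/rtmpose/body8/'
    'rtmpose-m_8xb256-420e_body8-256x192.py',
    pose2d_weights='https://download.openmmlab.com/mmpose/v1/projects/'
    'rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-256x192-e48f03d0_'
    '20230504.pth')

# The inferencer yields one result dict per input image; 'predictions' holds
# per-instance keypoints and keypoint scores.
result = next(inferencer('person.jpg'))
print(result['predictions'])
```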