_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# The flag indicates using joint training
omnisource = True
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
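# for a different GPU count, scale the lr linearly (Linear Scaling Rule),
# e.g. lr=0.075 for 4 GPUs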
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = './work_dirs/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb'
find_unused_parameters = False
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
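    # a web image has no temporal axis; repeat it clip_len times to build a pseudo clip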
dict(type='BuildPseudoClip', clip_len=8),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
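            # class-aware sampling of web images, smoothed with power=0.5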
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb')
find_unused_parameters = False
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
gg_root = 'data/OmniSource/googleimage_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_gg = ('data/OmniSource/annotations/googleimage_200/'
'tsn_8seg_googleimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_gg_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
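    # per-source batch sizes: 12 Kinetics videos and 64 Google images per GPU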
omni_videos_per_gpu=[12, 64],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_gg,
data_prefix=gg_root,
pipeline=train_gg_pipeline)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
iv_root = 'data/OmniSource/insvideo_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
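            # raw videos are split into clips named part_{i}.mp4; sample only
            # clips with positive (pseudo) labels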
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
iv_root = 'data/OmniSource/insvideo_200'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
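    # per-source batch sizes: [kinetics videos, web images, web videos, raw videos]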
omni_videos_per_gpu=[12, 64, 12, 12],
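    # relative sampling ratio of the four training sources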
train_ratio=[2, 1, 1, 1],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb'
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
omni_videos_per_gpu=[12, 64],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_webimage_rgb')
# R2plus1D
[A closer look at spatiotemporal convolutions for action recognition](https://openaccess.thecvf.com/content_cvpr_2018/html/Tran_A_Closer_Look_CVPR_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
In this paper we discuss several forms of spatiotemporal convolutions for video analysis and study their effects on action recognition. Our motivation stems from the observation that 2D CNNs applied to individual frames of the video have remained solid performers in action recognition. In this work we empirically demonstrate the accuracy advantages of 3D CNNs over 2D CNNs within the framework of residual learning. Furthermore, we show that factorizing the 3D convolutional filters into separate spatial and temporal components yields significant advantages in accuracy. Our empirical study leads to the design of a new spatiotemporal convolutional block "R(2+1)D" which gives rise to CNNs that achieve results comparable or superior to the state-of-the-art on Sports-1M, Kinetics, UCF101 and HMDB51.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143043885-3d00413c-b556-445e-9673-f5805c08c195.png" width="800"/>
</div>
## Results and Models
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------------ | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet34 | None | 67.30 | 87.65 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json) |
| [r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet34 | None | 67.3 | 87.8 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json) |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 68.68 | 88.36 | 1.6 (80x3 frames) | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json) |
| [r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 74.60 | 91.59 | 0.5 (320x3 frames) | 12975 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json) |
:::{note}
1. The **gpus** column is the number of GPUs we used to get the checkpoint. Note that the configs we provide are for 8 GPUs by default.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch below this note).
2. The **inference_time** is obtained with this [benchmark script](/tools/analysis/benchmark.py), where we use the frame-sampling strategy of the test setting and measure only the model inference time, excluding IO and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 when computing the inference time.
3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
:::
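For reference, the scaling in the note can be sketched as below (a minimal illustration; `scaled_lr` and the constants are ours, not an MMAction2 API):

```python
# Linear Scaling Rule: keep lr / total_batch_size constant.
BASE_LR = 0.02      # lr that matches the default 8 GPUs x 2 videos per GPU
BASE_BATCH = 8 * 2  # default total batch size

def scaled_lr(num_gpus: int, videos_per_gpu: int) -> float:
    """Scale the learning rate linearly with the total batch size."""
    return BASE_LR * (num_gpus * videos_per_gpu) / BASE_BATCH

print(scaled_lr(4, 2))   # 0.01 -> half the batch, half the lr
print(scaled_lr(16, 4))  # 0.08 -> 4x the batch, 4x the lr
```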
For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the R(2+1)D model on the Kinetics-400 dataset in a deterministic manner, with periodic validation.
```shell
python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
--work-dir work_dirs/r2plus1d_r34_3d_8x8x1_180e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the R(2+1)D model on the Kinetics-400 dataset and dump the results to a JSON file.
```shell
python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{tran2018closer,
title={A closer look at spatiotemporal convolutions for action recognition},
author={Tran, Du and Wang, Heng and Torresani, Lorenzo and Ray, Jamie and LeCun, Yann and Paluri, Manohar},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={6450--6459},
year={2018}
}
```
# R2plus1D
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{tran2018closer,
title={A closer look at spatiotemporal convolutions for action recognition},
author={Tran, Du and Wang, Heng and Torresani, Lorenzo and Ray, Jamie and LeCun, Yann and Paluri, Manohar},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={6450--6459},
year={2018}
}
```
## Model Zoo
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time (video/s) | gpu_mem (M) | ckpt | log | json |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet34 | None | 67.30 | 87.65 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json) |
| [r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet34 | None | 67.3 | 87.8 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json) |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 68.68 | 88.36 | 1.6 (80x3 frames) | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json) |
| [r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 74.60 | 91.59 | 0.5 (320x3 frames) | 12975 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json) |
Notes:
1. The **gpus** column is the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are for training with 8 GPUs.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate proportionally to the batch size when using a different number of GPUs or a different number of videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. The **inference time** is obtained with the [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and measuring only the model inference time, excluding IO and pre-processing time. For each config, MMAction2 uses 1 GPU and sets the batch size (videos per GPU) to 1 when computing the inference time.
3. The Kinetics400 validation set we use contains 19796 videos, which can be downloaded from [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line: video ID, number of frames, label index) and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (label index to class name) are also provided.
For details on data preparation, please refer to the Kinetics400 section of [Data Preparation](/docs/zh_cn/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the R(2+1)D model on the Kinetics400 dataset in a deterministic manner, with periodic validation.
```shell
python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
--work-dir work_dirs/r2plus1d_r34_3d_8x8x1_180e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more training details, you can refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the R(2+1)D model on the Kinetics400 dataset and dump the results to a JSON file.
```shell
python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more testing details, you can refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
Collections:
- Name: R2Plus1D
README: configs/recognition/r2plus1d/README.md
Paper:
URL: https://arxiv.org/abs/1711.11248
Title: A Closer Look at Spatiotemporal Convolutions for Action Recognition
Models:
- Config: configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 8
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: r2plus1d_r34_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 67.3
Top 5 Accuracy: 87.65
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 16
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 67.3
Top 5 Accuracy: 87.8
Task: Action Recognition
    Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json
    Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 8
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: r2plus1d_r34_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 68.68
Top 5 Accuracy: 88.36
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 6
Epochs: 180
FLOPs: 212701677568
Parameters: 63759281
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: r2plus1d_r34_32x2x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.6
Top 5 Accuracy: 91.59
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth
_base_ = ['./r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD', lr=0.075, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/r2plus1d_r34_3d_32x2x1_180e_kinetics400_rgb/'
_base_ = [
'../../_base_/models/r2plus1d_r34.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
total_epochs = 180
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/'
find_unused_parameters = False
_base_ = ['./r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py']
# model settings
model = dict(backbone=dict(act_cfg=dict(type='ReLU')))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD', lr=0.2, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/r2plus1d_r34_video_3d_8x8x1_180e_kinetics400_rgb/'
_base_ = ['../../_base_/models/r2plus1d_r34.py']
# model settings
model = dict(backbone=dict(act_cfg=dict(type='ReLU')))
# dataset settings
dataset_type = 'VideoDataset'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=1,
workers_per_gpu=2,
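    # inference-only config: ann_file and data_prefix are intentionally left
    # None and should be supplied when running inference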
test=dict(
type=dataset_type,
ann_file=None,
data_prefix=None,
pipeline=test_pipeline))
# SlowFast
[SlowFast Networks for Video Recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143044111-94676f64-7ba8-4081-9011-f8054bed7030.png" width="800"/>
</div>
## Results and Models
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :-------------------------------------------------------------------------------------------------------------------------------------------- | :------------: | :--: | :------------------: | :------: | :------: | :------: | :----------------------: | :--------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 74.75 | 91.73 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json) |
| [slowfast_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | None | 73.95 | 91.50 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json) |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.0 | 92.54 | 1.6 ((32+4)x10x3 frames) | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20210722-04e43ed4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log.json) |
| [slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.34 | 92.67 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb_20210722-bb725050.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) | short-side 320 | 8x3 | ResNet50 | None | 76.94 | 92.8 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.34 | 92.61 | | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.07 | 92.21 | x | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json) |
| [slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.58 | 92.85 | | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_r101_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet101 + ResNet50 | None | 76.69 | 93.07 | | 16628 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json) |
| [slowfast_r101_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet101 | None | 77.90 | 93.51 | | 25994 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json) |
| [slowfast_r152_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet152 + ResNet50 | None | 77.13 | 93.20 | | 10077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json) |
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------ | :--------: | :--: | :------: | :---------: | :------: | :------: | :---------------------: | :--------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowfast_r50_16x8x1_22e_sthv1_rgb](/configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | Kinetics400 | 49.67 | 79.00 | x | 9293 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. Note that the provided configs are written for 8 GPUs by default.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may need to set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch after this note).
2. The **inference_time** is measured with this [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and counting only the model inference time, excluding IO and pre-processing. For each setting, we use 1 GPU with a batch size (videos per GPU) of 1 to compute the inference time.
3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
:::
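To make the scaling rule concrete, here is a minimal sketch; the `scale_lr` helper is illustrative, not part of MMAction2:
```python
def scale_lr(base_lr, base_videos, new_videos):
    """Scale the learning rate linearly with the total videos per step."""
    return base_lr * new_videos / base_videos

# From the note above: lr=0.01 at 4 GPUs x 2 video/gpu scales to
# 16 GPUs x 4 video/gpu as follows.
print(scale_lr(0.01, 4 * 2, 16 * 4))  # -> 0.08
```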
For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the SlowFast model on the Kinetics-400 dataset with a deterministic setting and periodic validation.
```shell
python tools/train.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
--work-dir work_dirs/slowfast_r50_4x16x1_256e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the SlowFast model on the Kinetics-400 dataset and dump the result to a JSON file; a conceptual sketch of the score fusion follows the command.
```shell
python tools/test.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
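Conceptually, `--average-clips=prob` turns each test view's logits into class probabilities and averages them over all views of a video before prediction; below is a minimal sketch of that fusion (the function and variable names are ours, not the MMAction2 API):
```python
import torch

def average_clips_prob(view_logits):
    """Fuse per-view logits ([num_views, num_classes]) of one video.

    With the test pipeline of these configs, num_views = 10 clips x ThreeCrop = 30.
    """
    return torch.softmax(view_logits, dim=1).mean(dim=0)

fused = average_clips_prob(torch.randn(30, 400))  # Kinetics-400 has 400 classes
print(fused.argmax().item())
```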
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{feichtenhofer2019slowfast,
title={Slowfast networks for video recognition},
author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
booktitle={Proceedings of the IEEE international conference on computer vision},
pages={6202--6211},
year={2019}
}
```
# SlowFast
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{feichtenhofer2019slowfast,
title={Slowfast networks for video recognition},
author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
booktitle={Proceedings of the IEEE international conference on computer vision},
pages={6202--6211},
year={2019}
}
```
## Model Zoo
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------------------------------------------------- | :------------: | :--: | :------------------: | :------: | :------: | :------: | :----------------------: | :--------: | :----: | :---: | :----: |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 74.75 | 91.73 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json) |
| [slowfast_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | None | 73.95 | 91.50 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json) |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.0 | 92.54 | 1.6 ((32+4)x10x3 frames) | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20210722-04e43ed4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log.json) |
| [slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.34 | 92.67 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb_20210722-bb725050.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) | short-side 320 | 8x3 | ResNet50 | None | 76.94 | 92.8 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.34 | 92.61 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.07 | 92.21 | x | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json) |
| [slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.58 | 92.85 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_r101_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet101 + ResNet50 | None | 76.69 | 93.07 | | 16628 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json) |
| [slowfast_r101_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet101 | None | 77.90 | 93.51 | | 25994 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json) |
| [slowfast_r152_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet152 + ResNet50 | None | 77.13 | 93.20 | | 10077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json) |
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------ | :--------: | :--: | :------: | :---------: | :------: | :------: | :---------------------: | :--------: | :----: | :---: | :----: |
| [slowfast_r50_16x8x1_22e_sthv1_rgb](/configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | Kinetics400 | 49.67 | 79.00 | x | 9293 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json) |
Note:
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are written for training with 8 GPUs.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), the learning rate should be scaled proportionally to the batch size when a different number of GPUs or videos per GPU is used,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. The **inference time** is measured with this [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and counting only the model inference time,
   excluding IO and pre-processing. For each setting, MMAction2 uses 1 GPU with a batch size (videos per GPU) of 1 to compute the inference time.
3. The Kinetics400 validation set we use contains 19796 videos, which can be downloaded from [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line: video ID, number of frames, label index) and [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (label index to class name) are also available.
For details on data preparation, refer to the Kinetics400 section in [Data Preparation](/docs/zh_cn/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the SlowFast model on the Kinetics-400 dataset with a deterministic setting and periodic validation.
```shell
python tools/train.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
--work-dir work_dirs/slowfast_r50_4x16x1_256e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more training details, refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the SlowFast model on the Kinetics-400 dataset and dump the result to a JSON file.
```shell
python tools/test.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more testing details, refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
Collections:
- Name: SlowFast
README: configs/recognition/slowfast/README.md
Paper:
URL: https://arxiv.org/abs/1812.03982
Title: SlowFast Networks for Video Recognition
Models:
- Config: configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.75
Top 5 Accuracy: 91.73
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth
- Config: configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r50_video_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.95
Top 5 Accuracy: 91.50
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth
- Config: configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: slowfast_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 75.64
Top 5 Accuracy: 92.3
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth
- Config: configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: slowfast_r50_8x8x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.94
Top 5 Accuracy: 92.8
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth
- Config: configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.34
Top 5 Accuracy: 92.61
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth
- Config: configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.07
Top 5 Accuracy: 92.21
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth
- Config: configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.58
Top 5 Accuracy: 92.85
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth
- Config: configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet101 + ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 65042780160
Parameters: 62384312
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r101_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.69
Top 5 Accuracy: 93.07
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth
- Config: configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet101
Batch Size: 8
Epochs: 256
FLOPs: 127070375936
Parameters: 62912312
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r101_8x8x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 77.9
Top 5 Accuracy: 93.51
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth
- Config: configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet152 + ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 91515654144
Parameters: 84843704
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r152_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 77.13
Top 5 Accuracy: 93.2
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth
- Config: configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 22
FLOPs: 132442627584
Parameters: 34044630
Pretrained: Kinetics400
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r50_16x8x1_22e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 49.67
Top 5 Accuracy: 79.00
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth
model = dict(
type='Recognizer3D',
backbone=dict(
type='ResNet3dSlowFast',
pretrained=None,
resample_rate=4, # tau
speed_ratio=4, # alpha
channel_ratio=8, # beta_inv
slow_pathway=dict(
type='resnet3d',
depth=50,
pretrained=None,
lateral=True,
lateral_norm=True,
fusion_kernel=7,
conv1_kernel=(1, 7, 7),
dilations=(1, 1, 1, 1),
conv1_stride_t=1,
pool1_stride_t=1,
inflate=(0, 0, 1, 1),
norm_eval=False),
fast_pathway=dict(
type='resnet3d',
depth=50,
pretrained=None,
lateral=False,
base_channels=8,
conv1_kernel=(5, 7, 7),
conv1_stride_t=1,
pool1_stride_t=1,
norm_eval=False)),
cls_head=dict(
type='SlowFastHead',
in_channels=2304, # 2048+256
num_classes=400,
spatial_type='avg',
dropout_ratio=0.5),
# model training and testing settings
train_cfg=None,
test_cfg=dict(average_clips='prob'))
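# NOTE (comment added for clarity, not in the original config): with the
# 32-frame clips sampled in the pipelines below, resample_rate=4 gives the
# slow pathway 32 / 4 = 8 frames, while speed_ratio=4 gives the fast pathway
# 8 * 4 = 32 frames; channel_ratio=8 makes the fast pathway 64 / 8 = 8
# channels wide (hence base_channels=8), so the head input is
# 2048 + 2048 / 8 = 2304 channels.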
train_cfg = None
test_cfg = dict(average_clips='prob')
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='SGD', lr=0.1, momentum=0.9,
    weight_decay=0.0001)  # lr=0.1 is for 8 GPUs; scale to 0.2 for 16 GPUs
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
lr_config = dict(policy='step', step=[94, 154, 196])
total_epochs = 239
evaluation = dict(
interval=3, metrics=['top_k_accuracy', 'mean_class_accuracy'])
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
checkpoint_config = dict(interval=3)
workflow = [('train', 1)]
find_unused_parameters = False
multigrid = dict(
long_cycle=True,
short_cycle=True,
epoch_factor=1.5,
long_cycle_factors=[[0.25, 0.7071], [0.5, 0.7071], [0.5, 1], [1, 1]],
short_cycle_factors=[0.5, 0.7071],
default_s=(224, 224),
)
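# NOTE (comment added for clarity, not in the original config): with
# epoch_factor=1.5 the multigrid schedule stretches total_epochs=239 to about
# 239 * 1.5 ≈ 358 effective epochs, matching the '358e' in the work_dir below.
# Long cycles jointly rescale batch size and input shape via
# long_cycle_factors; short cycles rescale the spatial size between iterations
# via short_cycle_factors.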
precise_bn = dict(num_iters=200, interval=3)
load_from = None
resume_from = None
work_dir = './work_dirs/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb'
_base_ = [
'../../_base_/models/slowfast_r50.py', '../../_base_/default_runtime.py'
]
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
min_lr=0,
warmup='linear',
warmup_by_epoch=True,
warmup_iters=34)
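# NOTE (comment added for clarity): with warmup_by_epoch=True, warmup_iters is
# counted in epochs, i.e. the learning rate ramps up linearly over the first
# 34 epochs before cosine annealing takes over.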
total_epochs = 256
# precise bn
precise_bn = dict(num_iters=200, interval=1)
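# NOTE (comment added for clarity): the precise-BN hook re-estimates BatchNorm
# statistics over 200 training iterations after every epoch (interval=1),
# before evaluation.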
# runtime settings
checkpoint_config = dict(interval=4)
work_dir = './work_dirs/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb'
find_unused_parameters = False