_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# The flag indicates using joint training
omnisource = True
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
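# for a different GPU count, scale the lr linearly (Linear Scaling Rule),
# e.g. lr=0.075 for 4 GPUs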
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = './work_dirs/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb'
find_unused_parameters = False
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
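    # a web image has no temporal axis; repeat it clip_len times to build a pseudo clip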
dict(type='BuildPseudoClip', clip_len=8),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
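            # class-aware sampling of web images, smoothed with power=0.5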
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb')
find_unused_parameters = False
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
gg_root = 'data/OmniSource/googleimage_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_gg = ('data/OmniSource/annotations/googleimage_200/'
'tsn_8seg_googleimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_gg_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
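    # per-source batch sizes: 12 Kinetics videos and 64 Google images per GPU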
omni_videos_per_gpu=[12, 64],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_gg,
data_prefix=gg_root,
pipeline=train_gg_pipeline)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
iv_root = 'data/OmniSource/insvideo_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
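            # raw videos are split into clips named part_{i}.mp4; sample only
            # clips with positive (pseudo) labels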
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
iv_root = 'data/OmniSource/insvideo_200'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
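    # per-source batch sizes: [kinetics videos, web images, web videos, raw videos]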
omni_videos_per_gpu=[12, 64, 12, 12],
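    # relative sampling ratio of the four training sources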
train_ratio=[2, 1, 1, 1],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb')
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb'
_base_ = [
'../../../_base_/models/tsn_r50.py',
'../../../_base_/schedules/sgd_100e.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# The flag indicates using joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
omni_videos_per_gpu=[12, 64],
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.00375, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = ('./work_dirs/omnisource/'
'tsn_r50_1x1x8_100e_minikinetics_webimage_rgb')
# R2plus1D
[A closer look at spatiotemporal convolutions for action recognition](https://openaccess.thecvf.com/content_cvpr_2018/html/Tran_A_Closer_Look_CVPR_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
In this paper we discuss several forms of spatiotemporal convolutions for video analysis and study their effects on action recognition. Our motivation stems from the observation that 2D CNNs applied to individual frames of the video have remained solid performers in action recognition. In this work we empirically demonstrate the accuracy advantages of 3D CNNs over 2D CNNs within the framework of residual learning. Furthermore, we show that factorizing the 3D convolutional filters into separate spatial and temporal components yields significant advantages in accuracy. Our empirical study leads to the design of a new spatiotemporal convolutional block "R(2+1)D" which gives rise to CNNs that achieve results comparable or superior to the state-of-the-art on Sports-1M, Kinetics, UCF101 and HMDB51.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143043885-3d00413c-b556-445e-9673-f5805c08c195.png" width="800"/>
</div>
## Results and Models
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------------ | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet34 | None | 67.30 | 87.65 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json) |
| [r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet34 | None | 67.3 | 87.8 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json) |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 68.68 | 88.36 | 1.6 (80x3 frames) | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json) |
| [r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 74.60 | 91.59 | 0.5 (320x3 frames) | 12975 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json) |
:::{note}
1. The **gpus** column is the number of GPUs we used to get the checkpoint. Note that the configs we provide are for 8 GPUs by default.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch below this note).
2. The **inference_time** is obtained with this [benchmark script](/tools/analysis/benchmark.py), where we use the frame-sampling strategy of the test setting and measure only the model inference time, excluding IO and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 when computing the inference time.
3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
:::
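For reference, the scaling in the note can be sketched as below (a minimal illustration; `scaled_lr` and the constants are ours, not an MMAction2 API):

```python
# Linear Scaling Rule: keep lr / total_batch_size constant.
BASE_LR = 0.02      # lr that matches the default 8 GPUs x 2 videos per GPU
BASE_BATCH = 8 * 2  # default total batch size

def scaled_lr(num_gpus: int, videos_per_gpu: int) -> float:
    """Scale the learning rate linearly with the total batch size."""
    return BASE_LR * (num_gpus * videos_per_gpu) / BASE_BATCH

print(scaled_lr(4, 2))   # 0.01 -> half the batch, half the lr
print(scaled_lr(16, 4))  # 0.08 -> 4x the batch, 4x the lr
```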
For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the R(2+1)D model on the Kinetics-400 dataset in a deterministic manner, with periodic validation.
```shell
python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
--work-dir work_dirs/r2plus1d_r34_3d_8x8x1_180e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the R(2+1)D model on the Kinetics-400 dataset and dump the results to a JSON file.
```shell
python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{tran2018closer,
title={A closer look at spatiotemporal convolutions for action recognition},
author={Tran, Du and Wang, Heng and Torresani, Lorenzo and Ray, Jamie and LeCun, Yann and Paluri, Manohar},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={6450--6459},
year={2018}
}
```
# R2plus1D
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{tran2018closer,
title={A closer look at spatiotemporal convolutions for action recognition},
author={Tran, Du and Wang, Heng and Torresani, Lorenzo and Ray, Jamie and LeCun, Yann and Paluri, Manohar},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={6450--6459},
year={2018}
}
```
## Model Zoo
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time (video/s) | gpu_mem (M) | ckpt | log | json |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet34 | None | 67.30 | 87.65 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json) |
| [r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet34 | None | 67.3 | 87.8 | x | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json) |
| [r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 68.68 | 88.36 | 1.6 (80x3 frames) | 5019 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json) |
| [r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet34 | None | 74.60 | 91.59 | 0.5 (320x3 frames) | 12975 | [ckpt](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log) | [json](https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json) |
Notes:
1. The **gpus** column is the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are for training with 8 GPUs.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate proportionally to the batch size when using a different number of GPUs or a different number of videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. The **inference time** is obtained with the [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and measuring only the model inference time, excluding IO and pre-processing time. For each config, MMAction2 uses 1 GPU and sets the batch size (videos per GPU) to 1 when computing the inference time.
3. The Kinetics400 validation set we use contains 19796 videos, which can be downloaded from [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line: video ID, number of frames, label index) and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (label index to class name) are also provided.
For details on data preparation, please refer to the Kinetics400 section of [Data Preparation](/docs/zh_cn/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the R(2+1)D model on the Kinetics400 dataset in a deterministic manner, with periodic validation.
```shell
python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
--work-dir work_dirs/r2plus1d_r34_3d_8x8x1_180e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more training details, you can refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the R(2+1)D model on the Kinetics400 dataset and dump the results to a JSON file.
```shell
python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more testing details, you can refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
Collections:
- Name: R2Plus1D
README: configs/recognition/r2plus1d/README.md
Paper:
URL: https://arxiv.org/abs/1711.11248
Title: A Closer Look at Spatiotemporal Convolutions for Action Recognition
Models:
- Config: configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 8
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: r2plus1d_r34_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 67.3
Top 5 Accuracy: 87.65
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 16
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 67.3
Top 5 Accuracy: 87.8
Task: Action Recognition
    Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log.json
    Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/20200724_201360.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb_20200826-ab35a529.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 8
Epochs: 180
FLOPs: 53175572992
Parameters: 63759281
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: r2plus1d_r34_8x8x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 68.68
Top 5 Accuracy: 88.36
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth
- Config: configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py
In Collection: R2Plus1D
Metadata:
Architecture: ResNet34
Batch Size: 6
Epochs: 180
FLOPs: 212701677568
Parameters: 63759281
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: r2plus1d_r34_32x2x1_180e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.6
Top 5 Accuracy: 91.59
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log
Weights: https://download.openmmlab.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth
_base_ = ['./r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD', lr=0.075, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/r2plus1d_r34_3d_32x2x1_180e_kinetics400_rgb/'
_base_ = [
'../../_base_/models/r2plus1d_r34.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
total_epochs = 180
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/'
find_unused_parameters = False
_base_ = ['./r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py']
# model settings
model = dict(backbone=dict(act_cfg=dict(type='ReLU')))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD', lr=0.2, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
# runtime settings
work_dir = './work_dirs/r2plus1d_r34_video_3d_8x8x1_180e_kinetics400_rgb/'
_base_ = ['../../_base_/models/r2plus1d_r34.py']
# model settings
model = dict(backbone=dict(act_cfg=dict(type='ReLU')))
# dataset settings
dataset_type = 'VideoDataset'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=1,
workers_per_gpu=2,
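    # inference-only config: ann_file and data_prefix are intentionally left
    # None and should be supplied when running inference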
test=dict(
type=dataset_type,
ann_file=None,
data_prefix=None,
pipeline=test_pipeline))
# SlowFast
[SlowFast Networks for Video Recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143044111-94676f64-7ba8-4081-9011-f8054bed7030.png" width="800"/>
</div>
## Results and Models
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :-------------------------------------------------------------------------------------------------------------------------------------------- | :------------: | :--: | :------------------: | :------: | :------: | :------: | :----------------------: | :--------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 74.75 | 91.73 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json) |
| [slowfast_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | None | 73.95 | 91.50 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json) |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.0 | 92.54 | 1.6 ((32+4)x10x3 frames) | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20210722-04e43ed4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log.json) |
| [slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.34 | 92.67 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb_20210722-bb725050.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) | short-side 320 | 8x3 | ResNet50 | None | 76.94 | 92.8 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.34 | 92.61 | | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.07 | 92.21 | x | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json) |
| [slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.58 | 92.85 | | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_r101_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet101 + ResNet50 | None | 76.69 | 93.07 | | 16628 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json) |
| [slowfast_r101_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet101 | None | 77.90 | 93.51 | | 25994 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json) |
| [slowfast_r152_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet152 + ResNet50 | None | 77.13 | 93.20 | | 10077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json) |
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------ | :--------: | :--: | :------: | :---------: | :------: | :------: | :---------------------: | :--------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowfast_r50_16x8x1_22e_sthv1_rgb](/configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | Kinetics400 | 49.67 | 79.00 | x | 9293 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. Note that the provided configs are written for 8 GPUs by default.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may need to set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch after this note).
2. The **inference_time** is measured with this [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and counting only the model inference time, excluding IO and pre-processing. For each setting, we use 1 GPU with a batch size (videos per GPU) of 1 to compute the inference time.
3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
:::
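To make the scaling rule concrete, here is a minimal sketch; the `scale_lr` helper is illustrative, not part of MMAction2:
```python
def scale_lr(base_lr, base_videos, new_videos):
    """Scale the learning rate linearly with the total videos per step."""
    return base_lr * new_videos / base_videos

# From the note above: lr=0.01 at 4 GPUs x 2 video/gpu scales to
# 16 GPUs x 4 video/gpu as follows.
print(scale_lr(0.01, 4 * 2, 16 * 4))  # -> 0.08
```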
For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the SlowFast model on the Kinetics-400 dataset with a deterministic setting and periodic validation.
```shell
python tools/train.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
--work-dir work_dirs/slowfast_r50_4x16x1_256e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the SlowFast model on the Kinetics-400 dataset and dump the result to a JSON file; a conceptual sketch of the score fusion follows the command.
```shell
python tools/test.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
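Conceptually, `--average-clips=prob` turns each test view's logits into class probabilities and averages them over all views of a video before prediction; below is a minimal sketch of that fusion (the function and variable names are ours, not the MMAction2 API):
```python
import torch

def average_clips_prob(view_logits):
    """Fuse per-view logits ([num_views, num_classes]) of one video.

    With the test pipeline of these configs, num_views = 10 clips x ThreeCrop = 30.
    """
    return torch.softmax(view_logits, dim=1).mean(dim=0)

fused = average_clips_prob(torch.randn(30, 400))  # Kinetics-400 has 400 classes
print(fused.argmax().item())
```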
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{feichtenhofer2019slowfast,
title={Slowfast networks for video recognition},
author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
booktitle={Proceedings of the IEEE international conference on computer vision},
pages={6202--6211},
year={2019}
}
```
# SlowFast
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{feichtenhofer2019slowfast,
title={Slowfast networks for video recognition},
author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
booktitle={Proceedings of the IEEE international conference on computer vision},
pages={6202--6211},
year={2019}
}
```
## Model Zoo
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------------------------------------------------- | :------------: | :--: | :------------------: | :------: | :------: | :------: | :----------------------: | :--------: | :----: | :---: | :----: |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 74.75 | 91.73 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json) |
| [slowfast_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | None | 73.95 | 91.50 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json) |
| [slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.0 | 92.54 | 1.6 ((32+4)x10x3 frames) | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20210722-04e43ed4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_20210722.log.json) |
| [slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.34 | 92.67 | x | 6203 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb_20210722-bb725050.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb/slowfast_prebn_r50_4x16x1_20210722.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) | short-side 320 | 8x3 | ResNet50 | None | 76.94 | 92.8 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json) |
| [slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.34 | 92.61 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | None | 76.07 | 92.21 | x | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json) |
| [slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr](/configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py) | short-side 320 | 8x4 | ResNet50 | None | 76.58 | 92.85 | 1.3 ((32+8)x10x3 frames) | 9062 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json) |
| [slowfast_r101_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet101 + ResNet50 | None | 76.69 | 93.07 | | 16628 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json) |
| [slowfast_r101_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet101 | None | 77.90 | 93.51 | | 25994 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json) |
| [slowfast_r152_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x1 | ResNet152 + ResNet50 | None | 77.13 | 93.20 | | 10077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json) |
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------ | :--------: | :--: | :------: | :---------: | :------: | :------: | :---------------------: | :--------: | :----: | :---: | :----: |
| [slowfast_r50_16x8x1_22e_sthv1_rgb](/configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | Kinetics400 | 49.67 | 79.00 | x | 9293 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json) |
Note:
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are written for training with 8 GPUs.
   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), the learning rate should be scaled proportionally to the batch size when a different number of GPUs or videos per GPU is used,
   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. The **inference time** is measured with this [benchmark script](/tools/analysis/benchmark.py), using the frame-sampling strategy of the test setting and counting only the model inference time,
   excluding IO and pre-processing. For each setting, MMAction2 uses 1 GPU with a batch size (videos per GPU) of 1 to compute the inference time.
3. The Kinetics400 validation set we use contains 19796 videos, which can be downloaded from [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line: video ID, number of frames, label index) and [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (label index to class name) are also available.
For details on data preparation, refer to the Kinetics400 section in [Data Preparation](/docs/zh_cn/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the SlowFast model on the Kinetics-400 dataset with a deterministic setting and periodic validation.
```shell
python tools/train.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
--work-dir work_dirs/slowfast_r50_4x16x1_256e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more training details, refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the SlowFast model on the Kinetics-400 dataset and dump the result to a JSON file.
```shell
python tools/test.py configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json --average-clips=prob
```
For more testing details, refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
Collections:
- Name: SlowFast
README: configs/recognition/slowfast/README.md
Paper:
URL: https://arxiv.org/abs/1812.03982
Title: SlowFast Networks for Video Recognition
Models:
- Config: configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.75
Top 5 Accuracy: 91.73
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200731_151706.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth
- Config: configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r50_video_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.95
Top 5 Accuracy: 91.50
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/20200812_160237.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth
- Config: configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 36441296896
Parameters: 34479288
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: slowfast_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 75.64
Top 5 Accuracy: 92.3
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth
- Config: configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: slowfast_r50_8x8x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.94
Top 5 Accuracy: 92.8
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth
- Config: configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.34
Top 5 Accuracy: 92.61
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_r50_8x8x1_256e_kinetics400_rgb_steplr-43988bac.pth
- Config: configs/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.07
Top 5 Accuracy: 92.21
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb-f82bd304.pth
- Config: configs/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 66222034944
Parameters: 34565560
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.58
Top 5 Accuracy: 92.85
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr/slowfast_prebn_r50_8x8x1_256e_kinetics400_rgb_steplr-28474e54.pth
- Config: configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet101 + ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 65042780160
Parameters: 62384312
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r101_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.69
Top 5 Accuracy: 93.07
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth
- Config: configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet101
Batch Size: 8
Epochs: 256
FLOPs: 127070375936
Parameters: 62912312
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: slowfast_r101_8x8x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 77.9
Top 5 Accuracy: 93.51
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth
- Config: configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet152 + ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 91515654144
Parameters: 84843704
Pretrained: None
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r152_r50_4x16x1_256e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 77.13
Top 5 Accuracy: 93.2
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth
- Config: configs/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb.py
In Collection: SlowFast
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 22
FLOPs: 132442627584
Parameters: 34044630
Pretrained: Kinetics400
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: slowfast_r50_16x8x1_22e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 49.67
Top 5 Accuracy: 79.00
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_16x8x1_22e_sthv1_rgb/slowfast_r50_16x8x1_22e_sthv1_rgb_20211202-aaaf9279.pth
model = dict(
type='Recognizer3D',
backbone=dict(
type='ResNet3dSlowFast',
pretrained=None,
resample_rate=4, # tau
speed_ratio=4, # alpha
channel_ratio=8, # beta_inv
slow_pathway=dict(
type='resnet3d',
depth=50,
pretrained=None,
lateral=True,
lateral_norm=True,
fusion_kernel=7,
conv1_kernel=(1, 7, 7),
dilations=(1, 1, 1, 1),
conv1_stride_t=1,
pool1_stride_t=1,
inflate=(0, 0, 1, 1),
norm_eval=False),
fast_pathway=dict(
type='resnet3d',
depth=50,
pretrained=None,
lateral=False,
base_channels=8,
conv1_kernel=(5, 7, 7),
conv1_stride_t=1,
pool1_stride_t=1,
norm_eval=False)),
cls_head=dict(
type='SlowFastHead',
in_channels=2304, # 2048+256
num_classes=400,
spatial_type='avg',
dropout_ratio=0.5),
# model training and testing settings
train_cfg=None,
test_cfg=dict(average_clips='prob'))
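# NOTE (comment added for clarity, not in the original config): with the
# 32-frame clips sampled in the pipelines below, resample_rate=4 gives the
# slow pathway 32 / 4 = 8 frames, while speed_ratio=4 gives the fast pathway
# 8 * 4 = 32 frames; channel_ratio=8 makes the fast pathway 64 / 8 = 8
# channels wide (hence base_channels=8), so the head input is
# 2048 + 2048 / 8 = 2304 channels.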
train_cfg = None
test_cfg = dict(average_clips='prob')
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='SGD', lr=0.1, momentum=0.9,
    weight_decay=0.0001)  # lr=0.1 is for 8 GPUs; scale to 0.2 for 16 GPUs
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
lr_config = dict(policy='step', step=[94, 154, 196])
total_epochs = 239
evaluation = dict(
interval=3, metrics=['top_k_accuracy', 'mean_class_accuracy'])
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
checkpoint_config = dict(interval=3)
workflow = [('train', 1)]
find_unused_parameters = False
multigrid = dict(
long_cycle=True,
short_cycle=True,
epoch_factor=1.5,
long_cycle_factors=[[0.25, 0.7071], [0.5, 0.7071], [0.5, 1], [1, 1]],
short_cycle_factors=[0.5, 0.7071],
default_s=(224, 224),
)
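# NOTE (comment added for clarity, not in the original config): with
# epoch_factor=1.5 the multigrid schedule stretches total_epochs=239 to about
# 239 * 1.5 ≈ 358 effective epochs, matching the '358e' in the work_dir below.
# Long cycles jointly rescale batch size and input shape via
# long_cycle_factors; short cycles rescale the spatial size between iterations
# via short_cycle_factors.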
precise_bn = dict(num_iters=200, interval=3)
load_from = None
resume_from = None
work_dir = './work_dirs/slowfast_multigrid_r50_8x8x1_358e_kinetics400_rgb'
_base_ = [
'../../_base_/models/slowfast_r50.py', '../../_base_/default_runtime.py'
]
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
min_lr=0,
warmup='linear',
warmup_by_epoch=True,
warmup_iters=34)
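# NOTE (comment added for clarity): with warmup_by_epoch=True, warmup_iters is
# counted in epochs, i.e. the learning rate ramps up linearly over the first
# 34 epochs before cosine annealing takes over.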
total_epochs = 256
# precise bn
precise_bn = dict(num_iters=200, interval=1)
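# NOTE (comment added for clarity): the precise-BN hook re-estimates BatchNorm
# statistics over 200 training iterations after every epoch (interval=1),
# before evaluation.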
# runtime settings
checkpoint_config = dict(interval=4)
work_dir = './work_dirs/slowfast_prebn_r50_4x16x1_256e_kinetics400_rgb'
find_unused_parameters = False