Commit 5b3e36dc authored by Sugon_ldc

add model TSM
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
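# Note (explanatory assumption): the feature bank covers a window_size-second
# window around each clip and samples up to max_num_sampled_feat person
# features per second, i.e. at most window_size * max_num_sampled_feat (= 300)
# long-term features of lfb_channels dimensions per query.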
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(type='avg')),
bbox_head=dict(in_channels=4096)))
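# Note (explanatory assumption): with the 'avg' fusion, the pooled 2048-d LFB
# feature is concatenated with the 2048-d RoI feature, which is why
# bbox_head.in_channels is 4096 here.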
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is done without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
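# Reuse the val split and pipeline for testing.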
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(type='max')),
bbox_head=dict(in_channels=4096)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is done without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(
type='non_local',
st_feat_channels=2048,
lt_feat_channels=lfb_channels,
latent_channels=512,
num_st_feat=1,
num_lt_feat=window_size * max_num_sampled_feat,
num_non_local_layers=2,
st_feat_dropout_ratio=0.2,
lt_feat_dropout_ratio=0.2,
pre_activate=True)),
bbox_head=dict(in_channels=2560)))
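# Note (explanatory assumption): the non-local FBO outputs latent_channels=512
# features, which are concatenated with the 2048-d RoI feature, which is why
# bbox_head.in_channels is 2560 here.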
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is done without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
# This config is used to generate long-term feature bank.
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
dataset_mode = 'train' # ['train', 'val', 'test']
model = dict(
roi_head=dict(
shared_head=dict(
type='LFBInferHead',
lfb_prefix_path=lfb_prefix_path,
dataset_mode=dataset_mode,
use_half_precision=True)))
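# Assumed usage (not part of this config): run the standard test script with
# this config, e.g.
#   python tools/test.py <this_config.py> <slowonly_checkpoint.pth>
# to dump half-precision long-term features into lfb_prefix_path, then switch
# dataset_mode to 'val' and run again so both splits required by the FBOHead
# configs are covered.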
# dataset settings
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv'
exclude_file_infer = (
f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv')
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_infer = (
f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl')
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
infer_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=1,
workers_per_gpu=2,
test=dict(
type=dataset_type,
ann_file=ann_file_infer,
exclude_file=exclude_file_infer,
pipeline=infer_pipeline,
label_file=label_file,
proposal_file=proposal_file_infer,
person_det_score_thr=0.9,
data_prefix=data_root))
dist_params = dict(backend='nccl')
Collections:
- Name: LFB
README: configs/detection/lfb/README.md
Paper:
URL: https://arxiv.org/abs/1812.05038
Title: Long-Term Feature Banks for Detailed Video Understanding
Models:
- Config: configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 24.11
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth
- Config: configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 20.17
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth
- Config: configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 22.15
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth
# BMN
[BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal action proposal generation is a challenging and promising task which aims to locate temporal regions in real-world videos where actions or events may occur. Current bottom-up proposal generation methods can generate proposals with precise boundaries, but cannot efficiently generate adequately reliable confidence scores for retrieving proposals. To address these difficulties, we introduce the Boundary-Matching (BM) mechanism to evaluate confidence scores of densely distributed proposals, which denotes a proposal as a matching pair of starting and ending boundaries and combines all densely distributed BM pairs into the BM confidence map. Based on the BM mechanism, we propose an effective, efficient and end-to-end proposal generation method, named Boundary-Matching Network (BMN), which generates proposals with precise temporal boundaries as well as reliable confidence scores simultaneously. The two branches of BMN are jointly trained in a unified framework. We conduct experiments on two challenging datasets: THUMOS-14 and ActivityNet-1.3, where BMN shows significant performance improvement with remarkable efficiency and generalizability. Further, combined with existing action classifiers, BMN can achieve state-of-the-art temporal action detection performance.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016479-2ca7e8b6-a17b-4a4c-b4c9-ae731935cd91.png" width="800"/>
</div>
## Results and Models
### ActivityNet feature
| config | feature | gpus | AR@100 | AUC | AP@0.5 | AP@0.75 | AP@0.95 | mAP | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :-----------------------------------------------------------------------------------------------------------: | :------------: | :--: | :----: | :---: | :----: | :-----: | :-----: | :---: | :--------: | ------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| [bmn_400x100_9e_2x8_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100 | 2 | 75.28 | 67.22 | 42.47 | 31.31 | 9.92 | 30.34 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json) |
| | mmaction_video | 2 | 75.43 | 67.22 | 42.62 | 31.56 | 10.86 | 30.77 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
| | mmaction_clip | 2 | 75.35 | 67.38 | 43.08 | 32.19 | 10.73 | 31.15 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json) |
| [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\* | cuhk_mean_100 | - | 75.27 | 67.49 | 42.22 | 30.98 | 9.22 | 30.00 | - | - | - | - | - |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch below the notes).
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet feature extracted with [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk), while `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using a video-level or a clip-level ActivityNet-finetuned model, respectively.
3. We evaluate the action detection performance of BMN using the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for the ActivityNet 2017 Untrimmed Video Classification Track to assign a label to each action proposal.
:::
\*We train BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network) and evaluate its proposal generation and action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assignment.
For more details on data preparation, you can refer to the ActivityNet feature section in [Data Preparation](/docs/en/data_preparation.md).
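As a quick sketch of the scaling arithmetic from note 1 (the helper below is purely illustrative and not part of MMAction2):
```python
def scaled_lr(base_lr, base_batch, gpus, videos_per_gpu):
    """Linear Scaling Rule: scale the learning rate with the effective batch size."""
    return base_lr * (gpus * videos_per_gpu) / base_batch

# Reference point from the note: lr=0.01 at 4 GPUs x 2 videos/gpu (batch size 8).
print(scaled_lr(0.01, base_batch=8, gpus=16, videos_per_gpu=4))  # -> 0.08
```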
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the BMN model on the ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
```
For more details and optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test BMN on the ActivityNet feature dataset.
```shell
# Note: If evaluation is needed, please make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
You can also test the action detection performance of the model with the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) prediction file and the generated proposal file (`results.json` from the last command).
```shell
python tools/analysis/report_map.py --proposal path/to/proposal_file
```
:::{note}
1. (Optional) You can use the following command to generate a formatted proposal file, which will be fed into the action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
:::
For more details and optional arguments, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{lin2019bmn,
title={Bmn: Boundary-matching network for temporal action proposal generation},
author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
pages={3889--3898},
year={2019}
}
```
<!-- [DATASET] -->
```BibTeX
@article{zhao2017cuhk,
title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
journal={arXiv preprint arXiv:1710.08011},
volume={8},
year={2017}
}
```
# BMN
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{lin2019bmn,
title={Bmn: Boundary-matching network for temporal action proposal generation},
author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
pages={3889--3898},
year={2019}
}
```
<!-- [DATASET] -->
```BibTeX
@article{zhao2017cuhk,
title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
journal={arXiv preprint arXiv:1710.08011},
volume={8},
year={2017}
}
```
## Model Zoo
### ActivityNet feature
| config | feature | gpus | AR@100 | AUC | AP@0.5 | AP@0.75 | AP@0.95 | mAP | gpu_mem(M) | inference time (s) | ckpt | log | json |
| :-----------------------------------------------------------------------------------------------------------: | :------------: | :------: | :----: | :---: | :----: | :-----: | :-----: | :---: | :--------------: | ------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| [bmn_400x100_9e_2x8_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100 | 2 | 75.28 | 67.22 | 42.47 | 31.31 | 9.92 | 30.34 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json) |
| | mmaction_video | 2 | 75.43 | 67.22 | 42.62 | 31.56 | 10.86 | 30.77 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
| | mmaction_clip | 2 | 75.35 | 67.38 | 43.08 | 32.19 | 10.73 | 31.15 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json) |
| [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\* | cuhk_mean_100 | - | 75.27 | 67.49 | 42.22 | 30.98 | 9.22 | 30.00 | - | - | - | - | - |
Note:
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 correspond to training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate in proportion to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet feature extracted with the [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) codebase,
while `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using a video-level or a clip-level ActivityNet-finetuned model, respectively.
3. MMAction2 uses the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for the ActivityNet 2017 Untrimmed Video Classification Track to assign a label to each temporal action proposal for BMN evaluation.
\*MMAction2 trains BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network) and evaluates its proposal generation and temporal action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assignment.
For details on data preparation, you can refer to the ActivityNet feature section in [Data Preparation](/docs/zh_cn/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train BMN on the ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
```
For more training details, you can refer to the **Training setting** section in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the BMN model on the ActivityNet features dataset.
```shell
# Note: If evaluation is needed, please make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
You can also evaluate the temporal action detection performance of the model with the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) prediction file and the generated proposal file (`results.json` from the command above).
```shell
python tools/analysis/report_map.py --proposal path/to/proposal_file
```
Note:
1. (Optional) You can use the following command to generate a formatted proposal file, which can be fed into an action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
For more testing details, you can refer to the **Test a dataset** section in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
_base_ = [
'../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature']),
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
# optimizer
optimizer = dict(
type='Adam', lr=0.001, weight_decay=0.0001) # this lr is used for 2 gpus
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 9
# runtime settings
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
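# The dumped results.json can be passed to tools/analysis/report_map.py
# (see the BMN README above) to report temporal action detection mAP.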
_base_ = [
'../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_mmaction_clip/'
data_root_val = 'data/ActivityNet/activitynet_feature_mmaction_clip/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature']),
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
# optimizer
optimizer = dict(
type='Adam', lr=0.001, weight_decay=0.0001) # this lr is used for 2 gpus
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 9
# runtime settings
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
_base_ = [
'../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_mmaction_video/'
data_root_val = 'data/ActivityNet/activitynet_feature_mmaction_video/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature']),
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
# optimizer
optimizer = dict(
type='Adam', lr=0.001, weight_decay=0.0001) # this lr is used for 2 gpus
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 9
# runtime settings
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
Collections:
- Name: BMN
README: configs/localization/bmn/README.md
Paper:
URL: https://arxiv.org/abs/1907.09702
Title: "BMN: Boundary-Matching Network for Temporal Action Proposal Generation"
Models:
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: cuhk_mean_100
Name: bmn_400x100_2x8_9e_activitynet_feature
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 42.47
AP@0.75: 31.31
AP@0.95: 9.92
AR@100: 75.28
AUC: 67.22
mAP: 30.34
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature_mmaction_video.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: mmaction_video
Name: bmn_400x100_2x8_9e_activitynet_feature_mmaction_video
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 42.62
AP@0.75: 31.56
AP@0.95: 10.86
AR@100: 75.43
AUC: 67.22
mAP: 30.77
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature_mmaction_clip.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: mmaction_clip
Name: bmn_400x100_2x8_9e_activitynet_feature_mmaction_clip
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 43.08
AP@0.75: 32.19
AP@0.95: 10.73
AR@100: 75.35
AUC: 67.38
mAP: 31.15
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth
# BSN
[BSN: Boundary Sensitive Network for Temporal Action Proposal Generation](https://openaccess.thecvf.com/content_ECCV_2018/html/Tianwei_Lin_BSN_Boundary_Sensitive_ECCV_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal action proposal generation is an important yet challenging problem, since temporal proposals with rich action content are indispensable for analysing real-world videos with long duration and a high proportion of irrelevant content. This problem requires methods that not only generate proposals with precise temporal boundaries, but also retrieve proposals that cover ground-truth action instances with high recall and high overlap using relatively fewer proposals. To address these difficulties, we introduce an effective proposal generation method, named Boundary-Sensitive Network (BSN), which adopts a "local to global" fashion. Locally, BSN first locates temporal boundaries with high probabilities, then directly combines these boundaries as proposals. Globally, with the Boundary-Sensitive Proposal feature, BSN retrieves proposals by evaluating the confidence of whether a proposal contains an action within its region. We conduct experiments on two challenging datasets: ActivityNet-1.3 and THUMOS14, where BSN outperforms other state-of-the-art temporal action proposal generation methods with high recall and high temporal precision. Finally, further experiments demonstrate that by combining existing action classifiers, our method significantly improves the state-of-the-art temporal action detection performance.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016692-69efafbd-cec6-47f1-af45-371d0ff78a97.png" width="800"/>
</div>
## Results and Models
### ActivityNet feature
| config | feature | gpus | pretrain | AR@100 | AUC | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :--------------------------------------- | :------------: | :--: | :------: | :----: | :---: | :-------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100 | 1 | None | 74.66 | 66.45 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
| | mmaction_video | 1 | None | 74.93 | 66.74 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json) |
| | mmaction_clip | 1 | None | 75.19 | 66.81 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet feature extracted with [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk), while `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using a video-level or a clip-level ActivityNet-finetuned model, respectively.
:::
For more details on data preparation, you can refer to the ActivityNet feature section in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following commands to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Examples:
1. Train BSN(TEM) on the ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
```
2. Train BSN(PEM) on the PGM results.
```shell
python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
```
For more details and optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Inference
You can use the following commands to run inference with a model.
1. For TEM Inference
```shell
# Note: This step cannot be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. For PGM Inference
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. For PEM Inference
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Run BSN(TEM) inference with a pretrained model.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Run BSN(PGM) inference with a pretrained model.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
```
3. Run BSN(PEM) inference with the evaluation metric 'AR@AN' and output the results.
```shell
# Note: If evaluation is needed, please make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
## Test
You can use the following commands to test a model.
1. TEM
```shell
# Note: This step cannot be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. PGM
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. PEM
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Test a TEM model on the ActivityNet dataset.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Test a PGM model on the ActivityNet dataset.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
```
3. Test a PEM model with the evaluation metric 'AR@AN' and output the results.
```shell
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
:::{note}
1. (Optional) You can use the following command to generate a formatted proposal file, which will be fed into the action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
:::
For more details and optional arguments, you can refer to the **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{lin2018bsn,
title={Bsn: Boundary sensitive network for temporal action proposal generation},
author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
pages={3--19},
year={2018}
}
```
# BSN
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{lin2018bsn,
title={Bsn: Boundary sensitive network for temporal action proposal generation},
author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
pages={3--19},
year={2018}
}
```
## Model Zoo
### ActivityNet feature
| config | feature | gpus | pretrain | AR@100 | AUC | gpu_mem(M) | iter time (s) | ckpt | log | json |
| :--------------------------------------- | :------------: | :------: | :----: | :----: | :---: | :--------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100 | 1 | None | 74.66 | 66.45 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
| | mmaction_video | 1 | None | 74.93 | 66.74 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json) |
| | mmaction_clip | 1 | None | 75.19 | 66.81 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json) |
Note:
1. The **gpus** indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are written for training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate in proportion to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (a short sketch of this arithmetic follows these notes).
2. For the **feature** column, `cuhk_mean_100` denotes the widely-used CUHK ActivityNet features extracted with the [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) codebase,
while `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using a video-level and a clip-level ActivityNet-pretrained model respectively.
For details on data preparation, you can refer to the ActivityNet feature section in [Data Preparation](/docs/zh_cn/data_preparation.md).
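To make the scaling arithmetic in note 1 concrete, here is a minimal sketch; the numbers are the illustrative ones quoted above, not values taken from any particular config.
```shell
# Linear scaling rule: new_lr = base_lr * new_total_batch / base_total_batch
# base: lr=0.01 at 4 GPUs x 2 videos/GPU (batch 8); target: 16 GPUs x 4 videos/GPU (batch 64)
python -c "print(0.01 * (16 * 4) / (4 * 2))"  # -> 0.08
```
The resulting value then replaces `lr` in the `optimizer` dict of the config you train with.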
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Examples:
1. train BSN(TEM) model on ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
```
2. train BSN(PEM) model on PGM results.
```shell
python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Inference
You can use the following commands to inference a model; a consolidated end-to-end sketch follows the examples below.
1. For TEM Inference
```shell
# Note: This could not be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. For PGM Inference
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. For PEM Inference
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Inference BSN(TEM) with pretrained model.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Inference BSN(PGM) with pretrained model.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
```
3. Inference BSN(PEM) with evaluation metric 'AR@AN' and output the results.
```shell
# Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
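For reference, the three stages can be chained as below on the test split. This is only a sketch: `SOME_TEM_CHECKPOINT.pth` and `SOME_PEM_CHECKPOINT.pth` are placeholders for your own trained weights, and the output directories are those set in the configs above.
```shell
# 1. TEM inference: writes boundary probabilities to the tem_results dir set in the TEM config.
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_TEM_CHECKPOINT.pth
# 2. PGM: turns TEM outputs into candidate proposals and BSP features (no checkpoint needed).
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
# 3. PEM: scores the proposals, evaluates AR@AN and dumps results.json.
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_PEM_CHECKPOINT.pth --eval AR@AN --out results.json
```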
## Test
You can use the following commands to test a model.
1. TEM
```shell
# Note: This could not be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. PGM
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. PEM
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Test a TEM model on ActivityNet dataset.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Test a PGM model on ActivityNet dataset.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
```
3. Test a PEM model with evaluation metric 'AR@AN' and output the results.
```shell
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
Note:
1. (Optional) You can use the following command to generate a formatted proposal file, which can be fed into an action classifier (currently only SSN and P-GCN are supported; TSN, I3D, etc. are not) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
_base_ = [
'../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
pgm_features_dir = f'{work_dir}/pgm_features/'
test_pipeline = [
dict(
type='LoadProposals',
top_k=1000,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['bsp_feature'])
]
train_pipeline = [
dict(
type='LoadProposals',
top_k=500,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'reference_temporal_iou'],
meta_name='video_meta',
meta_keys=[]),
dict(type='ToTensor', keys=['bsp_feature', 'reference_temporal_iou']),
dict(
type='ToDataContainer',
fields=(dict(key='bsp_feature', stack=False),
dict(key='reference_temporal_iou', stack=False)))
]
val_pipeline = [
dict(
type='LoadProposals',
top_k=1000,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['bsp_feature'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
# runtime settings
checkpoint_config = dict(interval=1, filename_tmpl='pem_epoch_{}.pth')
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_test.json'
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
tem_results_dir = f'{work_dir}/tem_results/'
pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
pgm_features_dir = f'{work_dir}/pgm_features/'
temporal_scale = 100
pgm_proposals_cfg = dict(
pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5)
pgm_features_test_cfg = dict(
pgm_features_thread=4,
top_k=1000,
num_sample_start=8,
num_sample_end=8,
num_sample_action=16,
num_sample_interp=3,
bsp_boundary_ratio=0.2)
pgm_features_train_cfg = dict(
pgm_features_thread=4,
top_k=500,
num_sample_start=8,
num_sample_end=8,
num_sample_action=16,
num_sample_interp=3,
bsp_boundary_ratio=0.2)
_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py']
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_full.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature'])
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
# optimizer
optimizer = dict(
    type='Adam', lr=0.001, weight_decay=0.0001)  # this lr is used for 1 gpu
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 20
# runtime settings
checkpoint_config = dict(interval=1, filename_tmpl='tem_epoch_{}.pth')
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
workflow = [('train', 1), ('val', 1)]
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
tem_results_dir = f'{work_dir}/tem_results/'
output_config = dict(out=tem_results_dir, output_format='csv')
Collections:
- Name: BSN
README: configs/localization/bsn/README.md
Paper:
URL: https://arxiv.org/abs/1806.02964
Title: "BSN: Boundary Sensitive Network for Temporal Action Proposal Generation"
Models:
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
    Training Resources: 1 GPU
feature: cuhk_mean_100
Name: bsn_400x100_1x16_20e_activitynet_feature (cuhk_mean_100)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 74.66
AUC: 66.45
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
    Training Resources: 1 GPU
feature: mmaction_video
Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_video)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 74.93
AUC: 66.74
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
    Training Resources: 1 GPU
feature: mmaction_clip
Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_clip)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 75.19
AUC: 66.81
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth
# SSN
[Temporal Action Detection With Structured Segment Networks](https://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Temporal_Action_Detection_ICCV_2017_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Detecting actions in untrimmed videos is an important yet challenging task. In this paper, we present the structured segment network (SSN), a novel framework which models the temporal structure of each action instance via a structured temporal pyramid. On top of the pyramid, we further introduce a decomposed discriminative model comprising two classifiers, respectively for classifying actions and determining completeness. This allows the framework to effectively distinguish positive proposals from background or incomplete ones, thus leading to both accurate recognition and localization. These components are integrated into a unified network that can be efficiently trained in an end-to-end fashion. Additionally, a simple yet effective temporal action proposal scheme, dubbed temporal actionness grouping (TAG) is devised to generate high quality action proposals. On two challenging benchmarks, THUMOS14 and ActivityNet, our method remarkably outperforms previous state-of-the-art methods, demonstrating superior accuracy and strong adaptivity in handling actions with various temporal structures.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016899-017893d3-a907-4487-90a2-cb884088266c.png" width="800"/>
</div>
## Results and Models
| config | gpus | backbone | pretrain | mAP@0.3 | mAP@0.4 | mAP@0.5 | reference mAP@0.3 | reference mAP@0.4 | reference mAP@0.5 | gpu_mem(M) | ckpt | log | json | reference ckpt | reference json |
| :---------------------------------------------------------------------------------------: | :--: | :------: | :------: | :-----: | :-----: | :-----: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :--------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
| [ssn_r50_450e_thumos14_rgb](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) | 8 | ResNet50 | ImageNet | 29.37 | 22.15 | 15.69 | [27.61](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [21.28](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [14.57](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | 6352 | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/ssn_r50_450e_thumos14_rgb_20201012-1920ab16.pth) | [log](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log) | [json](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log.json) | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/ssn_r50_450e_thumos14_rgb_ref_20201014-b6f48f68.pth) | [json](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/20201008_103258.log.json) |
:::{note}
1. The **gpus** indicates the number of GPUs we used to get the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. Since SSN utilizes different structured temporal pyramid pooling methods at training and testing, please refer to [ssn_r50_450e_thumos14_rgb_train](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) at training and [ssn_r50_450e_thumos14_rgb_test](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py) at testing.
3. We evaluate the action detection performance of SSN, using action proposals of TAG. For more details on data preparation, you can refer to thumos14 TAG proposals in [Data Preparation](/docs/en/data_preparation.md).
4. The reference SSN is evaluated with the `ResNet50` backbone in MMAction, which is the same backbone as ours. Note that the original setting of MMAction SSN uses the `BNInception` backbone.
:::
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train SSN model on thumos14 dataset.
```shell
python tools/train.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py
```
For more details and optional arguments infos, you can refer to **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test SSN model on THUMOS14 dataset.
```shell
# Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
python tools/test.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py checkpoints/SOME_CHECKPOINT.pth --eval mAP
```
For more details and optional arguments infos, you can refer to **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@InProceedings{Zhao_2017_ICCV,
author = {Zhao, Yue and Xiong, Yuanjun and Wang, Limin and Wu, Zhirong and Tang, Xiaoou and Lin, Dahua},
title = {Temporal Action Detection With Structured Segment Networks},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
month = {Oct},
year = {2017}
}
```
# SSN
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@InProceedings{Zhao_2017_ICCV,
author = {Zhao, Yue and Xiong, Yuanjun and Wang, Limin and Wu, Zhirong and Tang, Xiaoou and Lin, Dahua},
title = {Temporal Action Detection With Structured Segment Networks},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
month = {Oct},
year = {2017}
}
```
## Model Zoo
| config | gpus | backbone | pretrain | mAP@0.3 | mAP@0.4 | mAP@0.5 | reference mAP@0.3 | reference mAP@0.4 | reference mAP@0.5 | gpu_mem(M) | ckpt | log | json | reference ckpt | reference json |
| :---------------------------------------------------------------------------------------: | :------: | :------: | :------: | :-----: | :-----: | :-----: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :--------------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
| [ssn_r50_450e_thumos14_rgb](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) | 8 | ResNet50 | ImageNet | 29.37 | 22.15 | 15.69 | [27.61](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [21.28](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [14.57](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | 6352 | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/ssn_r50_450e_thumos14_rgb_20201012-1920ab16.pth) | [log](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log) | [json](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log.json) | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/ssn_r50_450e_thumos14_rgb_ref_20201014-b6f48f68.pth) | [json](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/20201008_103258.log.json) |
Note:
1. The **gpus** indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 are written for training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate in proportion to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. Since SSN uses different structured temporal pyramid pooling methods at training and testing, please refer to [ssn_r50_450e_thumos14_rgb_train](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) for training and [ssn_r50_450e_thumos14_rgb_test](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py) for testing.
3. MMAction2 evaluates the action detection performance of SSN using TAG action proposals. For more details on data preparation, you can refer to [Data Preparation](/docs/zh_cn/data_preparation.md) to prepare the THUMOS14 TAG proposals.
4. The reference SSN is evaluated with the `ResNet50` backbone in MMAction, which is the same backbone as ours. Note that the original setting of MMAction SSN uses the `BNInception` backbone.
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train SSN model on THUMOS14 dataset.
```shell
python tools/train.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/zh_cn/getting_started.md#训练配置).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test SSN model on THUMOS14 dataset.
```shell
# Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
python tools/test.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py checkpoints/SOME_CHECKPOINT.pth --eval mAP
```
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/zh_cn/getting_started.md#测试某个数据集).
Collections:
- Name: SSN
README: configs/localization/ssn/README.md
Paper:
URL: https://arxiv.org/abs/1704.06228
Title: Temporal Action Detection with Structured Segment Networks
Models:
- Config: configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py
In Collection: SSN
Metadata:
Architecture: ResNet50
Pretrained: ImageNet
Training Data: THUMOS 14
Training Resources: 8 GPUs
Name: ssn_r50_450e_thumos14_rgb
Results:
- Dataset: THUMOS 14
Metrics:
mAP@0.3: 29.37
mAP@0.4: 22.15
mAP@0.5: 15.69
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log.json
Training Log: https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log
Weights: https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/ssn_r50_450e_thumos14_rgb_20201012-1920ab16.pth
reference mAP@0.3: '[27.61](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started)'
reference mAP@0.4: '[21.28](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started)'
reference mAP@0.5: '[14.57](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started)'
reference ckpt: '[ckpt](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/ssn_r50_450e_thumos14_rgb_ref_20201014-b6f48f68.pth)'
reference json: '[json](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/20201008_103258.log.json)'