Initial add code.

aa58d024 · unknown · aa58d024 · aa58d024 · aa58d024 · aa58d024
Commit aa58d024 authored Mar 20, 2023 by unknown
20 changed files
--- a/configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py
+++ b/configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py
+# model setting: a 'FastRCNN' detector with a ResNet3dSlowOnly (depth 101) backbone and an AVARoIHead
+model = dict(
+    type='FastRCNN',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=101,
+        pretrained=None,  # no backbone-level weights; a checkpoint URL is given via `load_from` in this config
+        pretrained2d=False,
+        lateral=False,
+        num_stages=4,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        spatial_strides=(1, 2, 2, 1)),
+    roi_head=dict(
+        type='AVARoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor3D',
+            roi_layer_type='RoIAlign',
+            output_size=8,
+            with_temporal_pool=True),
+        bbox_head=dict(
+            type='BBoxHeadAVA',
+            in_channels=2048,
+            num_classes=81,  # NOTE(review): presumably 80 AVA actions + 1 background slot -- confirm
+            multilabel=True,
+            dropout_ratio=0.5)),
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssignerAVA',
+                pos_iou_thr=0.9,  # all three assigner IoU thresholds are set to 0.9
+                neg_iou_thr=0.9,
+                min_pos_iou=0.9),
+            sampler=dict(
+                type='RandomSampler',
+                num=32,
+                pos_fraction=1,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=1.0,
+            debug=False)),
+    test_cfg=dict(rcnn=dict(action_thr=0.002)))  # NOTE(review): presumably the score threshold for reported actions at test time -- confirm
+
+dataset_type = 'AVADataset'  # dataset type used by both data['train'] and data['val'] in this config
+data_root = 'data/ava/rawframes'  # passed as `data_prefix` to the datasets
+anno_root = 'data/ava/annotations'  # common prefix of every annotation / proposal path built in this section
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'  # shared by the train and val datasets
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+                       'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)  # unpacked into the 'Normalize' pipeline steps
+
+train_pipeline = [  # transform list passed as `pipeline` to data['train']
+    dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),  # 8 frames at interval 8, matching "8x8" in the config name
+    dict(type='RawFrameDecode'),
+    dict(type='RandomRescale', scale_range=(256, 320)),
+    dict(type='RandomCrop', size=256),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img': needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+    dict(
+        type='ToDataContainer',
+        fields=[
+            dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
+        ]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
+        meta_keys=['scores', 'entity_ids'])
+]
+# Validation / testing pipeline: no random cropping or flipping, only a fixed Resize; also reused for data['test']
+val_pipeline = [
+    dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),  # same clip_len / frame_interval as training
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img': needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),  # no gt_bboxes / gt_labels keys here, unlike the training pipeline
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape'],
+        nested=True)
+]
+data = dict(  # dataloader and dataset settings for the train / val / test splits
+    videos_per_gpu=6,
+    workers_per_gpu=2,
+    # During testing, each video may have a different shape, so the val / test loaders use 1 video per GPU
+    val_dataloader=dict(videos_per_gpu=1),
+    test_dataloader=dict(videos_per_gpu=1),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        exclude_file=exclude_file_train,
+        pipeline=train_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_train,
+        person_det_score_thr=0.9,
+        data_prefix=data_root),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        exclude_file=exclude_file_val,
+        pipeline=val_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_val,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+data['test'] = data['val']  # the test split reuses the validation dataset settings
+
+optimizer = dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001)
+# the lr above is used for 8 gpus (this config sets videos_per_gpu=6); NOTE(review): rescale it if the total batch size changes
+
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy: 'step' schedule with milestones at epochs 10 and 15, linear warmup counted in epochs (5)
+
+lr_config = dict(
+    policy='step',
+    step=[10, 15],
+    warmup='linear',
+    warmup_by_epoch=True,
+    warmup_iters=5,
+    warmup_ratio=0.1)
+total_epochs = 20
+checkpoint_config = dict(interval=1)
+workflow = [('train', 1)]
+evaluation = dict(interval=1, save_best='mAP@0.5IOU')  # NOTE(review): presumably keeps the checkpoint with the best mAP@0.5IOU -- confirm
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = ('./work_dirs/ava/'
+            'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb')
+load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'  # SlowOnly-R101 OmniSource Kinetics-400 recognition checkpoint, per the file name
+             'omni/'
+             'slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth')
+
+resume_from = None
+find_unused_parameters = False
--- a/configs/detection/ava/slowonly_omnisource_pretrained_r50_4x16x1_20e_ava_rgb.py
+++ b/configs/detection/ava/slowonly_omnisource_pretrained_r50_4x16x1_20e_ava_rgb.py
+# model setting: a 'FastRCNN' detector with a ResNet3dSlowOnly (depth 50) backbone and an AVARoIHead
+model = dict(
+    type='FastRCNN',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,  # no backbone-level weights; a checkpoint URL is given via `load_from` in this config
+        pretrained2d=False,
+        lateral=False,
+        num_stages=4,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        spatial_strides=(1, 2, 2, 1)),
+    roi_head=dict(
+        type='AVARoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor3D',
+            roi_layer_type='RoIAlign',
+            output_size=8,
+            with_temporal_pool=True),
+        bbox_head=dict(
+            type='BBoxHeadAVA',
+            in_channels=2048,
+            num_classes=81,  # NOTE(review): presumably 80 AVA actions + 1 background slot -- confirm
+            multilabel=True,
+            dropout_ratio=0.5)),
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssignerAVA',
+                pos_iou_thr=0.9,  # all three assigner IoU thresholds are set to 0.9
+                neg_iou_thr=0.9,
+                min_pos_iou=0.9),
+            sampler=dict(
+                type='RandomSampler',
+                num=32,
+                pos_fraction=1,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=1.0,
+            debug=False)),
+    test_cfg=dict(rcnn=dict(action_thr=0.002)))  # NOTE(review): presumably the score threshold for reported actions at test time -- confirm
+
+dataset_type = 'AVADataset'  # dataset type used by both data['train'] and data['val'] in this config
+data_root = 'data/ava/rawframes'  # passed as `data_prefix` to the datasets
+anno_root = 'data/ava/annotations'  # common prefix of every annotation / proposal path built in this section
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'  # shared by the train and val datasets
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+                       'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)  # unpacked into the 'Normalize' pipeline steps
+
+train_pipeline = [  # transform list passed as `pipeline` to data['train']
+    dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),  # 4 frames at interval 16, matching "4x16" in the config name
+    dict(type='RawFrameDecode'),
+    dict(type='RandomRescale', scale_range=(256, 320)),
+    dict(type='RandomCrop', size=256),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img': needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+    dict(
+        type='ToDataContainer',
+        fields=[
+            dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
+        ]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
+        meta_keys=['scores', 'entity_ids'])
+]
+# Validation / testing pipeline: no random cropping or flipping, only a fixed Resize; also reused for data['test']
+val_pipeline = [
+    dict(
+        type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),  # same clip_len / frame_interval as training
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img': needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),  # no gt_bboxes / gt_labels keys here, unlike the training pipeline
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape'],
+        nested=True)
+]
+
+data = dict(  # dataloader and dataset settings for the train / val / test splits
+    videos_per_gpu=16,
+    workers_per_gpu=2,
+    # During testing, each video may have a different shape, so the val / test loaders use 1 video per GPU
+    val_dataloader=dict(videos_per_gpu=1),
+    test_dataloader=dict(videos_per_gpu=1),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        exclude_file=exclude_file_train,
+        pipeline=train_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_train,
+        person_det_score_thr=0.9,
+        data_prefix=data_root),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        exclude_file=exclude_file_val,
+        pipeline=val_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_val,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+data['test'] = data['val']  # the test split reuses the validation dataset settings
+
+optimizer = dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001)
+# the lr above is used for 8 gpus (this config sets videos_per_gpu=16); NOTE(review): rescale it if the total batch size changes
+
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy: 'step' schedule with milestones at epochs 10 and 15, linear warmup counted in epochs (5)
+
+lr_config = dict(
+    policy='step',
+    step=[10, 15],
+    warmup='linear',
+    warmup_by_epoch=True,
+    warmup_iters=5,
+    warmup_ratio=0.1)
+total_epochs = 20
+checkpoint_config = dict(interval=1)
+workflow = [('train', 1)]
+evaluation = dict(interval=1, save_best='mAP@0.5IOU')  # NOTE(review): presumably keeps the checkpoint with the best mAP@0.5IOU -- confirm
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = ('./work_dirs/ava/'
+            'slowonly_omnisource_pretrained_r50_4x16x1_20e_ava_rgb')
+load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'  # SlowOnly-R50 OmniSource Kinetics-400 recognition checkpoint, per the file name
+             'omni/'
+             'slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth')
+resume_from = None
+find_unused_parameters = False
--- a/configs/detection/lfb/README.md
+++ b/configs/detection/lfb/README.md
+# LFB
+
+[Long-term feature banks for detailed video understanding](https://openaccess.thecvf.com/content_CVPR_2019/html/Wu_Long-Term_Feature_Banks_for_Detailed_Video_Understanding_CVPR_2019_paper.html)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+To understand the world, we humans constantly need to relate the present to the past, and put events in context. In this paper, we enable existing video models to do the same. We propose a long-term feature bank---supportive information extracted over the entire span of a video---to augment state-of-the-art video models that otherwise would only view short clips of 2-5 seconds. Our experiments demonstrate that augmenting 3D convolutional networks with a long-term feature bank yields state-of-the-art results on three challenging video datasets: AVA, EPIC-Kitchens, and Charades.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34324155/143016220-21d90fb3-fd9f-499c-820f-f6c421bda7aa.png" width="800"/>
+</div>
+
+## Results and Models
+
+### AVA2.1
+
+|                                                                          Model                                                                          | Modality |  Pretrained  |                                               Backbone                                               | Input | gpus |   Resolution   |  mAP  |                                                                     log                                                                      |                                                                        json                                                                        |                                                                                                    ckpt                                                                                                     |
+| :-----------------------------------------------------------------------------------------------------------------------------------------------------: | :------: | :----------: | :--------------------------------------------------------------------------------------------------: | :---: | :--: | :------------: | :---: | :------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  [lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py)  |   RGB    | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16  |  8   | short-side 256 | 24.11 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log)  | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json)  |  [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth)  |
+| [lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) |   RGB    | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16  |  8   | short-side 256 | 20.17 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth) |
+| [lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) |   RGB    | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16  |  8   | short-side 256 | 22.15 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth) |
+
+:::{note}
+
+1. The **gpus** column indicates the number of GPUs we used to get the checkpoint.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
+2. We use `slowonly_r50_4x16x1` instead of `I3D-R50-NL` in the original paper as the backbone of LFB, but we have achieved a similar improvement (ours: 20.1 -> 24.11 vs. author: 22.1 -> 25.8).
+3. Because the long-term features are randomly sampled in testing, the test accuracy may have some differences.
+4. Before training or testing LFB, you need to infer the feature bank with [lfb_slowonly_r50_ava_infer.py](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py). For more details on inferring the feature bank, you can refer to the [Train](#Train) part.
+5. You can also download the long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put it in `lfb_prefix_path`.
+6. The ROIHead now supports single-label classification (i.e. the network outputs at most
+   one label per actor). This can be done by (a) setting `multilabel=False` during training and
+   (b) setting `test_cfg.rcnn.action_thr` for testing.
+
+:::
+
+## Train
+
+### a. Infer long-term feature bank for training
+
+Before training or testing LFB, you need to infer the long-term feature bank first.
+
+Specifically, run the test on the training, validation and testing datasets with the config file [lfb_slowonly_r50_ava_infer](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py), and the shared head [LFBInferHead](/mmaction/models/heads/lfb_infer_head.py) will generate the feature bank. By default, the config file only infers the feature bank of the training dataset; you need to set `dataset_mode = 'val'` in the config file to infer the feature bank of the validation dataset.
+
+A long-term feature bank file of AVA training and validation datasets with float32 precision occupies 3.3 GB. If the features are stored with float16 precision, the feature bank occupies 1.65 GB.
+
+You can use the following command to infer feature bank of AVA training and validation dataset and the feature bank will be stored in `lfb_prefix_path/lfb_train.pkl` and `lfb_prefix_path/lfb_val.pkl`.
+
+```shell
+# set `dataset_mode = 'train'` in lfb_slowonly_r50_ava_infer.py
+python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+
+# set `dataset_mode = 'val'` in lfb_slowonly_r50_ava_infer.py
+python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+```
+
+We use [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) to infer feature bank.
+
+### b. Train LFB
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train LFB model on AVA with half-precision long-term feature bank.
+
+```shell
+python tools/train.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
+  --validate --seed 0 --deterministic
+```
+
+For more details and optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
+
+## Test
+
+### a. Infer long-term feature bank for testing
+
+Before training or testing LFB, you also need to infer the long-term feature bank first. If you have already generated the feature bank file, you can skip this step.
+
+This step is the same as the **Infer long-term feature bank for training** part in [Train](#Train).
+
+### b. Test LFB
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test LFB model on AVA with half-precision long-term feature bank and dump the result to a csv file.
+
+```shell
+python tools/test.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --eval mAP --out results.csv
+```
+
+For more details, you can refer to **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
+
+## Citation
+
+<!-- [DATASET] -->
+
+```BibTeX
+@inproceedings{gu2018ava,
+  title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
+  author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={6047--6056},
+  year={2018}
+}
+```
+
+```BibTeX
+@inproceedings{wu2019long,
+  title={Long-term feature banks for detailed video understanding},
+  author={Wu, Chao-Yuan and Feichtenhofer, Christoph and Fan, Haoqi and He, Kaiming and Krahenbuhl, Philipp and Girshick, Ross},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={284--293},
+  year={2019}
+}
+```
--- a/configs/detection/lfb/README_zh-CN.md
+++ b/configs/detection/lfb/README_zh-CN.md
+# LFB
+
+## 简介
+
+<!-- [ALGORITHM] -->
+
+```BibTeX
+@inproceedings{wu2019long,
+  title={Long-term feature banks for detailed video understanding},
+  author={Wu, Chao-Yuan and Feichtenhofer, Christoph and Fan, Haoqi and He, Kaiming and Krahenbuhl, Philipp and Girshick, Ross},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={284--293},
+  year={2019}
+}
+```
+
+## 模型库
+
+### AVA2.1
+
+|                                                                        配置文件                                                                         | 模态 |    预训练    |                                               主干网络                                               | 输入 | GPU 数量 |  分辨率  | 平均精度 |                                                                     log                                                                      |                                                                        json                                                                        |                                                                                                    ckpt                                                                                                     |
+| :-----------------------------------------------------------------------------------------------------------------------------------------------------: | :--: | :----------: | :--------------------------------------------------------------------------------------------------: | :--: | :------: | :------: | :------: | :------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  [lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py)  | RGB  | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 |    8     | 短边 256 |  24.11   | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log)  | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json)  |  [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth)  |
+| [lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB  | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 |    8     | 短边 256 |  20.17   | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth) |
+| [lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB  | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 |    8     | 短边 256 |  22.15   | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth) |
+
+- 注:
+
+1. 这里的 **GPU 数量** 指的是得到模型权重文件对应的 GPU 个数。默认地，MMAction2 所提供的配置文件对应使用 8 块 GPU 进行训练的情况。
+   依据 [线性缩放规则](https://arxiv.org/abs/1706.02677)，当用户使用不同数量的 GPU 或者每块 GPU 处理不同视频个数时，需要根据批大小等比例地调节学习率。
+   如，lr=0.01 对应 4 GPUs x 2 video/gpu，以及 lr=0.08 对应 16 GPUs x 4 video/gpu。
+2. 本 LFB 模型暂没有使用原论文中的 `I3D-R50-NL` 作为主干网络，而是用 `slowonly_r50_4x16x1` 替代，但取得了同样的提升效果：（本模型：20.1 -> 24.11 而原论文模型：22.1 -> 25.8）。
+3. 因为测试时，长时特征是被随机采样的，所以测试精度可能有一些偏差。
+4. 在训练或测试 LFB 之前，用户需要使用配置文件 [lfb_slowonly_r50_ava_infer.py](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py) 来推导长时特征库。有关推导长时特征库的更多细节，请参照[训练部分](#%E8%AE%AD%E7%BB%83)。
+5. 用户也可以直接从 [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) 或者 [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar) 下载 float32 或 float16 的长时特征库，并把它们放在 `lfb_prefix_path` 上。
+
+## 训练
+
+### a. 为训练 LFB 推导长时特征库
+
+在训练或测试 LFB 之前，用户首先需要推导长时特征库。
+
+具体来说，使用配置文件 [lfb_slowonly_r50_ava_infer](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py)，在训练集、验证集、测试集上都运行一次模型测试。
+
+配置文件的默认设置是推导训练集的长时特征库，用户需要将 `dataset_mode` 设置成 `'val'` 来推导验证集的长时特征库。在推导过程中，共享头 [LFBInferHead](/mmaction/models/heads/lfb_infer_head.py) 会生成长时特征库。
+
+AVA 训练集和验证集的 float32 精度的长时特征库文件大约占 3.3 GB。如果以半精度来存储长时特征，文件大约占 1.65 GB。
+
+用户可以使用以下命令来推导 AVA 训练集和验证集的长时特征库，而特征库会被存储为 `lfb_prefix_path/lfb_train.pkl` 和 `lfb_prefix_path/lfb_val.pkl`。
+
+```shell
+# 在 lfb_slowonly_r50_ava_infer.py 中 设置 `dataset_mode = 'train'`
+python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+
+# 在 lfb_slowonly_r50_ava_infer.py 中 设置 `dataset_mode = 'val'`
+python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+```
+
+MMAction2 使用来自配置文件 [slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) 的模型权重文件 [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth)作为推导长时特征库的 LFB 模型的主干网络的预训练模型。
+
+### b. 训练 LFB
+
+用户可以使用以下指令进行模型训练。
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+例如：使用半精度的长时特征库在 AVA 数据集上训练 LFB 模型。
+
+```shell
+python tools/train.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
+  --validate --seed 0 --deterministic
+```
+
+更多训练细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE) 中的 **训练配置** 部分。
+
+## 测试
+
+### a. 为测试 LFB 推导长时特征库
+
+在训练或测试 LFB 之前，用户首先需要推导长时特征库。如果用户之前已经生成了特征库文件，可以跳过这一步。
+
+这一步做法与[训练部分](#%E8%AE%AD%E7%BB%83)中的 **为训练 LFB 推导长时特征库** 相同。
+
+### b. 测试 LFB
+
+用户可以使用以下指令进行模型测试。
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+例如：使用半精度的长时特征库在 AVA 数据集上测试 LFB 模型，并将结果导出为一个 csv 文件。
+
+```shell
+python tools/test.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
+    checkpoints/SOME_CHECKPOINT.pth --eval mAP --out results.csv
+```
+
+更多测试细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86) 中的 **测试某个数据集** 部分。
--- a/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+++ b/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+_base_ = ['../_base_/models/slowonly_r50.py']  # base model config; NOTE(review): presumably the `model` dict in this file is merged over it -- confirm
+
+# model settings: long-term feature bank (LFB) options, forwarded to the FBOHead through `lfb_cfg`
+lfb_prefix_path = 'data/ava/lfb_half'  # feature bank location; the README stores lfb_train.pkl / lfb_val.pkl under `lfb_prefix_path`
+max_num_sampled_feat = 5
+window_size = 60
+lfb_channels = 2048
+dataset_modes = ('train', 'val')
+
+model = dict(
+    roi_head=dict(
+        shared_head=dict(
+            type='FBOHead',
+            lfb_cfg=dict(
+                lfb_prefix_path=lfb_prefix_path,
+                max_num_sampled_feat=max_num_sampled_feat,
+                window_size=window_size,
+                lfb_channels=lfb_channels,
+                dataset_modes=dataset_modes,
+                device='gpu'),
+            fbo_cfg=dict(type='avg')),  # 'avg' variant, matching "lfb_avg" in the config name
+        bbox_head=dict(in_channels=4096)))  # NOTE(review): presumably 2048 backbone + 2048 LFB channels -- confirm
+
+dataset_type = 'AVADataset'  # dataset type used by both data['train'] and data['val'] in this config
+data_root = 'data/ava/rawframes'  # passed as `data_prefix` to the datasets
+anno_root = 'data/ava/annotations'  # common prefix of every annotation / proposal path built in this section
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'  # shared by the train and val datasets
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+                       'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)  # unpacked into the 'Normalize' pipeline steps
+
+train_pipeline = [  # transform list passed as `pipeline` to data['train']
+    dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),  # 4 frames at interval 16, matching "4x16" in the config name
+    dict(type='RawFrameDecode'),
+    dict(type='RandomRescale', scale_range=(256, 320)),
+    dict(type='RandomCrop', size=256),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img': needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+    dict(
+        type='ToDataContainer',
+        fields=[
+            dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
+        ]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
+        meta_keys=['scores', 'entity_ids', 'img_key'])  # NOTE(review): 'img_key' is presumably what the LFB head uses to look up features -- confirm
+]
+# The testing is w/o. any cropping / flipping
+val_pipeline = [
+    dict(
+        type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape', 'img_key'],
+        nested=True)
+]
+
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=2,
+    val_dataloader=dict(videos_per_gpu=1),
+    test_dataloader=dict(videos_per_gpu=1),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        exclude_file=exclude_file_train,
+        pipeline=train_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_train,
+        person_det_score_thr=0.9,
+        data_prefix=data_root),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        exclude_file=exclude_file_val,
+        pipeline=val_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_val,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+data['test'] = data['val']
+evaluation = dict(interval=1, save_best='mAP@0.5IOU')
+
+optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
+# this lr is used for 8 gpus
+
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+
+lr_config = dict(
+    policy='step',
+    step=[10, 15],
+    warmup='linear',
+    warmup_by_epoch=True,
+    warmup_iters=5,
+    warmup_ratio=0.1)
+total_epochs = 20
+
+checkpoint_config = dict(interval=1)
+workflow = [('train', 1)]
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb'  # noqa E501
+load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
+resume_from = None
+find_unused_parameters = False
--- a/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+++ b/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+_base_ = ['../_base_/models/slowonly_r50.py']  # base model config
+
+# model settings: FBOHead shared head with fbo_cfg type 'max'
+lfb_prefix_path = 'data/ava/lfb_half'
+max_num_sampled_feat = 5
+window_size = 60
+lfb_channels = 2048
+dataset_modes = ('train', 'val')
+
+model = dict(
+    roi_head=dict(
+        shared_head=dict(
+            type='FBOHead',
+            lfb_cfg=dict(
+                lfb_prefix_path=lfb_prefix_path,
+                max_num_sampled_feat=max_num_sampled_feat,
+                window_size=window_size,
+                lfb_channels=lfb_channels,
+                dataset_modes=dataset_modes,
+                device='gpu'),
+            fbo_cfg=dict(type='max')),
+        bbox_head=dict(in_channels=4096)))  # 2048 + lfb_channels? (confirm)
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+                       'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+
+train_pipeline = [
+    dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
+    dict(type='RawFrameDecode'),
+    dict(type='RandomRescale', scale_range=(256, 320)),
+    dict(type='RandomCrop', size=256),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img'; this is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+    dict(
+        type='ToDataContainer',
+        fields=[
+            dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
+        ]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
+        meta_keys=['scores', 'entity_ids', 'img_key'])
+]
+# The val/test pipeline has no random rescaling, cropping or flipping
+val_pipeline = [
+    dict(
+        type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img'; this is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape', 'img_key'],
+        nested=True)
+]
+
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=2,
+    val_dataloader=dict(videos_per_gpu=1),
+    test_dataloader=dict(videos_per_gpu=1),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        exclude_file=exclude_file_train,
+        pipeline=train_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_train,
+        person_det_score_thr=0.9,
+        data_prefix=data_root),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        exclude_file=exclude_file_val,
+        pipeline=val_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_val,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+data['test'] = data['val']  # test reuses the val dataset config
+evaluation = dict(interval=1, save_best='mAP@0.5IOU')
+
+optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
+# the lr above (0.15) is the value used for 8 gpus
+
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy (lr_config below)
+
+lr_config = dict(
+    policy='step',
+    step=[10, 15],
+    warmup='linear',
+    warmup_by_epoch=True,
+    warmup_iters=5,  # presumably epochs (warmup_by_epoch=True); confirm
+    warmup_ratio=0.1)
+total_epochs = 20
+
+checkpoint_config = dict(interval=1)
+workflow = [('train', 1)]
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb'  # noqa E501
+load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
+resume_from = None
+find_unused_parameters = False
--- a/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+++ b/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+_base_ = ['../_base_/models/slowonly_r50.py']  # base model config
+
+# model settings: FBOHead shared head with fbo_cfg type 'non_local'
+lfb_prefix_path = 'data/ava/lfb_half'
+max_num_sampled_feat = 5
+window_size = 60
+lfb_channels = 2048
+dataset_modes = ('train', 'val')
+
+model = dict(
+    roi_head=dict(
+        shared_head=dict(
+            type='FBOHead',
+            lfb_cfg=dict(
+                lfb_prefix_path=lfb_prefix_path,
+                max_num_sampled_feat=max_num_sampled_feat,
+                window_size=window_size,
+                lfb_channels=lfb_channels,
+                dataset_modes=dataset_modes,
+                device='gpu'),
+            fbo_cfg=dict(
+                type='non_local',
+                st_feat_channels=2048,
+                lt_feat_channels=lfb_channels,
+                latent_channels=512,
+                num_st_feat=1,
+                num_lt_feat=window_size * max_num_sampled_feat,  # 60 * 5
+                num_non_local_layers=2,
+                st_feat_dropout_ratio=0.2,
+                lt_feat_dropout_ratio=0.2,
+                pre_activate=True)),
+        bbox_head=dict(in_channels=2560)))  # 2048 + latent_channels? (confirm)
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+                       'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+
+train_pipeline = [
+    dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
+    dict(type='RawFrameDecode'),
+    dict(type='RandomRescale', scale_range=(256, 320)),
+    dict(type='RandomCrop', size=256),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img'; this is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+    dict(
+        type='ToDataContainer',
+        fields=[
+            dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
+        ]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
+        meta_keys=['scores', 'entity_ids', 'img_key'])
+]
+# The val/test pipeline has no random rescaling, cropping or flipping
+val_pipeline = [
+    dict(
+        type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img'; this is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape', 'img_key'],
+        nested=True)
+]
+
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=2,
+    val_dataloader=dict(videos_per_gpu=1),
+    test_dataloader=dict(videos_per_gpu=1),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        exclude_file=exclude_file_train,
+        pipeline=train_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_train,
+        person_det_score_thr=0.9,
+        data_prefix=data_root),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        exclude_file=exclude_file_val,
+        pipeline=val_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_val,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+data['test'] = data['val']  # test reuses the val dataset config
+evaluation = dict(interval=1, save_best='mAP@0.5IOU')
+
+optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
+# the lr above (0.15) is the value used for 8 gpus
+
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy (lr_config below)
+
+lr_config = dict(
+    policy='step',
+    step=[10, 15],
+    warmup='linear',
+    warmup_by_epoch=True,
+    warmup_iters=5,  # presumably epochs (warmup_by_epoch=True); confirm
+    warmup_ratio=0.1)
+total_epochs = 20
+
+checkpoint_config = dict(interval=1)
+workflow = [('train', 1)]
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb'  # noqa E501
+load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
+             'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
+resume_from = None
+find_unused_parameters = False
--- a/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py
+++ b/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py
+# This config is used to generate the long-term feature bank (LFB).
+_base_ = ['../_base_/models/slowonly_r50.py']  # base model config
+
+# model settings: LFBInferHead shared head, configured per dataset_mode
+lfb_prefix_path = 'data/ava/lfb_half'
+dataset_mode = 'train'  # one of ['train', 'val', 'test']
+
+model = dict(
+    roi_head=dict(
+        shared_head=dict(
+            type='LFBInferHead',
+            lfb_prefix_path=lfb_prefix_path,
+            dataset_mode=dataset_mode,
+            use_half_precision=True)))
+
+# dataset settings (annotation/proposal file names embed dataset_mode)
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv'
+
+exclude_file_infer = (
+    f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv')
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_infer = (
+    f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl')
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+
+infer_pipeline = [
+    dict(
+        type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW', collapse=True),
+    # Rename 'imgs' to 'img'; this is needed to use mmdet detectors
+    dict(type='Rename', mapping=dict(imgs='img')),
+    dict(type='ToTensor', keys=['img', 'proposals']),
+    dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
+    dict(
+        type='Collect',
+        keys=['img', 'proposals'],
+        meta_keys=['scores', 'img_shape', 'img_key'],
+        nested=True)
+]
+
+data = dict(
+    videos_per_gpu=1,
+    workers_per_gpu=2,
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_infer,
+        exclude_file=exclude_file_infer,
+        pipeline=infer_pipeline,
+        label_file=label_file,
+        proposal_file=proposal_file_infer,
+        person_det_score_thr=0.9,
+        data_prefix=data_root))
+
+dist_params = dict(backend='nccl')
--- a/configs/detection/lfb/metafile.yml
+++ b/configs/detection/lfb/metafile.yml
+Collections:
+- Name: LFB
+  README: configs/detection/lfb/README.md
+  Paper:
+    URL: https://arxiv.org/abs/1812.05038
+    Title: Long-Term Feature Banks for Detailed Video Understanding
+Models:
+- Config: configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  In Collection: LFB
+  Metadata:
+    Architecture: ResNet50
+    Batch Size: 12
+    Epochs: 20
+    Input: 4x16
+    Pretrained: Kinetics-400
+    Resolution: short-side 256
+    Training Data: AVA v2.1
+    Training Resources: 8 GPUs
+  Modality: RGB
+  Name: lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  Results:
+  - Dataset: AVA v2.1
+    Metrics:
+      mAP: 24.11
+    Task: Spatial Temporal Action Detection
+  Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json
+  Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log
+  Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth
+- Config: configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  In Collection: LFB
+  Metadata:
+    Architecture: ResNet50
+    Batch Size: 12
+    Epochs: 20
+    Input: 4x16
+    Pretrained: Kinetics-400
+    Resolution: short-side 256
+    Training Data: AVA v2.1
+    Training Resources: 8 GPUs
+  Modality: RGB
+  Name: lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  Results:
+  - Dataset: AVA v2.1
+    Metrics:
+      mAP: 20.17
+    Task: Spatial Temporal Action Detection
+  Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
+  Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
+  Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth
+- Config: configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  In Collection: LFB
+  Metadata:
+    Architecture: ResNet50
+    Batch Size: 12
+    Epochs: 20
+    Input: 4x16
+    Pretrained: Kinetics-400
+    Resolution: short-side 256
+    Training Data: AVA v2.1
+    Training Resources: 8 GPUs
+  Modality: RGB
+  Name: lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
+  Results:
+  - Dataset: AVA v2.1
+    Metrics:
+      mAP: 22.15
+    Task: Spatial Temporal Action Detection
+  Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
+  Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
+  Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth
--- a/configs/localization/bmn/README.md
+++ b/configs/localization/bmn/README.md
+# BMN
+
+[Bmn: Boundary-matching network for temporal action proposal generation](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+Temporal action proposal generation is an challenging and promising task which aims to locate temporal regions in real-world videos where action or event may occur. Current bottom-up proposal generation methods can generate proposals with precise boundary, but cannot efficiently generate adequately reliable confidence scores for retrieving proposals. To address these difficulties, we introduce the Boundary-Matching (BM) mechanism to evaluate confidence scores of densely distributed proposals, which denote a proposal as a matching pair of starting and ending boundaries and combine all densely distributed BM pairs into the BM confidence map. Based on BM mechanism, we propose an effective, efficient and end-to-end proposal generation method, named Boundary-Matching Network (BMN), which generates proposals with precise temporal boundaries as well as reliable confidence scores simultaneously. The two-branches of BMN are jointly trained in an unified framework. We conduct experiments on two challenging datasets: THUMOS-14 and ActivityNet-1.3, where BMN shows significant performance improvement with remarkable efficiency and generalizability. Further, combining with existing action classifier, BMN can achieve state-of-the-art temporal action detection performance.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34324155/143016479-2ca7e8b6-a17b-4a4c-b4c9-ae731935cd91.png" width="800"/>
+</div>
+
+## Results and Models
+
+### ActivityNet feature
+
+|                                                    config                                                     |    feature     | gpus | AR@100 |  AUC  | AP@0.5 | AP@0.75 | AP@0.95 |  mAP  | gpu_mem(M) | iter time(s) |                                                                             ckpt                                                                             |                                                                       log                                                                        | json                                                                                                                                               |
+| :-----------------------------------------------------------------------------------------------------------: | :------------: | :--: | :----: | :---: | :----: | :-----: | :-----: | :---: | :--------: | ------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [bmn_400x100_2x8_9e_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100  |  2   | 75.28  | 67.22 | 42.47  |  31.31  |  9.92   | 30.34 |    5420    | 3.27         | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) |    [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log)     | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json)    |
+|                                                                                                               | mmaction_video |  2   | 75.43  | 67.22 | 42.62  |  31.56  |  10.86  | 30.77 |    5420    | 3.27         |  [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth)  | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
+|                                                                                                               | mmaction_clip  |  2   | 75.35  | 67.38 | 43.08  |  32.19  |  10.73  | 31.15 |    5420    | 3.27         |   [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth)   |  [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log)  | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json)   |
+|           [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\*            | cuhk_mean_100  |  -   | 75.27  | 67.49 | 42.22  |  30.98  |  9.22   | 30.00 |     -      | -            |                                                                              -                                                                               |                                                                        -                                                                         | -                                                                                                                                                  |
+
+:::{note}
+
+1. The **gpus** column indicates the number of GPUs we used to get the checkpoint.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
+2. In the **feature** column, cuhk_mean_100 denotes the widely used CUHK ActivityNet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk); mmaction_video and mmaction_clip denote features extracted by MMAction with a video-level or a clip-level ActivityNet-finetuned model, respectively.
+3. We evaluate the action detection performance of BMN, using the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for the ActivityNet2017 Untrimmed Video Classification Track to assign a label to each action proposal.
+
+:::
+
+\*We train BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network), evaluate its proposal generation and action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assigning.
+
+For more details on data preparation, you can refer to ActivityNet feature in [Data Preparation](/docs/data_preparation.md).
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train BMN model on ActivityNet features dataset.
+
+```shell
+python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+```
+
+For more details and optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test BMN on ActivityNet feature dataset.
+
+```shell
+# Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
+python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
+```
+
+You can also test the action detection performance of the model, with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) prediction file and generated proposal file (`results.json` in last command).
+
+```shell
+python tools/analysis/report_map.py --proposal path/to/proposal_file
+```
+
+:::{note}
+
+1. (Optional) You can use the following command to generate a formatted proposal file, which will be fed into the action classifier (Currently supports SSN and P-GCN, not including TSN, I3D etc.) to get the classification result of proposals.
+
+   ```shell
+   python tools/data/activitynet/convert_proposal_format.py
+   ```
+
+:::
+
+For more details and optional arguments, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
+
+## Citation
+
+```BibTeX
+@inproceedings{lin2019bmn,
+  title={Bmn: Boundary-matching network for temporal action proposal generation},
+  author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
+  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+  pages={3889--3898},
+  year={2019}
+}
+```
+
+<!-- [DATASET] -->
+
+```BibTeX
+@article{zhao2017cuhk,
+  title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
+  author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
+  journal={arXiv preprint arXiv:1710.08011},
+  volume={8},
+  year={2017}
+}
+```
--- a/configs/localization/bmn/README_zh-CN.md
+++ b/configs/localization/bmn/README_zh-CN.md
+# BMN
+
+## 简介
+
+<!-- [ALGORITHM] -->
+
+```BibTeX
+@inproceedings{lin2019bmn,
+  title={Bmn: Boundary-matching network for temporal action proposal generation},
+  author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
+  booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+  pages={3889--3898},
+  year={2019}
+}
+```
+
+<!-- [DATASET] -->
+
+```BibTeX
+@article{zhao2017cuhk,
+  title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
+  author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
+  journal={arXiv preprint arXiv:1710.08011},
+  volume={8},
+  year={2017}
+}
+```
+
+## 模型库
+
+### ActivityNet feature
+
+|                                                   配置文件                                                    |      特征      | GPU 数量 | AR@100 |  AUC  | AP@0.5 | AP@0.75 | AP@0.95 |  mAP  | GPU 显存占用 (M) | 推理时间 (s) |                                                                             ckpt                                                                             |                                                                       log                                                                        | json                                                                                                                                               |
+| :-----------------------------------------------------------------------------------------------------------: | :------------: | :------: | :----: | :---: | :----: | :-----: | :-----: | :---: | :--------------: | ------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [bmn_400x100_2x8_9e_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100  |    2     | 75.28  | 67.22 | 42.47  |  31.31  |  9.92   | 30.34 |       5420       | 3.27         | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) |    [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log)     | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json)    |
+|                                                                                                               | mmaction_video |    2     | 75.43  | 67.22 | 42.62  |  31.56  |  10.86  | 30.77 |       5420       | 3.27         |  [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth)  | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
+|                                                                                                               | mmaction_clip  |    2     | 75.35  | 67.38 | 43.08  |  32.19  |  10.73  | 31.15 |       5420       | 3.27         |   [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth)   |  [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log)  | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json)   |
+|           [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\*            | cuhk_mean_100  |    -     | 75.27  | 67.49 | 42.22  |  30.98  |  9.22   | 30.00 |        -         | -            |                                                                              -                                                                               |                                                                        -                                                                         | -                                                                                                                                                  |
+
+- 注：
+
+1. 这里的 **GPU 数量** 指的是得到模型权重文件对应的 GPU 个数。默认地，MMAction2 所提供的配置文件对应使用 8 块 GPU 进行训练的情况。
+   依据 [线性缩放规则](https://arxiv.org/abs/1706.02677)，当用户使用不同数量的 GPU 或者每块 GPU 处理不同视频个数时，需要根据批大小等比例地调节学习率。
+   如，lr=0.01 对应 4 GPUs x 2 video/gpu，以及 lr=0.08 对应 16 GPUs x 4 video/gpu。
+2. 对于 **特征** 这一列，`cuhk_mean_100` 表示所使用的特征为利用 [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) 代码库抽取的，被广泛利用的 CUHK ActivityNet 特征，
+   `mmaction_video` 和 `mmaction_clip` 分别表示所使用的特征为利用 MMAction 抽取的视频级别 ActivityNet 预训练模型的特征和视频片段级别 ActivityNet 预训练模型的特征。
+3. MMAction2 使用 ActivityNet2017 未剪辑视频分类赛道上 [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) 所提交的结果来为每个视频的时序动作候选指定标签，以用于 BMN 模型评估。
+
+\*MMAction2 在 [原始代码库](https://github.com/JJBOY/BMN-Boundary-Matching-Network) 上训练 BMN，并且在 [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) 的对应标签上评估时序动作候选生成和时序检测的结果。
+
+对于数据集准备的细节，用户可参考 [数据集准备文档](/docs_zh_CN/data_preparation.md) 中的 ActivityNet 特征部分。
+
+## 如何训练
+
+用户可以使用以下指令进行模型训练。
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+例如：在 ActivityNet 特征上训练 BMN。
+
+```shell
+python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+```
+
+更多训练细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE) 中的 **训练配置** 部分。
+
+## 如何测试
+
+用户可以使用以下指令进行模型测试。
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+例如：在 ActivityNet 特征上测试 BMN 模型。
+
+```shell
+# 注：如果需要进行指标验证，需确保测试数据的标注文件包含真实标签
+python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
+```
+
+用户也可以利用 [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) 的预测文件以及已生成的时序动作候选文件（即上一条命令中的 `results.json`）来评估模型时序检测的结果。
+
+```shell
+python tools/analysis/report_map.py --proposal path/to/proposal_file
+```
+
+注：
+
+1. (可选项) 用户可以使用以下指令生成格式化的时序动作候选文件，该文件可被送入动作识别器中（目前只支持 SSN 和 P-GCN，不包括 TSN, I3D 等），以获得时序动作候选的分类结果。
+
+   ```shell
+   python tools/data/activitynet/convert_proposal_format.py
+   ```
+
+更多测试细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86) 中的 **测试某个数据集** 部分。
--- a/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+++ b/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+_base_ = [  # base configs: BMN model definition and default runtime
+    '../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_val.json'  # same file as val
+
+test_pipeline = [  # used by data['test']; has no label-generation step
+    dict(type='LoadLocalizationFeature'),
+    dict(
+        type='Collect',
+        keys=['raw_feature'],
+        meta_name='video_meta',
+        meta_keys=[
+            'video_name', 'duration_second', 'duration_frame', 'annotations',
+            'feature_frame'
+        ]),
+    dict(type='ToTensor', keys=['raw_feature']),
+]
+train_pipeline = [  # used by data['train']
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='Collect',
+        keys=['raw_feature', 'gt_bbox'],
+        meta_name='video_meta',
+        meta_keys=['video_name']),  # only the video name is kept as meta
+    dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
+    dict(
+        type='ToDataContainer',
+        fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
+]
+val_pipeline = [  # train_pipeline steps with test_pipeline's meta_keys
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='Collect',
+        keys=['raw_feature', 'gt_bbox'],
+        meta_name='video_meta',
+        meta_keys=[
+            'video_name', 'duration_second', 'duration_frame', 'annotations',
+            'feature_frame'
+        ]),
+    dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
+    dict(
+        type='ToDataContainer',
+        fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
+]
+data = dict(  # dataloader options plus the test/val/train dataset specs
+    videos_per_gpu=8,
+    workers_per_gpu=8,
+    train_dataloader=dict(drop_last=True),
+    val_dataloader=dict(videos_per_gpu=1),  # presumably overrides 8; confirm
+    test_dataloader=dict(videos_per_gpu=1),  # presumably overrides 8; confirm
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        pipeline=test_pipeline,
+        data_prefix=data_root_val),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        pipeline=val_pipeline,
+        data_prefix=data_root_val),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        pipeline=train_pipeline,
+        data_prefix=data_root))
+evaluation = dict(interval=1, metrics=['AR@AN'])
+
+# optimizer
+optimizer = dict(
+    type='Adam', lr=0.001, weight_decay=0.0001)  # this lr is used for 2 gpus
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(policy='step', step=7)  # presumably decays at epoch 7; confirm
+total_epochs = 9
+
+# runtime settings
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
+output_config = dict(out=f'{work_dir}/results.json', output_format='json')  # NOTE: work_dir ends with '/', so this path contains '//'
--- a/configs/localization/bmn/metafile.yml
+++ b/configs/localization/bmn/metafile.yml
+Collections:
+- Name: BMN
+  README: configs/localization/bmn/README.md
+  Paper:
+    URL: https://arxiv.org/abs/1907.09702
+    Title: "BMN: Boundary-Matching Network for Temporal Action Proposal Generation"
+Models:
+- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+  In Collection: BMN
+  Metadata:
+    Batch Size: 8
+    Epochs: 9
+    Training Data: ActivityNet v1.3
+    Training Resources: 2 GPUs
+    feature: cuhk_mean_100
+  Name: bmn_400x100_9e_2x8_activitynet_feature (cuhk_mean_100)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AP@0.5: 42.47
+      AP@0.75: 31.31
+      AP@0.95: 9.92
+      AR@100: 75.28
+      AUC: 67.22
+      mAP: 30.34
+    Task: Temporal Action Localization
+  Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json
+  Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log
+  Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth
+- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+  In Collection: BMN
+  Metadata:
+    Batch Size: 8
+    Epochs: 9
+    Training Data: ActivityNet v1.3
+    Training Resources: 2 GPUs
+    feature: mmaction_video
+  Name: bmn_400x100_9e_2x8_activitynet_feature (mmaction_video)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AP@0.5: 42.62
+      AP@0.75: 31.56
+      AP@0.95: 10.86
+      AR@100: 75.43
+      AUC: 67.22
+      mAP: 30.77
+    Task: Temporal Action Localization
+  Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json
+  Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log
+  Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth
+- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
+  In Collection: BMN
+  Metadata:
+    Batch Size: 8
+    Epochs: 9
+    Training Data: ActivityNet v1.3
+    Training Resources: 2 GPUs
+    feature: mmaction_clip
+  Name: bmn_400x100_9e_2x8_activitynet_feature (mmaction_clip)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AP@0.5: 43.08
+      AP@0.75: 32.19
+      AP@0.95: 10.73
+      AR@100: 75.35
+      AUC: 67.38
+      mAP: 31.15
+    Task: Temporal Action Localization
+  Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json
+  Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log
+  Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth
--- a/configs/localization/bsn/README.md
+++ b/configs/localization/bsn/README.md
+# BSN
+
+[Bsn: Boundary sensitive network for temporal action proposal generation](https://openaccess.thecvf.com/content_ECCV_2018/html/Tianwei_Lin_BSN_Boundary_Sensitive_ECCV_2018_paper.html)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+Temporal action proposal generation is an important yet challenging problem, since temporal proposals with rich action content are indispensable for analysing real-world videos with long duration and high proportion irrelevant content. This problem requires methods not only generating proposals with precise temporal boundaries, but also retrieving proposals to cover truth action instances with high recall and high overlap using relatively fewer proposals. To address these difficulties, we introduce an effective proposal generation method, named Boundary-Sensitive Network (BSN), which adopts "local to global" fashion. Locally, BSN first locates temporal boundaries with high probabilities, then directly combines these boundaries as proposals. Globally, with Boundary-Sensitive Proposal feature, BSN retrieves proposals by evaluating the confidence of whether a proposal contains an action within its region. We conduct experiments on two challenging datasets: ActivityNet-1.3 and THUMOS14, where BSN outperforms other state-of-the-art temporal action proposal generation methods with high recall and high temporal precision. Finally, further experiments demonstrate that by combining existing action classifiers, our method significantly improves the state-of-the-art temporal action detection performance.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34324155/143016692-69efafbd-cec6-47f1-af45-371d0ff78a97.png" width="800"/>
+</div>
+
+## Results and Models
+
+### ActivityNet feature
+
+| config                                   |    feature     | gpus | pretrain | AR@100 |  AUC  |   gpu_mem(M)    |     iter time(s)      |                                                                                                                                                                                   ckpt                                                                                                                                                                                    |                                                                                                                                                                 log                                                                                                                                                                 |                                                                                                                                                                       json                                                                                                                                                                       |
+| :--------------------------------------- | :------------: | :--: | :------: | :----: | :---: | :-------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100  |  1   |   None   | 74.66  | 66.45 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json)  [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
+|                                          | mmaction_video |  1   |   None   | 74.93  | 66.74 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) |           [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth)           |  [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log)  |      [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json)       |
+|                                          | mmaction_clip  |  1   |   None   | 75.19  | 66.81 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) |             [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth)             |    [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log)    |        [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json)         |
+
+:::{note}
+
+1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
+2. For feature column, cuhk_mean_100 denotes the widely used cuhk activitynet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk), mmaction_video and mmaction_clip denote feature extracted by mmaction, with video-level activitynet finetuned model or clip-level activitynet finetuned model respectively.
+
+:::
+
+For more details on data preparation, you can refer to ActivityNet feature in [Data Preparation](/docs/data_preparation.md).
+
+## Train
+
+You can use the following commands to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Examples:
+
+1. train BSN(TEM) on ActivityNet features dataset.
+
+   ```shell
+   python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+   ```
+
+2. train BSN(PEM) on PGM results.
+
+   ```shell
+   python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+   ```
+
+For more details and optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
+
+## Inference
+
+You can use the following commands to run inference with a model.
+
+1. For TEM Inference
+
+   ```shell
+   # Note: This could not be evaluated.
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+2. For PGM Inference
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
+   ```
+
+3. For PEM Inference
+
+   ```shell
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+Examples:
+
+1. Inference BSN(TEM) with pretrained model.
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
+   ```
+
+2. Inference BSN(PGM) with pretrained model.
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
+   ```
+
+3. Inference BSN(PEM) with evaluation metric 'AR@AN' and output the results.
+
+   ```shell
+   # Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
+   python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py  checkpoints/SOME_CHECKPOINT.pth  --eval AR@AN --out results.json
+   ```
+
+## Test
+
+You can use the following commands to test a model.
+
+1. TEM
+
+   ```shell
+   # Note: This could not be evaluated.
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+2. PGM
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
+   ```
+
+3. PEM
+
+   ```shell
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+Examples:
+
+1. Test a TEM model on ActivityNet dataset.
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
+   ```
+
+2. Test a PGM model on ActivityNet dataset.
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
+   ```
+
+3. Test a PEM model with evaluation metric 'AR@AN' and output the results.
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
+   ```
+
+:::{note}
+
+1. (Optional) You can use the following command to generate a formatted proposal file, which will be fed into the action classifier (Currently supports only SSN and P-GCN, not including TSN, I3D etc.) to get the classification result of proposals.
+
+   ```shell
+   python tools/data/activitynet/convert_proposal_format.py
+   ```
+
+:::
+
+For more details and optional arguments, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
+
+## Citation
+
+```BibTeX
+@inproceedings{lin2018bsn,
+  title={Bsn: Boundary sensitive network for temporal action proposal generation},
+  author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
+  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+  pages={3--19},
+  year={2018}
+}
+```
--- a/configs/localization/bsn/README_zh-CN.md
+++ b/configs/localization/bsn/README_zh-CN.md
+# BSN
+
+## 简介
+
+<!-- [ALGORITHM] -->
+
+```BibTeX
+@inproceedings{lin2018bsn,
+  title={Bsn: Boundary sensitive network for temporal action proposal generation},
+  author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
+  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+  pages={3--19},
+  year={2018}
+}
+```
+
+## 模型库
+
+### ActivityNet feature
+
+| 配置文件                                 |      特征      | GPU 数量 | 预训练 | AR@100 |  AUC  | GPU 显存占用 (M) |     迭代时间 (s)      |                                                                                                                                                                                   ckpt                                                                                                                                                                                    |                                                                                                                                                                 log                                                                                                                                                                 |                                                                                                                                                                       json                                                                                                                                                                       |
+| :--------------------------------------- | :------------: | :------: | :----: | :----: | :---: | :--------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100  |    1     |  None  | 74.66  | 66.45 | 41(TEM)+25(PEM)  | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json)  [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
+|                                          | mmaction_video |    1     |  None  | 74.93  | 66.74 | 41(TEM)+25(PEM)  | 0.074(TEM)+0.036(PEM) |           [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth)           |  [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log)  |      [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json)       |
+|                                          | mmaction_clip  |    1     |  None  | 75.19  | 66.81 | 41(TEM)+25(PEM)  | 0.074(TEM)+0.036(PEM) |             [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth)             |    [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log)    |        [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json)         |
+
+注：
+
+1. 这里的 **GPU 数量** 指的是得到模型权重文件对应的 GPU 个数。默认地，MMAction2 所提供的配置文件对应使用 8 块 GPU 进行训练的情况。
+   依据 [线性缩放规则](https://arxiv.org/abs/1706.02677)，当用户使用不同数量的 GPU 或者每块 GPU 处理不同视频个数时，需要根据批大小等比例地调节学习率。
+   如，lr=0.01 对应 4 GPUs x 2 video/gpu，以及 lr=0.08 对应 16 GPUs x 4 video/gpu。
+2. 对于 **特征** 这一列，`cuhk_mean_100` 表示所使用的特征为利用 [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) 代码库抽取的，被广泛利用的 CUHK ActivityNet 特征，
+   `mmaction_video` 和 `mmaction_clip` 分别表示所使用的特征为利用 MMAction 抽取的视频级别 ActivityNet 预训练模型的特征和视频片段级别 ActivityNet 预训练模型的特征。
+
+对于数据集准备的细节，用户可参考 [数据集准备文档](/docs_zh_CN/data_preparation.md) 中的 ActivityNet 特征部分。
+
+## 如何训练
+
+用户可以使用以下指令进行模型训练。
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+例如：
+
+1. 在 ActivityNet 特征上训练 BSN(TEM) 模型。
+
+   ```shell
+   python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+   ```
+
+2. 基于 PGM 的结果训练 BSN(PEM)。
+
+   ```shell
+   python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+   ```
+
+更多训练细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE) 中的 **训练配置** 部分。
+
+## 如何进行推理
+
+用户可以使用以下指令进行模型推理。
+
+1. 推理 TEM 模型。
+
+   ```shell
+   # 注：该命令无法进行指标验证
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+2. 推理 PGM 模型
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
+   ```
+
+3. 推理 PEM 模型
+
+   ```shell
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+例如
+
+1. 利用预训练模型进行 BSN(TEM) 模型的推理。
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
+   ```
+
+2. 利用预训练模型进行 BSN(PGM) 模型的推理
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
+   ```
+
+3. 推理 BSN(PEM) 模型，并计算 'AR@AN' 指标，输出结果文件。
+
+   ```shell
+   # 注：如果需要进行指标验证，需确保测试数据的标注文件包含真实标签
+   python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py  checkpoints/SOME_CHECKPOINT.pth  --eval AR@AN --out results.json
+   ```
+
+## 如何测试
+
+用户可以使用以下指令进行模型测试。
+
+1. TEM
+
+   ```shell
+   # 注：该命令无法进行指标验证
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+2. PGM
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
+   ```
+
+3. PEM
+
+   ```shell
+   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+   ```
+
+例如：
+
+1. 在 ActivityNet 数据集上测试 TEM 模型。
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
+   ```
+
+2. 在 ActivityNet 数据集上测试 PGM 模型。
+
+   ```shell
+   python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
+   ```
+
+3. 测试 PEM 模型，并计算 'AR@AN' 指标，输出结果文件。
+
+   ```shell
+   python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
+   ```
+
+注：
+
+1. (可选项) 用户可以使用以下指令生成格式化的时序动作候选文件，该文件可被送入动作识别器中（目前只支持 SSN 和 P-GCN，不包括 TSN, I3D 等），以获得时序动作候选的分类结果。
+
+   ```shell
+   python tools/data/activitynet/convert_proposal_format.py
+   ```
+
+更多测试细节，可参考 [基础教程](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86) 中的 **测试某个数据集** 部分。
--- a/configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+++ b/configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+_base_ = [  # BSN PEM config: settings below extend the listed base files
+    '../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings (train and val read features from the same directory)
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_val.json'  # same file as ann_file_val
+
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'  # NOTE: trailing '/' yields '//' in the f-strings below
+pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
+pgm_features_dir = f'{work_dir}/pgm_features/'
+
+test_pipeline = [  # same steps as val_pipeline below
+    dict(
+        type='LoadProposals',
+        top_k=1000,
+        pgm_proposals_dir=pgm_proposals_dir,
+        pgm_features_dir=pgm_features_dir),
+    dict(
+        type='Collect',
+        keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
+        meta_name='video_meta',
+        meta_keys=[
+            'video_name', 'duration_second', 'duration_frame', 'annotations',
+            'feature_frame'
+        ]),
+    dict(type='ToTensor', keys=['bsp_feature'])  # only bsp_feature is converted; other keys are collected as-is
+]
+
+train_pipeline = [
+    dict(
+        type='LoadProposals',
+        top_k=500,  # val/test pipelines use top_k=1000
+        pgm_proposals_dir=pgm_proposals_dir,
+        pgm_features_dir=pgm_features_dir),
+    dict(
+        type='Collect',
+        keys=['bsp_feature', 'reference_temporal_iou'],
+        meta_name='video_meta',
+        meta_keys=[]),  # no per-video meta keys are collected for training
+    dict(type='ToTensor', keys=['bsp_feature', 'reference_temporal_iou']),
+    dict(
+        type='ToDataContainer',
+        fields=(dict(key='bsp_feature', stack=False),
+                dict(key='reference_temporal_iou', stack=False)))
+]
+
+val_pipeline = [  # same steps as test_pipeline above
+    dict(
+        type='LoadProposals',
+        top_k=1000,
+        pgm_proposals_dir=pgm_proposals_dir,
+        pgm_features_dir=pgm_features_dir),
+    dict(
+        type='Collect',
+        keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
+        meta_name='video_meta',
+        meta_keys=[
+            'video_name', 'duration_second', 'duration_frame', 'annotations',
+            'feature_frame'
+        ]),
+    dict(type='ToTensor', keys=['bsp_feature'])
+]
+data = dict(  # dataset/dataloader settings for the train, val and test splits
+    videos_per_gpu=16,
+    workers_per_gpu=8,
+    train_dataloader=dict(drop_last=True),
+    val_dataloader=dict(videos_per_gpu=1),  # overrides videos_per_gpu=16 for val
+    test_dataloader=dict(videos_per_gpu=1),  # overrides videos_per_gpu=16 for test
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        pipeline=test_pipeline,
+        data_prefix=data_root_val),  # test reads from the val feature directory
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        pipeline=val_pipeline,
+        data_prefix=data_root_val),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        pipeline=train_pipeline,
+        data_prefix=data_root))
+evaluation = dict(interval=1, metrics=['AR@AN'])  # NOTE(review): interval presumably counted in epochs — confirm
+
+# runtime settings
+checkpoint_config = dict(interval=1, filename_tmpl='pem_epoch_{}.pth')
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+output_config = dict(out=f'{work_dir}/results.json', output_format='json')  # path contains '//' because work_dir ends with '/'
--- a/configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
+++ b/configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
+# dataset settings for the BSN proposal generation (PGM) step; this file lists no _base_ configs
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'  # same directory as data_root
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_test.json'
+
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'  # NOTE: trailing '/' yields '//' in the f-strings below
+tem_results_dir = f'{work_dir}/tem_results/'
+pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
+pgm_features_dir = f'{work_dir}/pgm_features/'
+
+temporal_scale = 100  # reused by pgm_proposals_cfg below
+pgm_proposals_cfg = dict(
+    pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5)
+pgm_features_test_cfg = dict(  # identical to pgm_features_train_cfg except top_k
+    pgm_features_thread=4,
+    top_k=1000,
+    num_sample_start=8,
+    num_sample_end=8,
+    num_sample_action=16,
+    num_sample_interp=3,
+    bsp_boundary_ratio=0.2)
+pgm_features_train_cfg = dict(  # identical to pgm_features_test_cfg except top_k
+    pgm_features_thread=4,
+    top_k=500,
+    num_sample_start=8,
+    num_sample_end=8,
+    num_sample_action=16,
+    num_sample_interp=3,
+    bsp_boundary_ratio=0.2)
--- a/configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+++ b/configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py']  # BSN TEM config; no schedule base is listed, see optimizer below
+
+# dataset settings (train and val read features from the same directory)
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_full.json'  # NOTE(review): presumably covers all splits — confirm
+
+test_pipeline = [  # loads features only; no GenerateLocalizationLabels step
+    dict(type='LoadLocalizationFeature'),
+    dict(
+        type='Collect',
+        keys=['raw_feature'],
+        meta_name='video_meta',
+        meta_keys=['video_name']),
+    dict(type='ToTensor', keys=['raw_feature'])
+]
+train_pipeline = [  # same steps as val_pipeline below
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='Collect',
+        keys=['raw_feature', 'gt_bbox'],
+        meta_name='video_meta',
+        meta_keys=['video_name']),
+    dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
+    dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
+]
+val_pipeline = [  # same steps as train_pipeline above
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='Collect',
+        keys=['raw_feature', 'gt_bbox'],
+        meta_name='video_meta',
+        meta_keys=['video_name']),
+    dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
+    dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
+]
+
+data = dict(  # dataset/dataloader settings for the train, val and test splits
+    videos_per_gpu=16,
+    workers_per_gpu=8,
+    train_dataloader=dict(drop_last=True),
+    val_dataloader=dict(videos_per_gpu=1),  # overrides videos_per_gpu=16 for val
+    test_dataloader=dict(videos_per_gpu=1),  # overrides videos_per_gpu=16 for test
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        pipeline=test_pipeline,
+        data_prefix=data_root_val),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        pipeline=val_pipeline,
+        data_prefix=data_root_val),
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        pipeline=train_pipeline,
+        data_prefix=data_root))
+
+# optimizer (defined in this file; _base_ above lists no schedule config)
+optimizer = dict(
+    type='Adam', lr=0.001, weight_decay=0.0001)  # this lr is used for 1 GPU
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(policy='step', step=7)  # NOTE(review): presumably the LR is stepped at epoch 7 — confirm
+total_epochs = 20
+
+# runtime settings
+checkpoint_config = dict(interval=1, filename_tmpl='tem_epoch_{}.pth')
+log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
+workflow = [('train', 1), ('val', 1)]  # NOTE(review): presumably alternates one train and one val epoch — confirm
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'  # NOTE: trailing '/' yields '//' in tem_results_dir
+tem_results_dir = f'{work_dir}/tem_results/'
+output_config = dict(out=tem_results_dir, output_format='csv')  # outputs go to tem_results_dir in csv format
--- a/configs/localization/bsn/metafile.yml
+++ b/configs/localization/bsn/metafile.yml
+Collections:
+- Name: BSN
+  README: configs/localization/bsn/README.md
+  Paper:
+    URL: https://arxiv.org/abs/1806.02964
+    Title: "BSN: Boundary Sensitive Network for Temporal Action Proposal Generation"
+Models:
+- Config:
+  - configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+  - configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
+  - configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+  In Collection: BSN
+  Metadata:
+    Pretrained: None
+    Training Data: ActivityNet v1.3
+    Training Resources: 1 GPUs
+    feature: cuhk_mean_100
+  Name: bsn_400x100_1x16_20e_activitynet_feature (cuhk_mean_100)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AR@100: 74.66
+      AUC: 66.45
+    Task: Temporal Action Localization
+  Training Json Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json
+  Training Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log
+  Weights:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth
+- Config:
+  - configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+  - configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
+  - configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+  In Collection: BSN
+  Metadata:
+    Pretrained: None
+    Training Data: ActivityNet v1.3
+    Training Resources: 1 GPUs
+    feature: mmaction_video
+  Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_video)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AR@100: 74.93
+      AUC: 66.74
+    Task: Temporal Action Localization
+  Training Json Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json
+  Training Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log
+  Weights:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth
+- Config:
+  - configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
+  - configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
+  - configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
+  In Collection: BSN
+  Metadata:
+    Pretrained: None
+    Training Data: ActivityNet v1.3
+    Training Resources: 1 GPUs
+    feature: mmaction_clip
+  Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_clip)
+  Results:
+  - Dataset: ActivityNet v1.3
+    Metrics:
+      AR@100: 75.19
+      AUC: 66.81
+    Task: Temporal Action Localization
+  Training Json Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json
+  Training Log:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log
+  Weights:
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth
+  - https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth
--- a/configs/localization/ssn/README.md
+++ b/configs/localization/ssn/README.md
+# SSN
+
+[Temporal Action Detection With Structured Segment Networks](https://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Temporal_Action_Detection_ICCV_2017_paper.html)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+Detecting actions in untrimmed videos is an important yet challenging task. In this paper, we present the structured segment network (SSN), a novel framework which models the temporal structure of each action instance via a structured temporal pyramid. On top of the pyramid, we further introduce a decomposed discriminative model comprising two classifiers, respectively for classifying actions and determining completeness. This allows the framework to effectively distinguish positive proposals from background or incomplete ones, thus leading to both accurate recognition and localization. These components are integrated into a unified network that can be efficiently trained in an end-to-end fashion. Additionally, a simple yet effective temporal action proposal scheme, dubbed temporal actionness grouping (TAG) is devised to generate high quality action proposals. On two challenging benchmarks, THUMOS14 and ActivityNet, our method remarkably outperforms previous state-of-the-art methods, demonstrating superior accuracy and strong adaptivity in handling actions with various temporal structures.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34324155/143016899-017893d3-a907-4487-90a2-cb884088266c.png" width="800"/>
+</div>
+
+## Results and Models
+
+|                                          config                                           | gpus | backbone | pretrain | mAP@0.3 | mAP@0.4 | mAP@0.5 |                                              reference mAP@0.3                                              |                                              reference mAP@0.4                                              |                                              reference mAP@0.5                                              | gpu_mem(M) |                                                                    ckpt                                                                    |                                                      log                                                      | json                                                                                                                |                                                                            reference ckpt                                                                             |                                                               reference json                                                               |
+| :---------------------------------------------------------------------------------------: | :--: | :------: | :------: | :-----: | :-----: | :-----: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :--------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
+| [ssn_r50_450e_thumos14_rgb](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) |  8   | ResNet50 | ImageNet |  29.37  |  22.15  |  15.69  | [27.61](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [21.28](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [14.57](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) |    6352    | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/ssn_r50_450e_thumos14_rgb_20201012-1920ab16.pth) | [log](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log) | [json](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log.json) | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/ssn_r50_450e_thumos14_rgb_ref_20201014-b6f48f68.pth) | [json](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/20201008_103258.log.json) |
+
+:::{note}
+
+1. The **gpus** indicates the number of GPUs we used to get the checkpoint.
+   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
+   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
+2. Since SSN utilizes different structured temporal pyramid pooling methods at training and testing, please refer to [ssn_r50_450e_thumos14_rgb_train](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) at training and [ssn_r50_450e_thumos14_rgb_test](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py) at testing.
+3. We evaluate the action detection performance of SSN, using action proposals of TAG. For more details on data preparation, you can refer to thumos14 TAG proposals in [Data Preparation](/docs/data_preparation.md).
+4. The reference SSN is evaluated with the `ResNet50` backbone in MMAction, which is the same backbone as ours. Note that the original setting of MMAction SSN uses the `BNInception` backbone.
+
+:::
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train SSN model on thumos14 dataset.
+
+```shell
+python tools/train.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py
+```
+
+For more details and information on optional arguments, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test SSN on thumos14 dataset.
+
+```shell
+# Note: If evaluated, then please make sure the annotation file for test data contains groundtruth.
+python tools/test.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py checkpoints/SOME_CHECKPOINT.pth --eval mAP
+```
+
+For more details and information on optional arguments, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
+
+## Citation
+
+```BibTeX
+@InProceedings{Zhao_2017_ICCV,
+author = {Zhao, Yue and Xiong, Yuanjun and Wang, Limin and Wu, Zhirong and Tang, Xiaoou and Lin, Dahua},
+title = {Temporal Action Detection With Structured Segment Networks},
+booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
+month = {Oct},
+year = {2017}
+}
+```