# model setting
model = dict(
type='FastRCNN',
backbone=dict(
type='ResNet3dSlowOnly',
depth=101,
pretrained=None,
pretrained2d=False,
lateral=False,
num_stages=4,
conv1_kernel=(1, 7, 7),
conv1_stride_t=1,
pool1_stride_t=1,
spatial_strides=(1, 2, 2, 1)),
roi_head=dict(
type='AVARoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor3D',
roi_layer_type='RoIAlign',
output_size=8,
with_temporal_pool=True),
bbox_head=dict(
type='BBoxHeadAVA',
in_channels=2048,
num_classes=81,
multilabel=True,
dropout_ratio=0.5)),
train_cfg=dict(
rcnn=dict(
assigner=dict(
type='MaxIoUAssignerAVA',
pos_iou_thr=0.9,
neg_iou_thr=0.9,
min_pos_iou=0.9),
sampler=dict(
type='RandomSampler',
num=32,
pos_fraction=1,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=1.0,
debug=False)),
test_cfg=dict(rcnn=dict(action_thr=0.002)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids'])
]
# Testing is performed without any cropping / flipping
val_pipeline = [
dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape'],
nested=True)
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=2,
# During testing, each video may have different shape
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
optimizer = dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = ('./work_dirs/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb')
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'omni/'
'slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth')
resume_from = None
find_unused_parameters = False
# model setting
model = dict(
type='FastRCNN',
backbone=dict(
type='ResNet3dSlowOnly',
depth=50,
pretrained=None,
pretrained2d=False,
lateral=False,
num_stages=4,
conv1_kernel=(1, 7, 7),
conv1_stride_t=1,
pool1_stride_t=1,
spatial_strides=(1, 2, 2, 1)),
roi_head=dict(
type='AVARoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor3D',
roi_layer_type='RoIAlign',
output_size=8,
with_temporal_pool=True),
bbox_head=dict(
type='BBoxHeadAVA',
in_channels=2048,
num_classes=81,
multilabel=True,
dropout_ratio=0.5)),
train_cfg=dict(
rcnn=dict(
assigner=dict(
type='MaxIoUAssignerAVA',
pos_iou_thr=0.9,
neg_iou_thr=0.9,
min_pos_iou=0.9),
sampler=dict(
type='RandomSampler',
num=32,
pos_fraction=1,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=1.0,
debug=False)),
test_cfg=dict(rcnn=dict(action_thr=0.002)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids'])
]
# Testing is performed without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape'],
nested=True)
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=2,
# During testing, each video may have different shape
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
optimizer = dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = ('./work_dirs/ava/'
'slowonly_omnisource_pretrained_r50_4x16x1_20e_ava_rgb')
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'omni/'
'slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth')
resume_from = None
find_unused_parameters = False
# LFB
[Long-term feature banks for detailed video understanding](https://openaccess.thecvf.com/content_CVPR_2019/html/Wu_Long-Term_Feature_Banks_for_Detailed_Video_Understanding_CVPR_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
To understand the world, we humans constantly need to relate the present to the past, and put events in context. In this paper, we enable existing video models to do the same. We propose a long-term feature bank---supportive information extracted over the entire span of a video---to augment state-of-the-art video models that otherwise would only view short clips of 2-5 seconds. Our experiments demonstrate that augmenting 3D convolutional networks with a long-term feature bank yields state-of-the-art results on three challenging video datasets: AVA, EPIC-Kitchens, and Charades.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016220-21d90fb3-fd9f-499c-820f-f6c421bda7aa.png" width="800"/>
</div>
## Results and Models
### AVA2.1
| Model | Modality | Pretrained | Backbone | Input | gpus | Resolution | mAP | log | json | ckpt |
| :-----------------------------------------------------------------------------------------------------------------------------------------------------: | :------: | :----------: | :--------------------------------------------------------------------------------------------------: | :---: | :--: | :------------: | :---: | :------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 24.11 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth) |
| [lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 20.17 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth) |
| [lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 22.15 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. We use `slowonly_r50_4x16x1` instead of the `I3D-R50-NL` of the original paper as the backbone of LFB, but achieve a similar improvement (ours: 20.1 -> 24.11 vs. author: 22.1 -> 25.8).
3. Because the long-term features are randomly sampled at test time, the test accuracy may vary slightly between runs.
4. Before training or testing LFB, you need to infer the feature bank with [lfb_slowonly_r50_ava_infer.py](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py). For more details on inferring the feature bank, refer to the [Train](#Train) section.
5. You can also download the long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and place it under `lfb_prefix_path`.
6. The ROIHead now supports single-label classification (i.e. the network outputs at most
one label per actor). This can be done by (a) setting `multilabel=False` during training and
(b) setting `test_cfg.rcnn.action_thr` for testing (see the sketch after this note).
:::
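Note 6 translates into a small config override. Below is a minimal sketch, assuming a derived config file (the file and its `_base_` path are illustrative, not shipped with the repo), that switches the head to single-label mode:

```python
# Hypothetical override config for single-label classification, per note 6.
_base_ = ['./lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py']

model = dict(
    roi_head=dict(
        # (a) train the head as a single-label classifier
        bbox_head=dict(multilabel=False)),
    # (b) score threshold applied to action predictions at test time
    test_cfg=dict(rcnn=dict(action_thr=0.002)))
```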
## Train
### a. Infer long-term feature bank for training
Before training or testing LFB, you need to infer the long-term feature bank first.
Specifically, run testing on the training, validation and testing datasets with the config file [lfb_slowonly_r50_ava_infer](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py) (by default the config file only infers the feature bank of the training dataset; set `dataset_mode = 'val'` in the config file to infer the feature bank of the validation dataset), and the shared head [LFBInferHead](/mmaction/models/heads/lfb_infer_head.py) will generate the feature bank.
A long-term feature bank file of the AVA training and validation datasets with float32 precision occupies 3.3 GB; stored with float16 precision, it occupies 1.65 GB.
You can use the following commands to infer the feature banks of the AVA training and validation datasets; they will be stored as `lfb_prefix_path/lfb_train.pkl` and `lfb_prefix_path/lfb_val.pkl`.
```shell
# set `dataset_mode = 'train'` in lfb_slowonly_r50_ava_infer.py
python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
# set `dataset_mode = 'val'` in lfb_slowonly_r50_ava_infer.py
python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
```
We use the [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) to infer the feature bank.
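Once generated, each feature bank is an ordinary pickle file, so a quick sanity check is easy. The snippet below is a sketch that assumes the default `lfb_prefix_path` from the configs (`data/ava/lfb_half`) and treats the exact key layout of the bank as an implementation detail of `LFBInferHead`:

```python
import pickle

# Load the training feature bank produced by the inference run above.
with open('data/ava/lfb_half/lfb_train.pkl', 'rb') as f:
    lfb = pickle.load(f)

# The bank is keyed per video (assumed layout); check its size before training.
print(f'feature bank covers {len(lfb)} videos')
```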
### b. Train LFB
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the LFB model on AVA with the half-precision long-term feature bank.
```shell
python tools/train.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
--validate --seed 0 --deterministic
```
For more details and optional arguments, refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
## Test
### a. Infer long-term feature bank for testing
Before testing LFB, you also need to infer the long-term feature bank first. If you have already generated the feature bank files, you can skip this step.
This step is the same as the **Infer long-term feature bank for training** part in [Train](#Train).
### b. Test LFB
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the LFB model on AVA with the half-precision long-term feature bank and dump the results to a csv file.
```shell
python tools/test.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval mAP --out results.csv
```
For more details, refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
## Citation
<!-- [DATASET] -->
```BibTeX
@inproceedings{gu2018ava,
title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={6047--6056},
year={2018}
}
```
```BibTeX
@inproceedings{wu2019long,
title={Long-term feature banks for detailed video understanding},
author={Wu, Chao-Yuan and Feichtenhofer, Christoph and Fan, Haoqi and He, Kaiming and Krahenbuhl, Philipp and Girshick, Ross},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={284--293},
year={2019}
}
```
# LFB
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{wu2019long,
title={Long-term feature banks for detailed video understanding},
author={Wu, Chao-Yuan and Feichtenhofer, Christoph and Fan, Haoqi and He, Kaiming and Krahenbuhl, Philipp and Girshick, Ross},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={284--293},
year={2019}
}
```
## Model Zoo
### AVA2.1
| Config | Modality | Pretrained | Backbone | Input | GPUs | Resolution | mAP | log | json | ckpt |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 24.11 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth) |
| [lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 20.17 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth) |
| [lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py](/configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py) | RGB | Kinetics-400 | [slowonly_r50_4x16x1](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) | 4x16 | 8 | short-side 256 | 22.15 | [log](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log) | [json](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json) | [ckpt](https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth) |
- Note:
1. The **GPUs** column indicates the number of GPUs used to obtain the checkpoint. By default, the config files provided by MMAction2 are for training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate in proportion to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. This LFB model does not use the `I3D-R50-NL` of the original paper as its backbone, but `slowonly_r50_4x16x1` instead, and achieves a similar improvement (ours: 20.1 -> 24.11 vs. the original paper: 22.1 -> 25.8).
3. Because the long-term features are randomly sampled at test time, the test accuracy may vary slightly.
4. Before training or testing LFB, you need to infer the long-term feature bank with the config file [lfb_slowonly_r50_ava_infer.py](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py). For more details on inferring the long-term feature bank, see the [Train](#Train) section.
5. You can also download the float32 or float16 long-term feature banks directly from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and place them under `lfb_prefix_path`.
## Train
### a. Infer the long-term feature bank for training
Before training or testing LFB, you first need to infer the long-term feature bank.
Specifically, run model testing on the training, validation and test sets with the config file [lfb_slowonly_r50_ava_infer](/configs/detection/lfb/lfb_slowonly_r50_ava_infer.py).
By default the config infers the long-term feature bank of the training set; set `dataset_mode` to `'val'` to infer that of the validation set. During inference, the shared head [LFBInferHead](/mmaction/models/heads/lfb_infer_head.py) generates the long-term feature bank.
The float32 long-term feature bank files of the AVA training and validation sets occupy about 3.3 GB; stored at half precision, they occupy about 1.65 GB.
You can use the following commands to infer the long-term feature banks of the AVA training and validation sets; the feature banks will be stored as `lfb_prefix_path/lfb_train.pkl` and `lfb_prefix_path/lfb_val.pkl`.
```shell
# set `dataset_mode = 'train'` in lfb_slowonly_r50_ava_infer.py
python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
# set `dataset_mode = 'val'` in lfb_slowonly_r50_ava_infer.py
python tools/test.py configs/detection/lfb/lfb_slowonly_r50_ava_infer.py \
checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
```
MMAction2 uses the [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb](/configs/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py) as the pretrained backbone of the LFB model when inferring the long-term feature bank.
### b. Train LFB
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the LFB model on AVA with the half-precision long-term feature bank.
```shell
python tools/train.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
--validate --seed 0 --deterministic
```
For more training details, refer to the **Training setting** section in the [getting started guide](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE).
## Test
### a. Infer the long-term feature bank for testing
Before training or testing LFB, you first need to infer the long-term feature bank. If you have already generated the feature bank files, you can skip this step.
This step is the same as the **Infer the long-term feature bank for training** part in [Train](#Train).
### b. Test LFB
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the LFB model on AVA with the half-precision long-term feature bank, and dump the results to a csv file.
```shell
python tools/test.py configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval mAP --out results.csv
```
For more testing details, refer to the **Test a dataset** section in the [getting started guide](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86).
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(type='avg')),
bbox_head=dict(in_channels=4096)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is performed without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(type='max')),
bbox_head=dict(in_channels=4096)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is performed without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
max_num_sampled_feat = 5
window_size = 60
lfb_channels = 2048
dataset_modes = ('train', 'val')
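# Note: the non-local FBO variant below consumes
# window_size * max_num_sampled_feat long-term features per key frame
# (see num_lt_feat).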
model = dict(
roi_head=dict(
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
lfb_prefix_path=lfb_prefix_path,
max_num_sampled_feat=max_num_sampled_feat,
window_size=window_size,
lfb_channels=lfb_channels,
dataset_modes=dataset_modes,
device='gpu'),
fbo_cfg=dict(
type='non_local',
st_feat_channels=2048,
lt_feat_channels=lfb_channels,
latent_channels=512,
num_st_feat=1,
num_lt_feat=window_size * max_num_sampled_feat,
num_non_local_layers=2,
st_feat_dropout_ratio=0.2,
lt_feat_dropout_ratio=0.2,
pre_activate=True)),
bbox_head=dict(in_channels=2560)))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode'),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
dict(
type='ToDataContainer',
fields=[
dict(key=['proposals', 'gt_bboxes', 'gt_labels'], stack=False)
]),
dict(
type='Collect',
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'],
meta_keys=['scores', 'entity_ids', 'img_key'])
]
# Testing is performed without any cropping / flipping
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val']
evaluation = dict(interval=1, save_best='mAP@0.5IOU')
optimizer = dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=1e-05)
# this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
step=[10, 15],
warmup='linear',
warmup_by_epoch=True,
warmup_iters=5,
warmup_ratio=0.1)
total_epochs = 20
checkpoint_config = dict(interval=1)
workflow = [('train', 1)]
log_config = dict(
interval=20, hooks=[
dict(type='TextLoggerHook'),
])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb' # noqa E501
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None
find_unused_parameters = False
# This config is used to generate long-term feature bank.
_base_ = ['../_base_/models/slowonly_r50.py']
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
dataset_mode = 'train' # ['train', 'val', 'test']
model = dict(
roi_head=dict(
shared_head=dict(
type='LFBInferHead',
lfb_prefix_path=lfb_prefix_path,
dataset_mode=dataset_mode,
use_half_precision=True)))
# dataset settings
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv'
exclude_file_infer = (
f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv')
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
proposal_file_infer = (
f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl')
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
infer_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
# Rename is needed to use mmdet detectors
dict(type='Rename', mapping=dict(imgs='img')),
dict(type='ToTensor', keys=['img', 'proposals']),
dict(type='ToDataContainer', fields=[dict(key='proposals', stack=False)]),
dict(
type='Collect',
keys=['img', 'proposals'],
meta_keys=['scores', 'img_shape', 'img_key'],
nested=True)
]
data = dict(
videos_per_gpu=1,
workers_per_gpu=2,
test=dict(
type=dataset_type,
ann_file=ann_file_infer,
exclude_file=exclude_file_infer,
pipeline=infer_pipeline,
label_file=label_file,
proposal_file=proposal_file_infer,
person_det_score_thr=0.9,
data_prefix=data_root))
dist_params = dict(backend='nccl')
Collections:
- Name: LFB
README: configs/detection/lfb/README.md
Paper:
URL: https://arxiv.org/abs/1812.05038
Title: Long-Term Feature Banks for Detailed Video Understanding
Models:
- Config: configs/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 24.11
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210224_125052.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_nl_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210224-2ae136d9.pth
- Config: configs/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 20.17
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_avg_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-19c330b7.pth
- Config: configs/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
In Collection: LFB
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 20
Input: 4x16
Pretrained: Kinetics-400
Resolution: short-side 256
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
Name: lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb.py
Results:
- Dataset: AVA v2.1
Metrics:
mAP: 22.15
Task: Spatial Temporal Action Detection
Training Json Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log.json
Training Log: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/20210301_124812.log
Weights: https://download.openmmlab.com/mmaction/detection/lfb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb/lfb_max_kinetics_pretrained_slowonly_r50_4x16x1_20e_ava_rgb_20210301-37efcd15.pth
# BMN
[Bmn: Boundary-matching network for temporal action proposal generation](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal action proposal generation is a challenging and promising task which aims to locate temporal regions in real-world videos where action or event may occur. Current bottom-up proposal generation methods can generate proposals with precise boundary, but cannot efficiently generate adequately reliable confidence scores for retrieving proposals. To address these difficulties, we introduce the Boundary-Matching (BM) mechanism to evaluate confidence scores of densely distributed proposals, which denote a proposal as a matching pair of starting and ending boundaries and combine all densely distributed BM pairs into the BM confidence map. Based on the BM mechanism, we propose an effective, efficient and end-to-end proposal generation method, named Boundary-Matching Network (BMN), which generates proposals with precise temporal boundaries as well as reliable confidence scores simultaneously. The two branches of BMN are jointly trained in a unified framework. We conduct experiments on two challenging datasets: THUMOS-14 and ActivityNet-1.3, where BMN shows significant performance improvement with remarkable efficiency and generalizability. Further, combined with an existing action classifier, BMN can achieve state-of-the-art temporal action detection performance.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016479-2ca7e8b6-a17b-4a4c-b4c9-ae731935cd91.png" width="800"/>
</div>
## Results and Models
### ActivityNet feature
| config | feature | gpus | AR@100 | AUC | AP@0.5 | AP@0.75 | AP@0.95 | mAP | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :-----------------------------------------------------------------------------------------------------------: | :------------: | :--: | :----: | :---: | :----: | :-----: | :-----: | :---: | :--------: | ------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| [bmn_400x100_9e_2x8_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100 | 2 | 75.28 | 67.22 | 42.47 | 31.31 | 9.92 | 30.34 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json) |
| | mmaction_video | 2 | 75.43 | 67.22 | 42.62 | 31.56 | 10.86 | 30.77 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
| | mmaction_clip | 2 | 75.35 | 67.38 | 43.08 | 32.19 | 10.73 | 31.15 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json) |
| [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\* | cuhk_mean_100 | - | 75.27 | 67.49 | 42.22 | 30.98 | 9.22 | 30.00 | - | - | - | - | - |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu (see the sketch below).
2. In the **feature** column, cuhk_mean_100 denotes the widely used CUHK ActivityNet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk); mmaction_video and mmaction_clip denote features extracted by mmaction, with the video-level or clip-level ActivityNet-finetuned model respectively.
3. We evaluate the action detection performance of BMN using the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for the ActivityNet 2017 Untrimmed Video Classification Track to assign a label to each action proposal.
:::
\*We train BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network) and evaluate its proposal generation and action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assignment.
For more details on data preparation, refer to the ActivityNet feature section in [Data Preparation](/docs/data_preparation.md).
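The Linear Scaling Rule referenced in note 1 is a simple proportion; here is a minimal sketch (the helper function is illustrative, not part of MMAction2):

```python
def scale_lr(base_lr, base_gpus, base_videos_per_gpu, gpus, videos_per_gpu):
    """Scale the learning rate linearly with the total batch size
    (Linear Scaling Rule, https://arxiv.org/abs/1706.02677)."""
    base_batch = base_gpus * base_videos_per_gpu
    return base_lr * (gpus * videos_per_gpu) / base_batch

# Starting from lr=0.01 at 4 GPUs x 2 video/gpu:
print(scale_lr(0.01, 4, 2, 16, 4))  # -> 0.08 for 16 GPUs x 4 video/gpu
```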
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train BMN model on ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
```
For more details and optional arguments, refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test BMN on the ActivityNet feature dataset.
```shell
# Note: if you evaluate, make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
You can also test the action detection performance of the model with the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) prediction file and the generated proposal file (`results.json` from the last command).
```shell
python tools/analysis/report_map.py --proposal path/to/proposal_file
```
:::{note}
1. (Optional) You can use the following command to generate a formatted proposal file, which will be fed into the action classifier (currently SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
:::
For more details and optional arguments, refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{lin2019bmn,
title={Bmn: Boundary-matching network for temporal action proposal generation},
author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
pages={3889--3898},
year={2019}
}
```
<!-- [DATASET] -->
```BibTeX
@article{zhao2017cuhk,
title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
journal={arXiv preprint arXiv:1710.08011},
volume={8},
year={2017}
}
```
# BMN
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{lin2019bmn,
title={Bmn: Boundary-matching network for temporal action proposal generation},
author={Lin, Tianwei and Liu, Xiao and Li, Xin and Ding, Errui and Wen, Shilei},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
pages={3889--3898},
year={2019}
}
```
<!-- [DATASET] -->
```BibTeX
@article{zhao2017cuhk,
title={Cuhk \& ethz \& siat submission to activitynet challenge 2017},
author={Zhao, Y and Zhang, B and Wu, Z and Yang, S and Zhou, L and Yan, S and Wang, L and Xiong, Y and Lin, D and Qiao, Y and others},
journal={arXiv preprint arXiv:1710.08011},
volume={8},
year={2017}
}
```
## Model Zoo
### ActivityNet feature
| config | feature | gpus | AR@100 | AUC | AP@0.5 | AP@0.75 | AP@0.95 | mAP | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [bmn_400x100_9e_2x8_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py) | cuhk_mean_100 | 2 | 75.28 | 67.22 | 42.47 | 31.31 | 9.92 | 30.34 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json) |
| | mmaction_video | 2 | 75.43 | 67.22 | 42.62 | 31.56 | 10.86 | 30.77 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json) |
| | mmaction_clip | 2 | 75.35 | 67.38 | 43.08 | 32.19 | 10.73 | 31.15 | 5420 | 3.27 | [ckpt](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth) | [log](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log) | [json](https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json) |
| [BMN-official](https://github.com/JJBOY/BMN-Boundary-Matching-Network) (for reference)\* | cuhk_mean_100 | - | 75.27 | 67.49 | 42.22 | 30.98 | 9.22 | 30.00 | - | - | - | - | - |
- Note:
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. By default, the config files provided by MMAction2 are for training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to scale the learning rate in proportion to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet feature extracted with the [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) codebase; `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using the video-level and clip-level ActivityNet-finetuned models respectively.
3. MMAction2 uses the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission to the ActivityNet 2017 Untrimmed Video Classification Track to assign a label to each temporal action proposal for BMN evaluation.
\*MMAction2 trains BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network), and evaluates its temporal action proposal generation and temporal detection performance with the corresponding labels from [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json).
For details on dataset preparation, refer to the ActivityNet feature section in the [data preparation guide](/docs_zh_CN/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train BMN on the ActivityNet feature dataset.
```shell
python tools/train.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
```
For more training details, refer to the **Training setting** section in the [getting started guide](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test BMN on the ActivityNet feature dataset.
```shell
# Note: if you evaluate, make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
You can also evaluate the temporal action detection performance of the model with the [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) prediction file and the generated proposal file (`results.json` from the last command).
```shell
python tools/analysis/report_map.py --proposal path/to/proposal_file
```
Note:
1. (Optional) You can use the following command to generate a formatted proposal file, which can be fed into the action classifier (currently SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
For more testing details, refer to the **Test a dataset** section in the [getting started guide](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86).
_base_ = [
'../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature']),
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
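    # gt_bbox holds a variable number of boxes per video, so it is wrapped
    # in a non-stacked, CPU-only DataContainer below.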
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(
type='ToDataContainer',
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
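# 'AR@AN' reports average recall at several average numbers of proposals
# (e.g. AR@100 in the tables above).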
# optimizer
optimizer = dict(
type='Adam', lr=0.001, weight_decay=0.0001) # this lr is used for 2 gpus
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 9
# runtime settings
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
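Since this config pulls in `_base_` files, one quick way to inspect the fully merged configuration is to load it with mmcv (a minimal sketch, assuming the mmcv 1.x series used by MMAction2 is installed):
```python
# Sketch: load and inspect the merged BMN config, including _base_ values.
from mmcv import Config

cfg = Config.fromfile(
    'configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py')
print(cfg.pretty_text)          # the fully merged config
print(cfg.data.videos_per_gpu)  # 8, as set above
```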
Collections:
- Name: BMN
README: configs/localization/bmn/README.md
Paper:
URL: https://arxiv.org/abs/1907.09702
Title: "BMN: Boundary-Matching Network for Temporal Action Proposal Generation"
Models:
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: cuhk_mean_100
Name: bmn_400x100_9e_2x8_activitynet_feature (cuhk_mean_100)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 42.47
AP@0.75: 31.31
AP@0.95: 9.92
AR@100: 75.28
AUC: 67.22
mAP: 30.34
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_9e_activitynet_feature/bmn_400x100_9e_activitynet_feature_20200619-42a3b111.pth
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: mmaction_video
Name: bmn_400x100_9e_2x8_activitynet_feature (mmaction_video)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 42.62
AP@0.75: 31.56
AP@0.95: 10.86
AR@100: 75.43
AUC: 67.22
mAP: 30.77
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_video/bmn_400x100_2x8_9e_mmaction_video_20200809-c9fd14d2.pth
- Config: configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py
In Collection: BMN
Metadata:
Batch Size: 8
Epochs: 9
Training Data: ActivityNet v1.3
Training Resources: 2 GPUs
feature: mmaction_clip
Name: bmn_400x100_9e_2x8_activitynet_feature (mmaction_clip)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AP@0.5: 43.08
AP@0.75: 32.19
AP@0.95: 10.73
AR@100: 75.35
AUC: 67.38
mAP: 31.15
Task: Temporal Action Localization
Training Json Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.json
Training Log: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809.log
Weights: https://download.openmmlab.com/mmaction/localization/bmn/bmn_400x100_2x8_9e_mmaction_clip/bmn_400x100_2x8_9e_mmaction_clip_20200809-10d803ce.pth
# BSN
[Bsn: Boundary sensitive network for temporal action proposal generation](https://openaccess.thecvf.com/content_ECCV_2018/html/Tianwei_Lin_BSN_Boundary_Sensitive_ECCV_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal action proposal generation is an important yet challenging problem, since temporal proposals with rich action content are indispensable for analysing real-world videos with long duration and high proportion irrelevant content. This problem requires methods not only generating proposals with precise temporal boundaries, but also retrieving proposals to cover truth action instances with high recall and high overlap using relatively fewer proposals. To address these difficulties, we introduce an effective proposal generation method, named Boundary-Sensitive Network (BSN), which adopts "local to global" fashion. Locally, BSN first locates temporal boundaries with high probabilities, then directly combines these boundaries as proposals. Globally, with Boundary-Sensitive Proposal feature, BSN retrieves proposals by evaluating the confidence of whether a proposal contains an action within its region. We conduct experiments on two challenging datasets: ActivityNet-1.3 and THUMOS14, where BSN outperforms other state-of-the-art temporal action proposal generation methods with high recall and high temporal precision. Finally, further experiments demonstrate that by combining existing action classifiers, our method significantly improves the state-of-the-art temporal action detection performance.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016692-69efafbd-cec6-47f1-af45-371d0ff78a97.png" width="800"/>
</div>
## Results and Models
### ActivityNet feature
| config | feature | gpus | pretrain | AR@100 | AUC | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :--------------------------------------- | :------------: | :--: | :------: | :----: | :---: | :-------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100 | 1 | None | 74.66 | 66.45 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
| | mmaction_video | 1 | None | 74.93 | 66.74 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json) |
| | mmaction_clip | 1 | None | 75.19 | 66.81 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet features extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk); `mmaction_video` and `mmaction_clip` denote features extracted by MMAction, with a video-level or clip-level ActivityNet-finetuned model respectively.
:::
For more details on data preparation, you can refer to ActivityNet feature in [Data Preparation](/docs/data_preparation.md).
## Train
You can use the following commands to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Examples:
1. Train BSN(TEM) on the ActivityNet features dataset.
```shell
python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
```
2. Train BSN(PEM) on PGM results.
```shell
python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
```
For more details and information on optional arguments, refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
## Inference
You can use the following commands to run inference with a model.
1. For TEM Inference
```shell
# Note: This step cannot be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. For PGM Inference
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. For PEM Inference
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Run BSN(TEM) inference with a pretrained model.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Run BSN(PGM) inference with a pretrained model.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
```
3. Run BSN(PEM) inference with the evaluation metric 'AR@AN' and output the results (a driver sketch chaining all three stages follows these examples).
```shell
# Note: if evaluation is required, make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
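Because each stage consumes the previous stage's outputs (TEM writes results for PGM; PGM writes proposals and features for PEM), a minimal driver that chains the three example commands above could look like this (a sketch; the checkpoint paths are placeholders):
```python
# Sketch: chain the three BSN inference stages. Checkpoint paths are placeholders.
import subprocess

tem_cfg = 'configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py'
pgm_cfg = 'configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py'
pem_cfg = 'configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py'

# 1. TEM inference: writes CSV results consumed by PGM.
subprocess.run(
    ['python', 'tools/test.py', tem_cfg, 'checkpoints/TEM_CHECKPOINT.pth'],
    check=True)
# 2. PGM: generate proposals and boundary-sensitive features from TEM results.
subprocess.run(
    ['python', 'tools/misc/bsn_proposal_generation.py', pgm_cfg,
     '--mode', 'train'],
    check=True)
# 3. PEM inference with AR@AN evaluation, writing results.json.
subprocess.run(
    ['python', 'tools/test.py', pem_cfg, 'checkpoints/PEM_CHECKPOINT.pth',
     '--eval', 'AR@AN', '--out', 'results.json'],
    check=True)
```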
## Test
You can use the following commands to test a model.
1. TEM
```shell
# Note: This step cannot be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. PGM
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. PEM
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Test a TEM model on the ActivityNet dataset.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Test a PGM model on the ActivityNet dataset.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
```
3. Test a PEM model with the evaluation metric 'AR@AN' and output the results.
```shell
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
:::{note}
1. (Optional) You can use the following command to generate a formatted proposal file, which can be fed into an action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get the classification results of the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
:::
For more details and information on optional arguments, refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{lin2018bsn,
title={Bsn: Boundary sensitive network for temporal action proposal generation},
author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
pages={3--19},
year={2018}
}
```
# BSN
## Introduction
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{lin2018bsn,
title={Bsn: Boundary sensitive network for temporal action proposal generation},
author={Lin, Tianwei and Zhao, Xu and Su, Haisheng and Wang, Chongjing and Yang, Ming},
booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
pages={3--19},
year={2018}
}
```
## Model Zoo
### ActivityNet feature
| config | feature | gpus | pretrain | AR@100 | AUC | gpu_mem(M) | iter time(s) | ckpt | log | json |
| :--------------------------------------- | :------------: | :------: | :----: | :----: | :---: | :--------------: | :-------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| bsn_400x100_1x16_20e_activitynet_feature | cuhk_mean_100 | 1 | None | 74.66 | 66.45 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json) |
| | mmaction_video | 1 | None | 74.93 | 66.74 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json) |
| | mmaction_clip | 1 | None | 75.19 | 66.81 | 41(TEM)+25(PEM) | 0.074(TEM)+0.036(PEM) | [ckpt_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth) [ckpt_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth) | [log_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log) [log_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log) | [json_tem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json) [json_pem](https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json) |
Notes:
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. By default, the configs provided by MMAction2 assume training with 8 GPUs.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may need to scale the learning rate proportionally to the batch size when using a different number of GPUs or a different number of videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. For the **feature** column, `cuhk_mean_100` denotes the widely used CUHK ActivityNet features extracted with the [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk) codebase, while `mmaction_video` and `mmaction_clip` denote features extracted with MMAction, using a video-level or clip-level ActivityNet-finetuned model respectively.
For details on data preparation, you can refer to the ActivityNet feature section in [Data Preparation](/docs_zh_CN/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Examples:
1. Train the BSN(TEM) model on ActivityNet features.
```shell
python tools/train.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
```
2. Train BSN(PEM) on PGM results.
```shell
python tools/train.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
```
For more details on training, refer to the **Training setting** part in [getting_started](/docs_zh_CN/getting_started.md#%E8%AE%AD%E7%BB%83%E9%85%8D%E7%BD%AE).
## Inference
You can use the following commands to run inference with a model.
1. TEM inference
```shell
# Note: This step cannot be evaluated.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. PGM inference
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. PEM inference
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Run BSN(TEM) inference with a pretrained model.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Run BSN(PGM) inference with a pretrained model.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode train
```
3. Run BSN(PEM) inference with the evaluation metric 'AR@AN' and output the results.
```shell
# Note: if evaluation is required, make sure the annotation file for the test data contains ground-truth labels.
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
## Test
You can use the following commands to test a model.
1. TEM
```shell
# Note: this command cannot perform metric evaluation.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
2. PGM
```shell
python tools/misc/bsn_proposal_generation.py ${CONFIG_FILE} [--mode ${MODE}]
```
3. PEM
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Examples:
1. Test the TEM model on the ActivityNet dataset.
```shell
python tools/test.py configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth
```
2. Test the PGM model on the ActivityNet dataset.
```shell
python tools/misc/bsn_proposal_generation.py configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py --mode test
```
3. Test the PEM model with the evaluation metric 'AR@AN' and output the results.
```shell
python tools/test.py configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py checkpoints/SOME_CHECKPOINT.pth --eval AR@AN --out results.json
```
Notes:
1. (Optional) You can use the following command to generate a formatted proposal file, which can be fed into an action classifier (currently only SSN and P-GCN are supported, not TSN, I3D, etc.) to get classification results for the proposals.
```shell
python tools/data/activitynet/convert_proposal_format.py
```
For more details on testing, refer to the **Test a dataset** part in [getting_started](/docs_zh_CN/getting_started.md#%E6%B5%8B%E8%AF%95%E6%9F%90%E4%B8%AA%E6%95%B0%E6%8D%AE%E9%9B%86).
_base_ = [
'../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_val.json'
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
pgm_features_dir = f'{work_dir}/pgm_features/'
test_pipeline = [
dict(
type='LoadProposals',
top_k=1000,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['bsp_feature'])
]
train_pipeline = [
dict(
type='LoadProposals',
top_k=500,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'reference_temporal_iou'],
meta_name='video_meta',
meta_keys=[]),
dict(type='ToTensor', keys=['bsp_feature', 'reference_temporal_iou']),
dict(
type='ToDataContainer',
fields=(dict(key='bsp_feature', stack=False),
dict(key='reference_temporal_iou', stack=False)))
]
val_pipeline = [
dict(
type='LoadProposals',
top_k=1000,
pgm_proposals_dir=pgm_proposals_dir,
pgm_features_dir=pgm_features_dir),
dict(
type='Collect',
keys=['bsp_feature', 'tmin', 'tmax', 'tmin_score', 'tmax_score'],
meta_name='video_meta',
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]),
dict(type='ToTensor', keys=['bsp_feature'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
evaluation = dict(interval=1, metrics=['AR@AN'])
# runtime settings
checkpoint_config = dict(interval=1, filename_tmpl='pem_epoch_{}.pth')
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
output_config = dict(out=f'{work_dir}/results.json', output_format='json')
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_test.json'
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
tem_results_dir = f'{work_dir}/tem_results/'
pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
pgm_features_dir = f'{work_dir}/pgm_features/'
temporal_scale = 100
pgm_proposals_cfg = dict(
pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5)
pgm_features_test_cfg = dict(
pgm_features_thread=4,
top_k=1000,
num_sample_start=8,
num_sample_end=8,
num_sample_action=16,
num_sample_interp=3,
bsp_boundary_ratio=0.2)
pgm_features_train_cfg = dict(
pgm_features_thread=4,
top_k=500,
num_sample_start=8,
num_sample_end=8,
num_sample_action=16,
num_sample_interp=3,
bsp_boundary_ratio=0.2)
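# Note: the train and test PGM feature settings above differ only in top_k
# (500 for training vs 1000 for testing), matching the LoadProposals top_k
# values used in the PEM config.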
_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py']
# dataset settings
dataset_type = 'ActivityNetDataset'
data_root = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
data_root_val = 'data/ActivityNet/activitynet_feature_cuhk/csv_mean_100/'
ann_file_train = 'data/ActivityNet/anet_anno_train.json'
ann_file_val = 'data/ActivityNet/anet_anno_val.json'
ann_file_test = 'data/ActivityNet/anet_anno_full.json'
test_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(
type='Collect',
keys=['raw_feature'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature'])
]
train_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
]
val_pipeline = [
dict(type='LoadLocalizationFeature'),
dict(type='GenerateLocalizationLabels'),
dict(
type='Collect',
keys=['raw_feature', 'gt_bbox'],
meta_name='video_meta',
meta_keys=['video_name']),
dict(type='ToTensor', keys=['raw_feature', 'gt_bbox']),
dict(type='ToDataContainer', fields=[dict(key='gt_bbox', stack=False)])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=8,
train_dataloader=dict(drop_last=True),
val_dataloader=dict(videos_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
# optimizer
optimizer = dict(
    type='Adam', lr=0.001, weight_decay=0.0001)  # this lr is used for 1 gpu
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=7)
total_epochs = 20
# runtime settings
checkpoint_config = dict(interval=1, filename_tmpl='tem_epoch_{}.pth')
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
workflow = [('train', 1), ('val', 1)]
work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
tem_results_dir = f'{work_dir}/tem_results/'
output_config = dict(out=tem_results_dir, output_format='csv')
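# TEM test results are written as CSV to tem_results_dir, where the PGM
# stage (see the PGM settings above) expects to find them.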
Collections:
- Name: BSN
README: configs/localization/bsn/README.md
Paper:
URL: https://arxiv.org/abs/1806.02964
Title: "BSN: Boundary Sensitive Network for Temporal Action Proposal Generation"
Models:
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
Training Resources: 1 GPUs
feature: cuhk_mean_100
Name: bsn_400x100_1x16_20e_activitynet_feature (cuhk_mean_100)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 74.66
AUC: 66.45
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature/bsn_tem_400x100_1x16_20e_activitynet_feature_20200619-cd6accc3.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature/bsn_pem_400x100_1x16_20e_activitynet_feature_20210203-1c27763d.pth
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
Training Resources: 1 GPUs
feature: mmaction_video
Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_video)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 74.93
AUC: 66.74
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_video/bsn_tem_400x100_1x16_20e_mmaction_video_20200809-ad6ec626.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_video/bsn_pem_400x100_1x16_20e_mmaction_video_20200809-aa861b26.pth
- Config:
- configs/localization/bsn/bsn_pem_400x100_1x16_20e_activitynet_feature.py
- configs/localization/bsn/bsn_pgm_400x100_activitynet_feature.py
- configs/localization/bsn/bsn_tem_400x100_1x16_20e_activitynet_feature.py
In Collection: BSN
Metadata:
Pretrained: None
Training Data: ActivityNet v1.3
Training Resources: 1 GPUs
feature: mmaction_clip
Name: bsn_400x100_1x16_20e_activitynet_feature (mmaction_clip)
Results:
- Dataset: ActivityNet v1.3
Metrics:
AR@100: 75.19
AUC: 66.81
Task: Temporal Action Localization
Training Json Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.json
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.json
Training Log:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809.log
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809.log
Weights:
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_tem_400x100_1x16_20e_mmaction_clip/bsn_tem_400x100_1x16_20e_mmaction_clip_20200809-0a563554.pth
- https://download.openmmlab.com/mmaction/localization/bsn/bsn_pem_400x100_1x16_20e_mmaction_clip/bsn_pem_400x100_1x16_20e_mmaction_clip_20200809-e32f61e6.pth
# SSN
[Temporal Action Detection With Structured Segment Networks](https://openaccess.thecvf.com/content_iccv_2017/html/Zhao_Temporal_Action_Detection_ICCV_2017_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Detecting actions in untrimmed videos is an important yet challenging task. In this paper, we present the structured segment network (SSN), a novel framework which models the temporal structure of each action instance via a structured temporal pyramid. On top of the pyramid, we further introduce a decomposed discriminative model comprising two classifiers, respectively for classifying actions and determining completeness. This allows the framework to effectively distinguish positive proposals from background or incomplete ones, thus leading to both accurate recognition and localization. These components are integrated into a unified network that can be efficiently trained in an end-to-end fashion. Additionally, a simple yet effective temporal action proposal scheme, dubbed temporal actionness grouping (TAG) is devised to generate high quality action proposals. On two challenging benchmarks, THUMOS14 and ActivityNet, our method remarkably outperforms previous state-of-the-art methods, demonstrating superior accuracy and strong adaptivity in handling actions with various temporal structures.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143016899-017893d3-a907-4487-90a2-cb884088266c.png" width="800"/>
</div>
## Results and Models
| config | gpus | backbone | pretrain | mAP@0.3 | mAP@0.4 | mAP@0.5 | reference mAP@0.3 | reference mAP@0.4 | reference mAP@0.5 | gpu_mem(M) | ckpt | log | json | reference ckpt | reference json |
| :---------------------------------------------------------------------------------------: | :--: | :------: | :------: | :-----: | :-----: | :-----: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | :--------: | :----------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
| [ssn_r50_450e_thumos14_rgb](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) | 8 | ResNet50 | ImageNet | 29.37 | 22.15 | 15.69 | [27.61](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [21.28](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | [14.57](https://github.com/open-mmlab/mmaction/tree/c7e3b7c11fb94131be9b48a8e3d510589addc3ce#Get%20started) | 6352 | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/ssn_r50_450e_thumos14_rgb_20201012-1920ab16.pth) | [log](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log) | [json](https://download.openmmlab.com/mmaction/localization/ssn/ssn_r50_450e_thumos14_rgb/20201005_144656.log.json) | [ckpt](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/ssn_r50_450e_thumos14_rgb_ref_20201014-b6f48f68.pth) | [json](https://download.openmmlab.com/mmaction/localization/ssn/mmaction_reference/ssn_r50_450e_thumos14_rgb_ref/20201008_103258.log.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. Since SSN utilizes different structured temporal pyramid pooling methods at training and testing, please refer to [ssn_r50_450e_thumos14_rgb_train](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py) at training and [ssn_r50_450e_thumos14_rgb_test](/configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py) at testing.
3. We evaluate the action detection performance of SSN, using action proposals of TAG. For more details on data preparation, you can refer to thumos14 TAG proposals in [Data Preparation](/docs/data_preparation.md).
4. The reference SSN is evaluated with the `ResNet50` backbone in MMAction, which is the same backbone as ours. Note that the original setting of MMAction SSN uses the `BNInception` backbone.
:::
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the SSN model on the THUMOS14 dataset.
```shell
python tools/train.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_train.py
```
For more details and information on optional arguments, refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the SSN model on the THUMOS14 dataset.
```shell
# Note: if evaluation is required, make sure the annotation file for the test data contains ground truth.
python tools/test.py configs/localization/ssn/ssn_r50_450e_thumos14_rgb_test.py checkpoints/SOME_CHECKPOINT.pth --eval mAP
```
For more details and information on optional arguments, refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@InProceedings{Zhao_2017_ICCV,
author = {Zhao, Yue and Xiong, Yuanjun and Wang, Limin and Wu, Zhirong and Tang, Xiaoou and Lin, Dahua},
title = {Temporal Action Detection With Structured Segment Networks},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
month = {Oct},
year = {2017}
}
```