Commit 5b3e36dc authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add model TSM

parents
Pipeline #315 failed with stages
in 0 seconds
# TPN recognizer on Kinetics-400 raw frames (8 frames, stride 8, 1 clip).
# The model and runtime defaults are inherited from the `_base_` files; this
# file defines the data pipelines, optimizer and schedule.
_base_ = [
    '../../_base_/models/tpn_slowonly_r50.py',
    '../../_base_/default_runtime.py'
]

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
# Testing reuses the validation annotation list.
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

# Training: one 8-frame clip per video, with crop / flip / colour augmentation.
train_pipeline = [
    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
    dict(type='RawFrameDecode'),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='ColorJitter'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]

# Validation: one centre-cropped clip per video.
# ColorJitter is deliberately absent: it is a training-time augmentation, and
# test_pipeline below does not apply it either. Applying it here made the
# validation metrics depend on the augmentation instead of on the model alone.
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=8,
        frame_interval=8,
        num_clips=1,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

# Testing: 10 clips per video, each evaluated with ThreeCrop at 256.
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=8,
        frame_interval=8,
        num_clips=10,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

data = dict(
    videos_per_gpu=8,
    workers_per_gpu=8,
    # Multi-clip / multi-crop testing is run one video at a time.
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])

# optimizer
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001,
    nesterov=True)  # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=[75, 125])
total_epochs = 150

# runtime settings
work_dir = './work_dirs/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics400_rgb'  # noqa: E501
# Variant of the TPN SlowOnly Kinetics config: everything is inherited from
# the base file, and the backbone's `pretrained` setting is overridden to None.
_base_ = [
    './tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb.py',
]

# model settings
model = {'backbone': {'pretrained': None}}

# runtime settings
work_dir = './work_dirs/tpn_slowonly_r50_8x8x1_150e_kinetics400_rgb'
# TPN recognizer with a TSM backbone on Something-Something V1 raw frames
# (8 single-frame clips per video). The model and runtime defaults are
# inherited from the `_base_` files.
_base_ = [
    '../../_base_/models/tpn_tsm_r50.py', '../../_base_/default_runtime.py'
]

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sthv1/rawframes'
data_root_val = 'data/sthv1/rawframes'
ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt'
ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
# Testing reuses the validation annotation list.
ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
# Something-Something contains direction-sensitive classes, so a horizontal
# flip must also swap the label with its mirrored counterpart. This is the
# same mapping the TRN sthv1 config uses.
sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='RawFrameDecode'),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    # Flip with label mapping; a plain flip would leave direction-sensitive
    # clips with the wrong label.
    dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map),
    dict(type='ColorJitter'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]

# Validation: centre crop, single sampling pass.
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

# Testing: twice sampling combined with ThreeCrop at 256.
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        twice_sample=True,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

# NOTE(review): the TRN sthv1 config reads the same `data/sthv1/rawframes`
# directory with filename_tmpl='{:05}.jpg'; no template is set here, so the
# dataset default applies. Confirm which naming the extracted frames use.
data = dict(
    videos_per_gpu=8,
    workers_per_gpu=8,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])

# optimizer
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005,
    nesterov=True)  # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=[75, 125])
total_epochs = 150

# runtime settings
# Named after the dataset actually used (sthv1); the previous value said
# kinetics400 and did not match the data paths in this config.
work_dir = './work_dirs/tpn_tsm_r50_1x1x8_150e_sthv1_rgb'
# TRN
[Temporal Relational Reasoning in Videos](https://openaccess.thecvf.com/content_ECCV_2018/html/Bolei_Zhou_Temporal_Relational_Reasoning_ECCV_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal relational reasoning, the ability to link meaningful transformations of objects or entities over time, is a fundamental property of intelligent species. In this paper, we introduce an effective and interpretable network module, the Temporal Relation Network (TRN), designed to learn and reason about temporal dependencies between video frames at multiple time scales. We evaluate TRN-equipped networks on activity recognition tasks using three recent video datasets - Something-Something, Jester, and Charades - which fundamentally depend on temporal relational reasoning. Our results demonstrate that the proposed TRN gives convolutional neural networks a remarkable capacity to discover temporal relations in videos. Through only sparsely sampled video frames, TRN-equipped networks can accurately predict human-object interactions in the Something-Something dataset and identify various human gestures on the Jester dataset with very competitive performance. TRN-equipped networks also outperform two-stream networks and 3D convolution networks in recognizing daily activities in the Charades dataset. Further analyses show that the models learn intuitive and interpretable visual common sense knowledge in videos.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143018998-d2120c3d-a9a7-4e4c-90b1-1e5ff1fd5f06.png" width="800"/>
</div>
## Results and Models
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
| [trn_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 31.62 / 33.88 | 60.01 / 62.12 | 11010 | [ckpt](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/trn_r50_1x1x8_50e_sthv1_rgb_20210401-163704a8.pth) | [log](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log) | [json](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log.json) |
### Something-Something V2
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
| [trn_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb.py) | height 256 | 8 | ResNet50 | ImageNet | 48.39 / 51.28 | 76.58 / 78.65 | 11010 | [ckpt](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/trn_r50_1x1x8_50e_sthv2_rgb_20210816-7abbc4c1.pth) | [log](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log) | [json](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. Note that the provided configs assume 8 GPUs by default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
2. There are two kinds of test settings for Something-Something dataset, efficient setting (center crop x 1 clip) and accurate setting (Three crop x 2 clip).
3. In the original [repository](https://github.com/zhoubolei/TRN-pytorch), the author augments data with random flipping on the Something-Something dataset, but this augmentation may be incorrect for direction-sensitive actions such as `push left to right`. So we replaced `flip` with `flip with label mapping`, and changed the testing method from `TenCrop`, which includes five flipped crops, to `Twice Sample & ThreeCrop`.
4. We use `ResNet50` instead of `BNInception` as the backbone of TRN. When training `TRN-ResNet50` on the sthv1 dataset with the original repository, we get a top1 (top5) accuracy of 30.542 (58.627) vs. ours 31.62 (60.01).
:::
For more details on data preparation, you can refer to
- [preparing_sthv1](/tools/data/sthv1/README.md)
- [preparing_sthv2](/tools/data/sthv2/README.md)
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the TRN model on the sthv1 dataset deterministically, with periodic validation.
```shell
python tools/train.py configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py \
--work-dir work_dirs/trn_r50_1x1x8_50e_sthv1_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test TRN model on sthv1 dataset and dump the result to a json file.
```shell
python tools/test.py configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json
```
For more details, you can refer to **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@article{zhou2017temporalrelation,
title = {Temporal Relational Reasoning in Videos},
author = {Zhou, Bolei and Andonian, Alex and Oliva, Aude and Torralba, Antonio},
journal={European Conference on Computer Vision},
year={2018}
}
```
# TRN
## 简介
<!-- [ALGORITHM] -->
```BibTeX
@article{zhou2017temporalrelation,
title = {Temporal Relational Reasoning in Videos},
author = {Zhou, Bolei and Andonian, Alex and Oliva, Aude and Torralba, Antonio},
journal={European Conference on Computer Vision},
year={2018}
}
```
## 模型库
### Something-Something V1
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | top5 准确率 (efficient/accurate) | GPU 显存占用 (M) | ckpt | log | json |
| :------------------------------------------------------------------------------------- | :----: | :------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
| [trn_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 31.62 / 33.88 | 60.01 / 62.12 | 11010 | [ckpt](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/trn_r50_1x1x8_50e_sthv1_rgb_20210401-163704a8.pth) | [log](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log) | [json](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log.json) |
### Something-Something V2
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | top5 准确率 (efficient/accurate) | GPU 显存占用 (M) | ckpt | log | json |
| :------------------------------------------------------------------------------------- | :----: | :------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: |
| [trn_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb.py) | 高 256 | 8 | ResNet50 | ImageNet | 48.39 / 51.28 | 76.58 / 78.65 | 11010 | [ckpt](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/trn_r50_1x1x8_50e_sthv2_rgb_20210816-7abbc4c1.pth) | [log](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log) | [json](https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log.json) |
注:
1. 这里的 **GPU 数量** 指的是得到模型权重文件对应的 GPU 个数。默认地,MMAction2 所提供的配置文件对应使用 8 块 GPU 进行训练的情况。
依据 [线性缩放规则](https://arxiv.org/abs/1706.02677),当用户使用不同数量的 GPU 或者每块 GPU 处理不同视频个数时,需要根据批大小等比例地调节学习率。
如,lr=0.01 对应 4 GPUs x 2 video/gpu,以及 lr=0.08 对应 16 GPUs x 4 video/gpu。
2. 对于 Something-Something 数据集,有两种测试方案:efficient(对应 center crop x 1 clip)和 accurate(对应 Three crop x 2 clip)。
3. 在原代码库中,作者在 Something-Something 数据集上使用了随机水平翻转,但这种数据增强方法有一些问题,因为 Something-Something 数据集有一些方向性的动作,比如`从左往右推`。所以 MMAction2 把`随机水平翻转`改为`带标签映射的水平翻转`,同时修改了测试模型的数据处理方法,即把`裁剪 10 个图像块`(这里面包括 5 个翻转后的图像块)修改成`采帧两次 & 裁剪 3 个图像块`。
4. MMAction2 使用 `ResNet50` 代替 `BNInception` 作为 TRN 的主干网络。使用原代码,在 sthv1 数据集上训练 `TRN-ResNet50` 时,实验得到的 top1 (top5) 的准确度为 30.542 (58.627),而 MMAction2 的精度为 31.62 (60.01)。
关于数据处理的更多细节,用户可以参照
- [准备 sthv1](/tools/data/sthv1/README_zh-CN.md)
- [准备 sthv2](/tools/data/sthv2/README_zh-CN.md)
## 如何训练
用户可以使用以下指令进行模型训练。
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
例如:以一个确定性的训练方式,辅以定期的验证过程进行 TRN 模型在 sthv1 数据集上的训练。
```shell
python tools/train.py configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py \
--work-dir work_dirs/trn_r50_1x1x8_50e_sthv1_rgb \
--validate --seed 0 --deterministic
```
更多训练细节,可参考 [基础教程](/docs/zh_cn/getting_started.md#训练配置) 中的 **训练配置** 部分。
## 如何测试
用户可以使用以下指令进行模型测试。
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
例如:在 sthv1 数据集上测试 TRN 模型,并将结果导出为一个 json 文件。
```shell
python tools/test.py configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json
```
更多测试细节,可参考 [基础教程](/docs/zh_cn/getting_started.md#测试某个数据集) 中的 **测试某个数据集** 部分。
# Model-index metadata for the TRN configs: one collection entry plus one
# model entry per released checkpoint (Something-Something V1 and V2).
# The nesting below is significant; with every line flattened to column 0
# the mappings and sequences cannot be recovered by a YAML parser.
Collections:
- Name: TRN
  README: configs/recognition/trn/README.md
  Paper:
    URL: https://arxiv.org/abs/1711.08496
    Title: Temporal Relational Reasoning in Videos
Models:
- Config: configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py
  In Collection: TRN
  Metadata:
    Architecture: ResNet50
    Batch Size: 16
    Epochs: 50
    Parameters: 26641154
    Pretrained: ImageNet
    Resolution: height 100
    Training Data: SthV1
    Training Resources: 8 GPUs
  Modality: RGB
  Name: trn_r50_1x1x8_50e_sthv1_rgb
  Results:
  - Dataset: SthV1
    Metrics:
      # Unqualified accuracies are the "accurate" test setting; the
      # "(efficient)" ones are the center-crop, single-clip setting.
      Top 1 Accuracy: 33.88
      Top 1 Accuracy (efficient): 31.62
      Top 5 Accuracy: 62.12
      Top 5 Accuracy (efficient): 60.01
    Task: Action Recognition
  Training Json Log: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log.json
  Training Log: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/20210326_103948.log
  Weights: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb/trn_r50_1x1x8_50e_sthv1_rgb_20210401-163704a8.pth
- Config: configs/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb.py
  In Collection: TRN
  Metadata:
    Architecture: ResNet50
    Batch Size: 16
    Epochs: 50
    Parameters: 26641154
    Pretrained: ImageNet
    Resolution: height 256
    Training Data: SthV2
    Training Resources: 8 GPUs
  Modality: RGB
  Name: trn_r50_1x1x8_50e_sthv2_rgb
  Results:
  - Dataset: SthV2
    Metrics:
      Top 1 Accuracy: 51.28
      Top 1 Accuracy (efficient): 48.39
      Top 5 Accuracy: 78.65
      Top 5 Accuracy (efficient): 76.58
    Task: Action Recognition
  Training Json Log: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log.json
  Training Log: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/20210816_221356.log
  Weights: https://download.openmmlab.com/mmaction/recognition/trn/trn_r50_1x1x8_50e_sthv2_rgb/trn_r50_1x1x8_50e_sthv2_rgb_20210816-7abbc4c1.pth
# TRN (ResNet50) on Something-Something V1 raw frames, 8 single-frame clips
# per video, 50 epochs. Model, schedule and runtime defaults are inherited
# from the `_base_` files and selectively overridden below.
_base_ = [
    '../../_base_/models/trn_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings: override the number of classes of the classification head
model = {'cls_head': {'num_classes': 174}}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sthv1/rawframes'
data_root_val = 'data/sthv1/rawframes'
ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt'
ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'

# Label pairs passed to `Flip` so that flipped clips get the mirrored label.
sthv1_flip_label_map = {
    2: 4,
    4: 2,
    30: 41,
    41: 30,
    52: 66,
    66: 52,
}

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: multi-scale fixed crops, then flip with label mapping.
train_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1,
        num_fixed_crops=13,
    ),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(
        type='Flip',
        flip_ratio=0.5,
        flip_label_map=sthv1_flip_label_map,
    ),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label']),
]

# Validation: single sampling pass with a centre crop.
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs']),
]

# Testing: twice sampling combined with ThreeCrop.
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        twice_sample=True,
        test_mode=True,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs']),
]

# All three splits read frames named with the '{:05}.jpg' template.
data = {
    'videos_per_gpu': 16,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'filename_tmpl': '{:05}.jpg',
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'filename_tmpl': '{:05}.jpg',
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'filename_tmpl': '{:05}.jpg',
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer: overrides applied on top of the inherited schedule
optimizer = {
    'lr': 0.002,
    'paramwise_cfg': {'fc_lr5': False},
    'weight_decay': 5e-4,
}

# learning policy
lr_config = {'policy': 'step', 'step': [30, 45]}
total_epochs = 50

# runtime settings
find_unused_parameters = True
work_dir = './work_dirs/trn_r50_1x1x8_50e_sthv1_rgb/'
# TRN (ResNet50) on Something-Something V2 raw frames, 8 single-frame clips
# per video, 50 epochs. Model, schedule and runtime defaults are inherited
# from the `_base_` files and selectively overridden below.
_base_ = [
    '../../_base_/models/trn_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings: override the number of classes of the classification head
model = {'cls_head': {'num_classes': 174}}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sthv2/rawframes'
data_root_val = 'data/sthv2/rawframes'
ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt'
ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt'
ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt'

# Label pairs passed to `Flip` so that flipped clips get the mirrored label.
sthv2_flip_label_map = {
    86: 87,
    87: 86,
    93: 94,
    94: 93,
    166: 167,
    167: 166,
}

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: multi-scale fixed crops, then flip with label mapping.
train_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1,
        num_fixed_crops=13,
    ),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(
        type='Flip',
        flip_ratio=0.5,
        flip_label_map=sthv2_flip_label_map,
    ),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label']),
]

# Validation: single sampling pass with a centre crop.
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs']),
]

# Testing: twice sampling combined with ThreeCrop.
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        twice_sample=True,
        test_mode=True,
    ),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs']),
]

# No filename template is set here, so the dataset's default applies.
data = {
    'videos_per_gpu': 16,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer: overrides applied on top of the inherited schedule
optimizer = {
    'lr': 0.002,
    'paramwise_cfg': {'fc_lr5': False},
    'weight_decay': 5e-4,
}

# learning policy
lr_config = {'policy': 'step', 'step': [30, 45]}
total_epochs = 50

# runtime settings
find_unused_parameters = True
work_dir = './work_dirs/trn_r50_1x1x8_50e_sthv2_rgb/'
# TSM
[TSM: Temporal Shift Module for Efficient Video Understanding](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_TSM_Temporal_Shift_Module_for_Efficient_Video_Understanding_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
The explosive growth in video streaming gives rise to challenges on performing video understanding at high accuracy and low computation cost. Conventional 2D CNNs are computationally cheap but cannot capture temporal relationships; 3D CNN based methods can achieve good performance but are computationally intensive, making it expensive to deploy. In this paper, we propose a generic and effective Temporal Shift Module (TSM) that enjoys both high efficiency and high performance. Specifically, it can achieve the performance of 3D CNN but maintain 2D CNN's complexity. TSM shifts part of the channels along the temporal dimension; thus facilitate information exchanged among neighboring frames. It can be inserted into 2D CNNs to achieve temporal modeling at zero computation and zero parameters. We also extended TSM to online setting, which enables real-time low-latency online video recognition and video object detection. TSM is accurate and efficient: it ranks the first place on the Something-Something leaderboard upon publication; on Jetson Nano and Galaxy Note8, it achieves a low latency of 13ms and 35ms for online video recognition.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/34324155/143019083-abc0de39-9ea1-4175-be5c-073c90de64c3.png" width="800"/>
</div>
## Results and Models
### Kinetics-400
| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------------------------- | :------------: | :--: | :---------: | :------: | :------: | :------: | :-----------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------: | :--------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 340x256 | 8 | ResNet50 | ImageNet | 70.24 | 89.56 | [70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | [89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | 74.0 (8x1 frames) | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json) |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | ImageNet | 70.59 | 89.52 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json) |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | short-side 320 | 8 | ResNet50 | ImageNet | 70.73 | 89.81 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20210701-68d582b4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log.json) |
| [tsm_r50_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb.py) | short-side 320 | 8 | ResNet50 | ImageNet | 71.90 | 90.03 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/tsm_r50_1x1x8_100e_kinetics400_rgb_20210701-7ff22268.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log.json) |
| [tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | ImageNet | 70.48 | 89.40 | x | x | x | 7076 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219-bf96e6cc.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json) |
| [tsm_r50_video_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py) | short-side 256 | 8 | ResNet50 | ImageNet | 70.25 | 89.66 | [70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | [89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | 74.0 (8x1 frames) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json) |
| [tsm_r50_dense_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb.py) | short-side 320 | 8 | ResNet50 | ImageNet | 73.46 | 90.84 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/tsm_r50_dense_1x1x8_50e_kinetics400_rgb_20210701-a54ff3d3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log.json) |
| [tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) | short-side 320 | 8 | ResNet50 | ImageNet | 74.55 | 91.74 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20210701-e3e5e97f.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log.json) |
| [tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) | 340x256 | 8 | ResNet50 | ImageNet | 72.09 | 90.37 | [70.67](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh) | [89.98](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh) | 47.0 (16x1 frames) | 10404 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_340x256_1x1x16_50e_kinetics400_rgb_20201011-2f27f229.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log.json) |
| [tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | ImageNet | 71.89 | 90.73 | x | x | x | 10398 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log.json) |
| [tsm_r50_1x1x16_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb.py) | short-side 320 | 8 | ResNet50 | ImageNet | 72.80 | 90.75 | x | x | x | 10398 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/tsm_r50_1x1x16_100e_kinetics400_rgb_20210701-41ac92b9.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/20210618_193859.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/20210618_193859.log.json) |
| [tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py) | short-side 320 | 8x4 | ResNet50 | ImageNet | 72.03 | 90.25 | 71.81 | 90.36 | x | 8931 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200724-f00f1336.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log.json) |
| [tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py) | short-side 320 | 8x4 | ResNet50 | ImageNet | 70.70 | 89.90 | x | x | x | 10125 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200816-b93fd297.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log.json) |
| [tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py) | short-side 320 | 8x4 | ResNet50 | ImageNet | 71.60 | 90.34 | x | x | x | 8358 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb_20200724-d8ad84d2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log.json) |
| [tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py) | short-side 320 | 8 | MobileNetV2 | ImageNet | 68.46 | 88.64 | x | x | x | 3385 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/tsm_mobilenetv2_dense_320p_1x1x8_100e_kinetics400_rgb_20210202-61135809.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log.json) |
| [tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port](/configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py) | short-side 320 | 8 | MobileNetV2 | ImageNet | 69.89 | 89.01 | x | x | x | 3385 | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port_20210922-aa5cadf6.pth) | x | x |
### Diving48
| config | gpus | backbone | pretrain | top1 acc | top5 acc | gpu_mem(M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------------- | :--: | :------: | :------: | :------: | :------: | :--------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_video_1x1x8_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 75.99 | 97.16 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/tsm_r50_video_1x1x8_50e_diving48_rgb_20210426-aba5aa3d.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log.json) |
| [tsm_r50_video_1x1x16_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 81.62 | 97.66 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/tsm_r50_video_1x1x16_50e_diving48_rgb_20210426-aa9631c0.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log.json) |
### Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | reference top1 acc (efficient/accurate) | reference top5 acc (efficient/accurate) | gpu_mem(M) | ckpt | log | json |
| :----------------------------------------------------------------------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :--------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 45.58 / 47.70 | 75.02 / 76.12 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log.json) |
| [tsm_r50_flip_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 47.10 / 48.51 | 76.02 / 77.56 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/tsm_r50_flip_1x1x8_50e_sthv1_rgb_20210203-12596f16.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log.json) |
| [tsm_r50_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 47.16 / 48.90 | 76.07 / 77.92 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb_20210324-481268d9.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 47.65 / 48.66 | 76.67 / 77.41 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb-ee93e5e3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 46.26 / 47.68 | 75.92 / 76.49 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb-4f4f4740.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 47.85 / 50.31 | 76.78 / 78.18 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb_20210324-76937692.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 47.77 / 49.03 | 76.82 / 77.83 | [47.05 / 48.61](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [76.40 / 77.96](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 10390 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb_20211202-b922e5d2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.json) |
| [tsm_r101_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet101 | ImageNet | 46.09 / 48.59 | 75.41 / 77.10 | [46.64 / 48.13](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [75.40 / 77.31](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 9800 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb_20211202-49970a5b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.json) |
### Something-Something V2
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | reference top1 acc (efficient/accurate) | reference top5 acc (efficient/accurate) | gpu_mem(M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------- | :--------: | :--: | :-------: | :------: | :---------------------------: | :---------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------: | :--------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py) | height 256 | 8 | ResNet50 | ImageNet | 59.11 / 61.82 | 85.39 / 86.80 | [xx / 61.2](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7069 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/tsm_r50_256h_1x1x8_50e_sthv2_rgb_20210816-032aa4da.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log.json) |
| [tsm_r50_1x1x16_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py) | height 256 | 8 | ResNet50 | ImageNet | 61.06 / 63.19 | 86.66 / 87.93 | [xx / 63.1](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 10400 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/tsm_r50_256h_1x1x16_50e_sthv2_rgb_20210331-0a45549c.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log.json) |
| [tsm_r101_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py) | height 256 | 8 | ResNet101 | ImageNet | 60.88 / 63.84 | 86.56 / 88.30 | [xx / 63.3](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 9727 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/tsm_r101_256h_1x1x8_50e_sthv2_rgb_20210401-df97f3e1.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log.json) |
### MixUp & CutMix on Something-Something V1
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | delta top1 acc (efficient/accurate) | delta top5 acc (efficient/accurate) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :---------------------------------: | :---------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_mixup_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 46.35 / 48.49 | 75.07 / 76.88 | +0.77 / +0.79 | +0.05 / +0.70 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb-9eca48e5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_cutmix_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 45.92 / 47.46 | 75.23 / 76.71 | +0.34 / -0.24 | +0.21 / +0.59 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb-34934615.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.json) |
### Jester
| config | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | ckpt | log | json |
| ---------------------------------------------------------------------------------------- | :--------: | :--: | :------: | :------: | :---------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_jester_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb.py) | height 100 | 8 | ResNet50 | ImageNet | 96.5 / 97.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb-c799267e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.json) |
### HMDB51
| config | gpus | backbone | pretrain | top1 acc | top5 acc | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------- | :--: | :------: | :---------: | :------: | :------: | :--------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb.py) | 8 | ResNet50 | Kinetics400 | 72.68 | 92.03 | 10388 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb_20210630-10c74ee5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log.json) |
| [tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb.py) | 8 | ResNet50 | Kinetics400 | 74.77 | 93.86 | 10388 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb_20210630-4785548e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log.json) |
### UCF101
| config | gpus | backbone | pretrain | top1 acc | top5 acc | gpu_mem(M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------- | :--: | :------: | :---------: | :------: | :------: | :--------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb.py) | 8 | ResNet50 | Kinetics400 | 94.50 | 99.58 | 10389 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb_20210630-1fae312b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log.json) |
| [tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb.py) | 8 | ResNet50 | Kinetics400 | 94.58 | 99.37 | 10389 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb_20210630-8df9c358.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log.json) |
:::{note}
1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. Note that the configs we provide assume 8 GPUs by default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 videos/gpu and lr=0.08 for 16 GPUs x 4 videos/gpu.
2. The **inference_time** is measured with this [benchmark script](/tools/analysis/benchmark.py), where we use the frame sampling strategy of the test setting and only measure the model inference time,
excluding the IO time and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 to calculate the inference time.
3. The values in columns named after "reference" are the results obtained by training with the original repo, using the same model settings. The checkpoints for the reference repo can be downloaded [here](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_reference_ckpt.rar).
4. There are two kinds of test settings for the Something-Something dataset: the efficient setting (center crop x 1 clip) and the accurate setting (three crop x 2 clip), both of which follow the [original repo](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd).
We use the efficient setting by default in the provided config files; it can be changed to the accurate setting as follows:
```python
...
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=16, # `num_clips = 8` when using 8 segments
twice_sample=True, # set `twice_sample=True` for twice sample in accurate setting
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
# dict(type='CenterCrop', crop_size=224), it is used for efficient setting
dict(type='ThreeCrop', crop_size=256), # it is used for accurate setting
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
```
5. When applying Mixup and CutMix, we use the hyperparameter `alpha=0.2`.
6. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
7. The **infer_ckpt** label means that the checkpoint is ported from [TSM](https://github.com/mit-han-lab/temporal-shift-module/blob/master/test_models.py).
:::
For more details on data preparation, you can refer to corresponding parts in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the TSM model on the Kinetics-400 dataset with the deterministic option and periodic validation.
```shell
python tools/train.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
--work-dir work_dirs/tsm_r50_1x1x8_100e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to **Training setting** part in [getting_started](/docs/en/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the TSM model on the Kinetics-400 dataset and dump the result to a JSON file.
```shell
python tools/test.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json
```
For more details, you can refer to **Test a dataset** part in [getting_started](/docs/en/getting_started.md#test-a-dataset).
## Citation
```BibTeX
@inproceedings{lin2019tsm,
title={TSM: Temporal Shift Module for Efficient Video Understanding},
author={Lin, Ji and Gan, Chuang and Han, Song},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
year={2019}
}
```
<!-- [BACKBONE] -->
```BibTeX
@article{NonLocal2018,
author = {Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He},
title = {Non-local Neural Networks},
journal = {CVPR},
year = {2018}
}
```
# TSM
## 简介
<!-- [ALGORITHM] -->
```BibTeX
@inproceedings{lin2019tsm,
title={TSM: Temporal Shift Module for Efficient Video Understanding},
author={Lin, Ji and Gan, Chuang and Han, Song},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
year={2019}
}
```
<!-- [BACKBONE] -->
```BibTeX
@article{NonLocal2018,
author = {Xiaolong Wang and Ross Girshick and Abhinav Gupta and Kaiming He},
title = {Non-local Neural Networks},
journal = {CVPR},
year = {2018}
}
```
## 模型库
### Kinetics-400
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | 参考代码的 top1 准确率 | 参考代码的 top5 准确率 | 推理时间 (video/s) | GPU 显存占用 (M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------------------------- | :------: | :------: | :---------: | :------: | :---------: | :---------: | :-----------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------: | :----------------: | :--------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 340x256 | 8 | ResNet50 | ImageNet | 70.24 | 89.56 | [70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | [89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | 74.0 (8x1 frames) | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json) |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 短边 256 | 8 | ResNet50 | ImageNet | 70.59 | 89.52 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json) |
| [tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 短边 320 | 8 | ResNet50 | ImageNet | 70.73 | 89.81 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20210701-68d582b4.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log.json) |
| [tsm_r50_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb.py) | 短边 320 | 8 | ResNet50 | ImageNet | 71.90 | 90.03 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/tsm_r50_1x1x8_100e_kinetics400_rgb_20210701-7ff22268.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log.json) |
| [tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py) | 短边 256 | 8 | ResNet50 | ImageNet | 70.48 | 89.40 | x | x | x | 7076 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219-bf96e6cc.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json) |
| [tsm_r50_video_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py) | 短边 256 | 8 | ResNet50 | ImageNet | 70.25 | 89.66 | [70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | [89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh) | 74.0 (8x1 frames) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json) |
| [tsm_r50_dense_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb.py) | 短边 320 | 8 | ResNet50 | ImageNet | 73.46 | 90.84 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/tsm_r50_dense_1x1x8_50e_kinetics400_rgb_20210701-a54ff3d3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log.json) |
| [tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) | 短边 320 | 8 | ResNet50 | ImageNet | 74.55 | 91.74 | x | x | x | 7079 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20210701-e3e5e97f.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log.json) |
| [tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) | 340x256 | 8 | ResNet50 | ImageNet | 72.09 | 90.37 | [70.67](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh) | [89.98](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh) | 47.0 (16x1 frames) | 10404 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_340x256_1x1x16_50e_kinetics400_rgb_20201011-2f27f229.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log.json) |
| [tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) | 短边 256 | 8x4 | ResNet50 | ImageNet | 71.89 | 90.73 | x | x | x | 10398 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log.json) |
| [tsm_r50_1x1x16_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb.py) | 短边 320 | 8 | ResNet50 | ImageNet | 72.80 | 90.75 | x | x | x | 10398 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/tsm_r50_1x1x16_100e_kinetics400_rgb_20210701-41ac92b9.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/20210618_193859.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb/20210618_193859.log.json) |
| [tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py) | 短边 320 | 8x4 | ResNet50 | ImageNet | 72.03 | 90.25 | 71.81 | 90.36 | x | 8931 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200724-f00f1336.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log.json) |
| [tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py) | 短边 320 | 8x4 | ResNet50 | ImageNet | 70.70 | 89.90 | x | x | x | 10125 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200816-b93fd297.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log.json) |
| [tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py) | 短边 320 | 8x4 | ResNet50 | ImageNet | 71.60 | 90.34 | x | x | x | 8358 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb_20200724-d8ad84d2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log.json) |
| [tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py) | 短边 320 | 8 | MobileNetV2 | ImageNet | 68.46 | 88.64 | x | x | x | 3385 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/tsm_mobilenetv2_dense_320p_1x1x8_100e_kinetics400_rgb_20210202-61135809.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log.json) |
| [tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port](/configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py) | 短边 320 | 8 | MobileNetV2 | ImageNet | 69.89 | 89.01 | x | x | x | 3385 | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port_20210922-aa5cadf6.pth) | x | x |
### Diving48
| 配置文件 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | GPU 显存占用 (M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------------- | :------: | :------: | :------: | :---------: | :---------: | :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_video_1x1x8_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 75.99 | 97.16 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/tsm_r50_video_1x1x8_50e_diving48_rgb_20210426-aba5aa3d.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log.json) |
| [tsm_r50_video_1x1x16_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 81.62 | 97.66 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/tsm_r50_video_1x1x16_50e_diving48_rgb_20210426-aa9631c0.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log.json) |
### Something-Something V1
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | top5 准确率 (efficient/accurate) | 参考代码的 top1 准确率 (efficient/accurate) | 参考代码的 top5 准确率 (efficient/accurate) | GPU 显存占用 (M) | ckpt | log | json |
| :----------------------------------------------------------------------------------------------------------------------- | :----: | :------: | :------: | :------: | :------------------------------: | :------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :--------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 45.58 / 47.70 | 75.02 / 76.12 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log.json) |
| [tsm_r50_flip_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 47.10 / 48.51 | 76.02 / 77.56 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/tsm_r50_flip_1x1x8_50e_sthv1_rgb_20210203-12596f16.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log.json) |
| [tsm_r50_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 47.16 / 48.90 | 76.07 / 77.92 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb_20210324-481268d9.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 47.65 / 48.66 | 76.67 / 77.41 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb-ee93e5e3.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 46.26 / 47.68 | 75.92 / 76.49 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb-4f4f4740.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb/tsm_r50_ptv_augmix_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 47.85 / 50.31 | 76.78 / 78.18 | [45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7077 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb_20210324-76937692.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 47.77 / 49.03 | 76.82 / 77.83 | [47.05 / 48.61](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [76.40 / 77.96](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 10390 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb_20211202-b922e5d2.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.json) |
| [tsm_r101_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet101 | ImageNet | 46.09 / 48.59 | 75.41 / 77.10 | [46.64 / 48.13](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [75.40 / 77.31](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 9800 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb_20201010-43fedf2e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.json) |
### Something-Something V2
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | top5 准确率 (efficient/accurate) | 参考代码的 top1 准确率 (efficient/accurate) | 参考代码的 top5 准确率 (efficient/accurate) | GPU 显存占用 (M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------- | :----: | :------: | :-------: | :------: | :------------------------------: | :------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py) | 高 256 | 8 | ResNet50 | ImageNet | 59.11 / 61.82 | 85.39 / 86.80 | [xx / 61.2](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 7069 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/tsm_r50_256h_1x1x8_50e_sthv2_rgb_20210816-032aa4da.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log.json) |
| [tsm_r50_1x1x16_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py) | 高 256 | 8 | ResNet50 | ImageNet | 61.06 / 63.19 | 86.66 / 87.93 | [xx / 63.1](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 10400 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/tsm_r50_256h_1x1x16_50e_sthv2_rgb_20210331-0a45549c.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log.json) |
| [tsm_r101_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py) | 高 256 | 8 | ResNet101 | ImageNet | 60.88 / 63.84 | 86.56 / 88.30 | [xx / 63.3](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | [xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training) | 9727 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/tsm_r101_256h_1x1x8_50e_sthv2_rgb_20210401-df97f3e1.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log.json) |
### Diving48
| 配置文件 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | GPU 显存占用 (M) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------------- | :------: | :------: | :------: | :---------: | :---------: | :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_video_1x1x8_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 75.99 | 97.16 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/tsm_r50_video_1x1x8_50e_diving48_rgb_20210426-aba5aa3d.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log.json) |
| [tsm_r50_video_1x1x16_50e_diving48_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb.py) | 8 | ResNet50 | ImageNet | 81.62 | 97.66 | 7070 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/tsm_r50_video_1x1x16_50e_diving48_rgb_20210426-aa9631c0.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log.json) |
### MixUp & CutMix on Something-Something V1
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | top5 准确率 (efficient/accurate) | top1 准确率变化 (efficient/accurate) | top5 准确率变化 (efficient/accurate) | ckpt | log | json |
| :--------------------------------------------------------------------------------------------------- | :----: | :------: | :------: | :------: | :------------------------------: | :------------------------------: | :----------------------------------: | :----------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_mixup_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 46.35 / 48.49 | 75.07 / 76.88 | +0.77 / +0.79 | +0.05 / +0.76 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb-9eca48e5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.json) |
| [tsm_r50_cutmix_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 45.92 / 47.46 | 75.23 / 76.71 | +0.34 / -0.24 | +0.21 / +0.59 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb-34934615.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.json) |
### Jester
| 配置文件 | 分辨率 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 (efficient/accurate) | ckpt | log | json |
| ---------------------------------------------------------------------------------------- | :----: | :------: | :------: | :------: | :------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------: |
| [tsm_r50_1x1x8_50e_jester_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb.py) | 高 100 | 8 | ResNet50 | ImageNet | 96.5 / 97.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb-c799267e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.json) |
### HMDB51
| 配置文件 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | GPU 显存占用 (M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------- | :------: | :------: | :---------: | :---------: | :---------: | :--------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb.py) | 8 | ResNet50 | Kinetics400 | 72.68 | 92.03 | 10388 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb_20210630-10c74ee5.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log.json) |
| [tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb.py) | 8 | ResNet50 | Kinetics400 | 74.77 | 93.86 | 10388 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb_20210630-4785548e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log.json) |
### UCF101
| 配置文件 | GPU 数量 | 主干网络 | 预训练 | top1 准确率 | top5 准确率 | GPU 显存占用 (M) | ckpt | log | json |
| :------------------------------------------------------------------------------------------------------------------------- | :------: | :------: | :---------: | :---------: | :---------: | :--------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------: |
| [tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb.py) | 8 | ResNet50 | Kinetics400 | 94.50 | 99.58 | 10389 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb_20210630-1fae312b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log.json) |
| [tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb](/configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb.py) | 8 | ResNet50 | Kinetics400 | 94.58 | 99.37 | 10389 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb_20210630-8df9c358.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log.json) |
注:
1. 这里的 **GPU 数量** 指的是得到模型权重文件对应的 GPU 个数。默认地,MMAction2 所提供的配置文件对应使用 8 块 GPU 进行训练的情况。
依据 [线性缩放规则](https://arxiv.org/abs/1706.02677),当用户使用不同数量的 GPU 或者每块 GPU 处理不同视频个数时,需要根据批大小等比例地调节学习率。
如,lr=0.01 对应 4 GPUs x 2 video/gpu,以及 lr=0.08 对应 16 GPUs x 4 video/gpu。
2. 这里的 **推理时间** 是根据 [基准测试脚本](/tools/analysis/benchmark.py) 获得的,采用测试时的采帧策略,且只考虑模型的推理时间,
并不包括 IO 时间以及预处理时间。对于每个配置,MMAction2 使用 1 块 GPU 并设置批大小(每块 GPU 处理的视频个数)为 1 来计算推理时间。
3. 参考代码的结果是通过使用相同的模型配置在原来的代码库上训练得到的。对应的模型权重文件可从 [这里](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_reference_ckpt.rar) 下载。
4. 对于 Something-Something 数据集,有两种测试方案:efficient(对应 center crop x 1 clip)和 accurate(对应 Three crop x 2 clip)。两种方案参考自 [原始代码库](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd)。
MMAction2 使用 efficient 方案作为配置文件中的默认选择,用户可以通过以下方式转变为 accurate 方案:
```python
...
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=16, # 当使用 8 个 视频段时,设置 `num_clips = 8`
twice_sample=True, # 设置 `twice_sample=True` 用于 accurate 方案中的 Twice Sample
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
# dict(type='CenterCrop', crop_size=224), 用于 efficient 方案
dict(type='ThreeCrop', crop_size=256), # 用于 accurate 方案
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
```
5. 当采用 Mixup 和 CutMix 的数据增强时,使用超参 `alpha=0.2`。
6. 我们使用的 Kinetics400 验证集包含 19796 个视频,用户可以从 [验证集视频](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB) 下载这些视频。同时也提供了对应的 [数据列表](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (每行格式为:视频 ID,视频帧数目,类别序号)以及 [标签映射](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (类别序号到类别名称)。
7. 这里的 **infer_ckpt** 表示该模型权重文件是从 [TSM](https://github.com/mit-han-lab/temporal-shift-module/blob/master/test_models.py) 导入的。
对于数据集准备的细节,用户可参考 [数据集准备文档](/docs/zh_cn/data_preparation.md) 中的 Kinetics400、Something-Something V1 和 Something-Something V2 部分。
## 如何训练
用户可以使用以下指令进行模型训练。
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
例如:以一个确定性的训练方式,辅以定期的验证过程进行 TSM 模型在 Kinetics-400 数据集上的训练。
```shell
python tools/train.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
--work-dir work_dirs/tsm_r50_1x1x8_50e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
更多训练细节,可参考 [基础教程](/docs/zh_cn/getting_started.md#训练配置) 中的 **训练配置** 部分。
## 如何测试
用户可以使用以下指令进行模型测试。
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
例如:在 Kinetics-400 数据集上测试 TSM 模型,并将结果导出为一个 json 文件。
```shell
python tools/test.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json
```
更多测试细节,可参考 [基础教程](/docs/zh_cn/getting_started.md#测试某个数据集) 中的 **测试某个数据集** 部分。
Collections:
- Name: TSM
README: configs/recognition/tsm/README.md
Paper:
URL: https://arxiv.org/abs/1811.08383
Title: "TSM: Temporal Shift Module for Efficient Video Understanding"
Models:
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.24
Top 5 Accuracy: 89.56
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.59
Top 5 Accuracy: 89.52
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.73
Top 5 Accuracy: 89.81
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20210616_021451.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20210701-68d582b4.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.9
Top 5 Accuracy: 90.03
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/20210617_103543.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_100e_kinetics400_rgb/tsm_r50_1x1x8_100e_kinetics400_rgb_20210701-7ff22268.pth
- Config: configs/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.48
Top 5 Accuracy: 89.4
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219-bf96e6cc.pth
- Config: configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_video_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.25
Top 5 Accuracy: 89.66
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth
- Config: configs/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_dense_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.46
Top 5 Accuracy: 90.84
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/20210617_103245.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_50e_kinetics400_rgb/tsm_r50_dense_1x1x8_50e_kinetics400_rgb_20210701-a54ff3d3.pth
- Config: configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 32965562368
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_dense_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.55
Top 5 Accuracy: 91.74
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20210613_034931.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20210701-e3e5e97f.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 50
FLOPs: 65931124736
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x16_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.09
Top 5 Accuracy: 90.37
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20201011_205356.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_340x256_1x1x16_50e_kinetics400_rgb_20201011-2f27f229.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 50
FLOPs: 65931124736
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: tsm_r50_1x1x16_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.89
Top 5 Accuracy: 90.73
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20201010_224825.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x16_100e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 100
FLOPs: 65931124736
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x16_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.80
Top 5 Accuracy: 90.75
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20210621_115844.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20210621_115844.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_1x1x16_50e_kinetics400_rgb_20210701-7c0c5d54.pth
- Config: configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 49457811456
Parameters: 31682000
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.03
Top 5 Accuracy: 90.25
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200724-f00f1336.pth
- Config: configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 41231355904
Parameters: 28007888
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.7
Top 5 Accuracy: 89.9
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200815_210253.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb_20200816-b93fd297.pth
- Config: configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 49457811456
Parameters: 31682000
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.6
Top 5 Accuracy: 90.34
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/20200723_220442.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb_20200724-d8ad84d2.pth
- Config: configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: MobileNetV2
Batch Size: 8
Epochs: 100
FLOPs: 3337519104
Parameters: 2736272
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 68.46
Top 5 Accuracy: 88.64
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/20210129_024936.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/tsm_mobilenetv2_dense_320p_1x1x8_100e_kinetics400_rgb_20210202-61135809.pth
- Config: configs/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32959795200
Parameters: 23606384
Pretrained: ImageNet
Training Data: Diving48
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_video_1x1x8_50e_diving48_rgb
Results:
- Dataset: Diving48
Metrics:
Top 1 Accuracy: 75.99
Top 5 Accuracy: 97.16
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/20210426_012424.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_50e_diving48_rgb/tsm_r50_video_1x1x8_50e_diving48_rgb_20210426-aba5aa3d.pth
- Config: configs/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 50
FLOPs: 65919590400
Parameters: 23606384
Pretrained: ImageNet
Training Data: Diving48
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_video_1x1x16_50e_diving48_rgb
Results:
- Dataset: Diving48
Metrics:
Top 1 Accuracy: 81.62
Top 5 Accuracy: 97.66
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/20210426_012823.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x16_50e_diving48_rgb/tsm_r50_video_1x1x16_50e_diving48_rgb_20210426-aa9631c0.pth
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32961859584
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 47.7
Top 1 Accuracy (efficient): 45.58
Top 5 Accuracy: 76.12
Top 5 Accuracy (efficient): 75.02
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth
reference top1 acc (efficient/accurate): '[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32961859584
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_flip_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 48.51
Top 1 Accuracy (efficient): 47.1
Top 5 Accuracy: 77.56
Top 5 Accuracy (efficient): 76.02
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/tsm_r50_flip_1x1x8_50e_sthv1_rgb_20210203-12596f16.pth
reference top1 acc (efficient/accurate): '[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32961859584
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_randaugment_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 48.9
Top 1 Accuracy (efficient): 47.16
Top 5 Accuracy: 77.92
Top 5 Accuracy (efficient): 76.07
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb_20210324-481268d9.pth
reference top1 acc (efficient/accurate): '[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 32961859584
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 50.31
Top 1 Accuracy (efficient): 47.85
Top 5 Accuracy: 78.18
Top 5 Accuracy (efficient): 76.78
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb_20210324-76937692.pth
reference top1 acc (efficient/accurate): '[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 50
FLOPs: 65923719168
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x16_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 49.03
Top 1 Accuracy (efficient): 47.77
Top 5 Accuracy: 77.83
Top 5 Accuracy (efficient): 76.82
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb_20211202-b922e5d2.pth
reference top1 acc (efficient/accurate): '[47.05 / 48.61](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[76.40 / 77.96](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet101
Batch Size: 8
Epochs: 50
FLOPs: 62782459904
Parameters: 42856686
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r101_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 48.59
Top 1 Accuracy (efficient): 46.09
Top 5 Accuracy: 77.10
Top 5 Accuracy (efficient): 75.41
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb_20211202-49970a5b.pth
reference top1 acc (efficient/accurate): '[46.64 / 48.13](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[75.40 / 77.31](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 50
FLOPs: 32961859584
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 256
Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_sthv2_rgb
Results:
- Dataset: SthV2
Metrics:
Top 1 Accuracy: 61.82
Top 1 Accuracy (efficient): 59.11
Top 5 Accuracy: 86.80
Top 5 Accuracy (efficient): 85.39
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/20210816_224310.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb/tsm_r50_256h_1x1x8_50e_sthv2_rgb_20210816-032aa4da.pth
reference top1 acc (efficient/accurate): '[57.98 / 60.69](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[84.57 / 86.28](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 50
FLOPs: 65923719168
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 256
Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x16_50e_sthv2_rgb
Results:
- Dataset: SthV2
Metrics:
Top 1 Accuracy: 63.19
Top 1 Accuracy (efficient): 61.06
Top 5 Accuracy: 87.93
Top 5 Accuracy (efficient): 86.66
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20210331_134458.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/tsm_r50_256h_1x1x16_50e_sthv2_rgb_20210331-0a45549c.pth
reference top1 acc (efficient/accurate): '[xx / 63.1](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet101
Batch Size: 8
Epochs: 50
FLOPs: 62782459904
Parameters: 42856686
Pretrained: ImageNet
Resolution: height 256
Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r101_1x1x8_50e_sthv2_rgb
Results:
- Dataset: SthV2
Metrics:
Top 1 Accuracy: 63.84
Top 1 Accuracy (efficient): 60.88
Top 5 Accuracy: 88.30
Top 5 Accuracy (efficient): 86.56
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20210401_143656.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/tsm_r101_256h_1x1x8_50e_sthv2_rgb_20210401-df97f3e1.pth
reference top1 acc (efficient/accurate): '[xx / 63.3](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
reference top5 acc (efficient/accurate): '[xx / xx](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)'
- Config: configs/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 43051352064
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_mixup_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 48.49
Top 1 Accuracy (efficient): 46.35
Top 5 Accuracy: 76.88
Top 5 Accuracy (efficient): 75.07
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_mixup_1x1x8_50e_sthv1_rgb/tsm_r50_mixup_1x1x8_50e_sthv1_rgb-9eca48e5.pth
delta top1 acc (efficient/accurate): +0.77 / +0.79
delta top5 acc (efficient/accurate): +0.05 / +0.70
- Config: configs/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 43051352064
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_cutmix_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 47.46
Top 1 Accuracy (efficient): 45.92
Top 5 Accuracy: 76.71
Top 5 Accuracy (efficient): 75.23
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb/tsm_r50_cutmix_1x1x8_50e_sthv1_rgb-34934615.pth
delta top1 acc (efficient/accurate): +0.34 / -0.24
delta top5 acc (efficient/accurate): +0.21 / +0.59
- Config: configs/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 43048943616
Parameters: 23563355
Pretrained: ImageNet
Resolution: height 100
Training Data: Jester
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_r50_1x1x8_50e_jester_rgb
Results:
- Dataset: Jester
Metrics:
Top 1 Accuracy: 97.2
Top 1 Accuracy (efficient): 96.5
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_jester_rgb/tsm_r50_1x1x8_50e_jester_rgb-c799267e.pth
- Config: configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 25
FLOPs: 32959844352
Parameters: 23612531
Pretrained: Kinetics400
Training Data: HMDB51
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb
Results:
- Dataset: HMDB51
Metrics:
Top 1 Accuracy: 72.68
Top 5 Accuracy: 92.03
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/20210605_182554.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb_20210630-10c74ee5.pth
gpu_mem(M): '10388'
- Config: configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 25
FLOPs: 65919688704
Parameters: 23612531
Pretrained: Kinetics400
Training Data: HMDB51
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb
Results:
- Dataset: HMDB51
Metrics:
Top 1 Accuracy: 74.77
Top 5 Accuracy: 93.86
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/20210605_182505.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb_20210630-4785548e.pth
gpu_mem(M): '10388'
- Config: configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 25
FLOPs: 32960663552
Parameters: 23714981
Pretrained: Kinetics400
Training Data: UCF101
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb
Results:
- Dataset: UCF101
Metrics:
Top 1 Accuracy: 94.5
Top 5 Accuracy: 99.58
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/20210605_182720.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb_20210630-1fae312b.pth
gpu_mem(M): '10389'
- Config: configs/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb.py
In Collection: TSM
Metadata:
Architecture: ResNet50
Batch Size: 6
Epochs: 25
FLOPs: 65921327104
Parameters: 23714981
Pretrained: Kinetics400
Training Data: UCF101
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb
Results:
- Dataset: UCF101
Metrics:
Top 1 Accuracy: 94.58
Top 5 Accuracy: 99.37
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/20210605_182720.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb_20210630-8df9c358.pth
gpu_mem(M): '10389'
- Config: configs/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb.py
In Collection: TSM
Metadata:
Architecture: MobileNetV2
Batch Size: 8
Epochs: 100
FLOPs: 3337519104
Parameters: 2736272
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 69.89
Top 5 Accuracy: 89.01
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsm/tsm_mobilenetv2_dense_1x1x8_kinetics400_rgb_port_20210922-aa5cadf6.pth
gpu_mem(M): '3385'
# TSM config built on the tsm_r50 base model for HMDB51 raw frames:
# 16 segments per video, 25 epochs, weights initialised from ``load_from``.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {'num_segments': 16},
    'cls_head': {'num_classes': 51, 'num_segments': 16},
}

# dataset settings
split = 1  # substituted into the annotation file names below
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = f'data/hmdb51/hmdb51_train_split_{split}_rawframes.txt'
ann_file_val = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
ann_file_test = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 16 single-frame clips, multi-scale crop, resize to 224x224.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 6,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'lr': 0.00075,  # this lr is used for 8 gpus
}

# learning policy
lr_config = {'policy': 'step', 'step': [10, 20]}
total_epochs = 25
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth'  # noqa: E501

# runtime settings
work_dir = './work_dirs/tsm_k400_pretrained_r50_1x1x16_25e_hmdb51_rgb/'
# TSM config built on the tsm_r50 base model for UCF101 raw frames:
# 16 segments per video, 25 epochs, weights initialised from ``load_from``.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {'num_segments': 16},
    'cls_head': {'num_classes': 101, 'num_segments': 16},
}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/ucf101/rawframes/'
data_root_val = 'data/ucf101/rawframes/'
split = 1  # official train/test splits. valid numbers: 1, 2, 3
ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt'
ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 16 single-frame clips, multi-scale crop, resize to 224x224.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 16,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 6,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'lr': 0.00075,  # this lr is used for 8 gpus
}

# learning policy
lr_config = {'policy': 'step', 'step': [10, 20]}
total_epochs = 25
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20201010-85645c2a.pth'  # noqa: E501

# runtime settings
work_dir = './work_dirs/tsm_k400_pretrained_r50_1x1x16_25e_ucf101_rgb/'
# TSM config built on the tsm_r50 base model for HMDB51 raw frames:
# 8 segments per video, 25 epochs, weights initialised from ``load_from``.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {'num_segments': 8},
    'cls_head': {'num_classes': 51, 'num_segments': 8},
}

# dataset settings
split = 1  # substituted into the annotation file names below
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = f'data/hmdb51/hmdb51_train_split_{split}_rawframes.txt'
ann_file_val = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
ann_file_test = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 8 single-frame clips, multi-scale crop, resize to 224x224.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 12,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'lr': 0.0015,  # this lr is used for 8 gpus
}

# learning policy
lr_config = {'policy': 'step', 'step': [10, 20]}
total_epochs = 25
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth'  # noqa: E501

# runtime settings
work_dir = './work_dirs/tsm_k400_pretrained_r50_1x1x8_25e_hmdb51_rgb/'
# TSM config built on the tsm_r50 base model for UCF101 raw frames:
# 8 segments per video, 25 epochs, weights initialised from ``load_from``.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {'num_segments': 8},
    'cls_head': {'num_classes': 101, 'num_segments': 8},
}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/ucf101/rawframes/'
data_root_val = 'data/ucf101/rawframes/'
split = 1  # official train/test splits. valid numbers: 1, 2, 3
ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt'
ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 8 single-frame clips, multi-scale crop, resize to 224x224.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 12,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 1,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'lr': 0.0015,  # this lr is used for 8 gpus
}

# learning policy
lr_config = {'policy': 'step', 'step': [10, 20]}
total_epochs = 25
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth'  # noqa: E501

# runtime settings
work_dir = './work_dirs/tsm_k400_pretrained_r50_1x1x8_25e_ucf101_rgb/'
# TSM config built on the tsm_mobilenet_v2 base model for Kinetics-400
# raw frames, 8 clips per video.
_base_ = [
    '../../_base_/models/tsm_mobilenet_v2.py',
    '../../_base_/schedules/sgd_tsm_mobilenet_v2_100e.py',
    '../../_base_/default_runtime.py',
]

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# NOTE(review): training samples with 'SampleFrames' while the val/test
# pipelines below use 'DenseSampleFrames' -- confirm this is intended.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Flip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: dense test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: dense test-mode sampling with ThreeCrop at crop size 256.
test_pipeline = [
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 8,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 5,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# runtime settings
checkpoint_config = {'interval': 1}
work_dir = './work_dirs/tsm_mobilenetv2_dense_1x1x8_100e_kinetics400_rgb/'
# TSM config built on the tsm_mobilenet_v2 base model for Kinetics-400
# video files, using dense sampling of 8 clips in every pipeline.
_base_ = [
    '../../_base_/models/tsm_mobilenet_v2.py',
    '../../_base_/schedules/sgd_tsm_mobilenet_v2_100e.py',
    '../../_base_/default_runtime.py',
]

# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: dense sampling, multi-scale crop, resize to 224x224, random flip.
train_pipeline = [
    {'type': 'DecordInit'},
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'DecordDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Flip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: dense test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {'type': 'DecordInit'},
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'DecordDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: dense test-mode sampling with ThreeCrop at crop size 256.
test_pipeline = [
    {'type': 'DecordInit'},
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'DecordDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 8,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 5,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'lr': 0.01,  # this lr is used for 8 gpus
}

# runtime settings
checkpoint_config = {'interval': 5}
work_dir = './work_dirs/tsm_mobilenetv2_dense_video_1x1x8_100e_kinetics400_rgb/'  # noqa
# Test-only TSM config on the tsm_mobilenet_v2 base model: only a test
# pipeline and a test dataset over Kinetics-400 validation videos are set.
_base_ = [
    '../../_base_/models/tsm_mobilenet_v2.py',
]

# dataset settings
dataset_type = 'VideoDataset'
data_root_val = 'data/kinetics400/videos_val'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Dense test-mode sampling of 8 single-frame clips, then a 224 center crop.
test_pipeline = [
    {'type': 'DecordInit'},
    {
        'type': 'DenseSampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'DecordDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 4,
    'workers_per_gpu': 2,
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
# TSM config built on the tsm_r50 base model with non-local blocks in
# 'dot_product' mode, for Kinetics-400 raw frames with 8 clips per video.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {
        # One inner tuple per backbone stage; presumably a 1 marks a block
        # that gets a non-local module -- confirm against the backbone.
        'non_local': (
            (0, 0, 0),
            (1, 0, 1, 0),
            (1, 0, 1, 0, 1, 0),
            (0, 0, 0),
        ),
        'non_local_cfg': {
            'sub_sample': True,
            'use_scale': False,
            'norm_cfg': {'type': 'BN3d', 'requires_grad': True},
            'mode': 'dot_product',
        },
    },
}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 8 single-frame clips, multi-scale crop, resize to 224x224, flip.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Flip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 8,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 5,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# runtime settings
# The directory name follows the non-local mode configured above. It used
# to point at the 'tsm_nl_gaussian_...' directory, which does not match
# this config's 'dot_product' mode.
work_dir = './work_dirs/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb/'
# TSM config built on the tsm_r50 base model with non-local blocks in
# 'embedded_gaussian' mode, for Kinetics-400 raw frames with 8 clips per video.
_base_ = [
    '../../_base_/models/tsm_r50.py',
    '../../_base_/schedules/sgd_tsm_50e.py',
    '../../_base_/default_runtime.py',
]

# model settings
model = {
    'backbone': {
        # One inner tuple per backbone stage; presumably a 1 marks a block
        # that gets a non-local module -- confirm against the backbone.
        'non_local': (
            (0, 0, 0),
            (1, 0, 1, 0),
            (1, 0, 1, 0, 1, 0),
            (0, 0, 0),
        ),
        'non_local_cfg': {
            'sub_sample': True,
            'use_scale': False,
            'norm_cfg': {'type': 'BN3d', 'requires_grad': True},
            'mode': 'embedded_gaussian',
        },
    },
}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 8 single-frame clips, multi-scale crop, resize to 224x224, flip.
train_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {
        'type': 'MultiScaleCrop',
        'input_size': 224,
        'scales': (1, 0.875, 0.75, 0.66),
        'random_crop': False,
        'max_wh_scale_gap': 1,
        'num_fixed_crops': 13,
    },
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Flip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: test-mode sampling followed by a 224 center crop.
val_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: same steps as validation, kept as an independent list.
test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 8,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 8,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 5,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# runtime settings
work_dir = './work_dirs/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/'  # noqa: E501
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment