[Temporal Relational Reasoning in Videos](https://openaccess.thecvf.com/content_ECCV_2018/html/Bolei_Zhou_Temporal_Relational_Reasoning_ECCV_2018_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Temporal relational reasoning, the ability to link meaningful transformations of objects or entities over time, is a fundamental property of intelligent species. In this paper, we introduce an effective and interpretable network module, the Temporal Relation Network (TRN), designed to learn and reason about temporal dependencies between video frames at multiple time scales. We evaluate TRN-equipped networks on activity recognition tasks using three recent video datasets - Something-Something, Jester, and Charades - which fundamentally depend on temporal relational reasoning. Our results demonstrate that the proposed TRN gives convolutional neural networks a remarkable capacity to discover temporal relations in videos. Through only sparsely sampled video frames, TRN-equipped networks can accurately predict human-object interactions in the Something-Something dataset and identify various human gestures on the Jester dataset with very competitive performance. TRN-equipped networks also outperform two-stream networks and 3D convolution networks in recognizing daily activities in the Charades dataset. Further analyses show that the models learn intuitive and interpretable visual common sense knowledge in videos.
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. Note that the configs we provide assume 8 GPUs by default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the total batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 videos/gpu and lr=0.08 for 16 GPUs x 4 videos/gpu.
2. There are two kinds of test settings for the Something-Something dataset: the efficient setting (CenterCrop x 1 clip) and the accurate setting (ThreeCrop x 2 clips).
3. In the original [repository](https://github.com/zhoubolei/TRN-pytorch), the author augments data with random flipping on the Something-Something dataset, but this augmentation is problematic for direction-sensitive actions such as `push left to right`. We therefore replaced `flip` with `flip with label mapping` (see the sketch after this list), and changed the testing method from `TenCrop`, which contains five flipped crops, to `Twice Sample & ThreeCrop`.
4. We use `ResNet50` instead of `BNInception` as the backbone of TRN. Training `TRN-ResNet50` on the sthv1 dataset with the original repository gives top1 (top5) accuracy 30.542 (58.627), vs. 31.62 (60.01) with ours.
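For the `flip with label mapping` augmentation in note 3, a minimal sketch of the relevant pipeline entry, assuming the `Flip` transform accepts a `flip_label_map` argument; the class-index pairs below are placeholders and must be looked up in the sthv1 label map:

```python
train_pipeline = [
    ...,
    # Flip frames horizontally with probability 0.5. When a clip is flipped,
    # swap the labels of direction-sensitive classes, e.g.
    # `push left to right` <-> `push right to left`.
    dict(type='Flip', flip_ratio=0.5, flip_label_map={86: 87, 87: 86}),  # placeholder indices
    ...,
]
```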
For more details on data preparation, you can refer to

- [preparing_sthv1](/tools/data/sthv1/README.md)
- [preparing_sthv2](/tools/data/sthv2/README.md)
## Train
You can use the following command to train a model.
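A sketch of the usual `tools/train.py` entry point:

```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

For example, to train TRN on sthv1 with periodic validation and fixed seeds (the config path is assumed from this repo's naming convention):

```shell
python tools/train.py configs/recognition/trn/trn_r50_1x1x8_50e_sthv1_rgb.py \
    --validate --seed 0 --deterministic
```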
[TSM: Temporal Shift Module for Efficient Video Understanding](https://openaccess.thecvf.com/content_ICCV_2019/html/Lin_TSM_Temporal_Shift_Module_for_Efficient_Video_Understanding_ICCV_2019_paper.html)
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
The explosive growth in video streaming gives rise to challenges on performing video understanding at high accuracy and low computation cost. Conventional 2D CNNs are computationally cheap but cannot capture temporal relationships; 3D CNN-based methods can achieve good performance but are computationally intensive, making them expensive to deploy. In this paper, we propose a generic and effective Temporal Shift Module (TSM) that enjoys both high efficiency and high performance. Specifically, it can achieve the performance of 3D CNN but maintain 2D CNN's complexity. TSM shifts part of the channels along the temporal dimension, thus facilitating information exchange among neighboring frames. It can be inserted into 2D CNNs to achieve temporal modeling at zero computation and zero parameters. We also extend TSM to the online setting, which enables real-time low-latency online video recognition and video object detection. TSM is accurate and efficient: it ranked first on the Something-Something leaderboard upon publication; on Jetson Nano and Galaxy Note8, it achieves a low latency of 13 ms and 35 ms for online video recognition.
1. The **gpus** column indicates the number of GPUs used to obtain the checkpoint. Note that the configs we provide assume 8 GPUs by default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the total batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs x 2 videos/gpu and lr=0.08 for 16 GPUs x 4 videos/gpu.
2. The **inference_time** is measured with this [benchmark script](/tools/analysis/benchmark.py) (a sample invocation is sketched after this list), using the frame-sampling strategy of the test setting and reporting only the model inference time,
excluding the IO time and pre-processing time. For each setting, we use 1 GPU with a batch size (videos per GPU) of 1 to calculate the inference time.
3. The values in the columns named after "reference" are the results obtained by training on the original repo, using the same model settings. The checkpoints for the reference repo can be downloaded [here](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_reference_ckpt.rar).
4. There are two kinds of test settings for the Something-Something dataset: the efficient setting (CenterCrop x 1 clip) and the accurate setting (ThreeCrop x 2 clips), following the [original repo](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd).
The efficient setting is the default in the provided config files; it can be switched to the accurate setting as follows:
```python
...
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=16,  # `num_clips = 8` when using 8 segments
        twice_sample=True,  # set `twice_sample=True` for twice sample in accurate setting
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    # dict(type='CenterCrop', crop_size=224),  # used for efficient setting
    dict(type='ThreeCrop', crop_size=256),  # used for accurate setting
    ...  # the remaining pipeline steps are unchanged
]
```
5. When applying Mixup and CutMix, we use the hyper-parameter `alpha=0.2` (a config sketch is given after this list).
6. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
7. The **infer_ckpt** mark means those checkpoints are ported from [TSM](https://github.com/mit-han-lab/temporal-shift-module/blob/master/test_models.py).
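For note 2, a sample invocation of the benchmark script, assuming it takes the test config path as its only required argument; the config name is illustrative:

```shell
python tools/analysis/benchmark.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
```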
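For note 5, a minimal sketch of how Mixup is typically enabled in this codebase through a batch-blending entry in the model's `train_cfg`; `num_classes=400` assumes Kinetics-400, and the exact option names should be checked against the provided mixup/cutmix configs:

```python
model = dict(
    # ... backbone / cls_head settings unchanged from the base TSM config ...
    # Blend training samples within each batch; swap in `CutmixBlending` for CutMix.
    train_cfg=dict(
        blending=dict(type='MixupBlending', num_classes=400, alpha=0.2)))
```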
For more details on data preparation, you can refer to the corresponding parts in [Data Preparation](/docs/en/data_preparation.md).
## Train
You can use the following command to train a model.
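A sketch of the usual `tools/train.py` entry point:

```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

For example, to train TSM on Kinetics-400 with periodic validation and deterministic behavior (the config path is assumed from this repo's naming convention):

```shell
python tools/train.py configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py \
    --validate --seed 0 --deterministic
```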
Training logs:

| config | training json log | training log |
| :-- | :-: | :-: |
| tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.json) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb/tsm_r50_gpu_normalize_1x1x8_50e_kinetics400_rgb_20210219.log) |
| tsm_r50_video_1x1x8_100e_kinetics400_rgb | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log) |
| tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log.json) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb/20200724_120023.log) |
| tsm_r50_randaugment_1x1x8_50e_sthv1_rgb | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.json) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_randaugment_1x1x8_50e_sthv1_rgb.log) |
| tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb | [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.json) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb/tsm_r50_flip_randaugment_1x1x8_50e_sthv1_rgb.log) |