model = dict(
type='SkeletonGCN',
backbone=dict(
type='STGCN',
in_channels=3,
edge_importance_weighting=True,
graph_cfg=dict(layout='coco', strategy='spatial')),
cls_head=dict(
type='STGCNHead',
num_classes=60,
in_channels=256,
loss_cls=dict(type='CrossEntropyLoss')),
train_cfg=None,
test_cfg=None)
dataset_type = 'PoseDataset'
ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl'
ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl'
train_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
val_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
test_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix='',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[10, 50])
total_epochs = 80
checkpoint_config = dict(interval=5)
evaluation = dict(interval=5, metrics=['top_k_accuracy'])
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/stgcn_80e_ntu60_xsub_keypoint/'
load_from = None
resume_from = None
workflow = [('train', 1)]
model = dict(
type='SkeletonGCN',
backbone=dict(
type='STGCN',
in_channels=3,
edge_importance_weighting=True,
graph_cfg=dict(layout='ntu-rgb+d', strategy='spatial')),
cls_head=dict(
type='STGCNHead',
num_classes=60,
in_channels=256,
loss_cls=dict(type='CrossEntropyLoss')),
train_cfg=None,
test_cfg=None)
dataset_type = 'PoseDataset'
ann_file_train = 'data/ntu/nturgb+d_skeletons_60_3d_nmtvc/xsub/train.pkl'
ann_file_val = 'data/ntu/nturgb+d_skeletons_60_3d_nmtvc/xsub/val.pkl'
train_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
val_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
test_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix='',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[10, 50])
total_epochs = 80
checkpoint_config = dict(interval=3)
evaluation = dict(interval=3, metrics=['top_k_accuracy'])
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/stgcn_80e_ntu60_xsub_keypoint_3d/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# Demo
## Outline
- [Modify configs through script arguments](#modify-configs-through-script-arguments): Tricks to directly modify configs through script arguments.
- [Video demo](#video-demo): A demo script to predict the recognition result using a single video.
- [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the SpatioTemporal Action Detection result using a single video.
- [Video GradCAM Demo](#video-gradcam-demo): A demo script to visualize GradCAM results using a single video.
- [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera.
- [Long Video demo](#long-video-demo): A demo script to predict different labels using a single long video.
- [SpatioTemporal Action Detection Webcam Demo](#spatiotemporal-action-detection-webcam-demo): A demo script to implement real-time spatio-temporal action detection from a web camera.
- [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video.
- [Video Structuralize Demo](#video-structuralize-demo): A demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection results using a single video.
- [Audio Demo](#audio-demo): A demo script to predict the recognition result using a single audio file.
## Modify configs through script arguments
When running demos using our provided scripts, you may specify `--cfg-options` to modify the config in place.
- Update config keys of dict.
The config options can be specified following the order of the dict keys in the original config.
For example, `--cfg-options model.backbone.norm_eval=False` changes all the BN modules in the model backbone to `train` mode.
- Update keys inside a list of configs.
Some config dicts are composed as a list in your config. For example, the training pipeline `data.train.pipeline` is normally a list
e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline,
you may specify `--cfg-options data.train.pipeline.0.type=DenseSampleFrames`.
- Update values of list/tuples.
If the value to be updated is a list or a tuple, e.g. the config file normally sets `workflow=[('train', 1)]`, and you want to
change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation mark " is necessary to
support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value.
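Under the hood, these `--cfg-options` pairs are parsed by `mmcv.DictAction` and merged into the loaded config with `Config.merge_from_dict`, as `demo/demo.py` (included further below) does. A minimal sketch of the same mechanism, assuming the ST-GCN config from this commit lives at `configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py` and an mmcv version whose `merge_from_dict` accepts list indices (which the `data.train.pipeline.0.type` example above relies on):

```python
from mmcv import Config

# Load the ST-GCN config shown earlier in this commit (path is an assumption).
cfg = Config.fromfile('configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py')

# Equivalent to passing on the command line:
#   --cfg-options data.videos_per_gpu=8 data.train.pipeline.0.clip_len=150
cfg.merge_from_dict({
    'data.videos_per_gpu': 8,               # update a plain dict key
    'data.train.pipeline.0.clip_len': 150,  # update a key inside a list of configs
})

print(cfg.data.videos_per_gpu)     # -> 8
print(cfg.data.train.pipeline[0])  # -> {'type': 'PaddingWithLoop', 'clip_len': 150}
```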
## Video demo
We provide a demo script to predict the recognition result using a single video. In order to get prediction results in the range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file. A minimal Python sketch of the underlying API call is given after the argument list below.
```shell
python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--fps {FPS}] [--font-scale {FONT_SCALE}] [--font-color {FONT_COLOR}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
Optional arguments:
- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30.
- `FONT_SCALE`: Font scale of the label added in the video. If not specified, it will be 0.5.
- `FONT_COLOR`: Font color of the label added in the video. If not specified, it will be `white`.
- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio.
- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bicubic`.
- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated.
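The demo is a thin wrapper around the Python API (see `demo/demo.py` further below). A minimal programmatic sketch with placeholder paths, which also sets `average_clips='prob'` so the printed scores fall in `[0, 1]`:

```python
import torch
from mmcv import Config

from mmaction.apis import inference_recognizer, init_recognizer

# Placeholder paths: any recognition config/checkpoint/label map works the same way.
config_file = 'configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'
checkpoint_file = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'

cfg = Config.fromfile(config_file)
cfg.model['test_cfg'] = dict(average_clips='prob')  # scores in [0, 1]

model = init_recognizer(cfg, checkpoint_file, device=torch.device('cuda:0'))
results = inference_recognizer(model, 'demo/demo.mp4')  # list of (label index, score)

labels = [x.strip() for x in open('tools/data/kinetics/label_map_k400.txt')]
for idx, score in results:
    print(f'{labels[idx]}: {score:.4f}')
```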
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize a video file as input by using a TSN model on cuda by default.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
2. Recognize a video file as input by using a TSN model on cuda by default, loading checkpoint from url.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
3. Recognize a list of rawframes as input by using a TSN model on cpu.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --device cpu
```
4. Recognize a video file as input by using a TSN model and then generate an mp4 file.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --out-filename demo/demo_out.mp4
```
5. Recognize a list of rawframes as input by using a TSN model and then generate a gif file.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --out-filename demo/demo_out.gif
```
6. Recognize a video file as input by using a TSN model, then generate an mp4 file with a given resolution and resize algorithm.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 340 256 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
# If either dimension is set to -1, the frames are resized by keeping the existing aspect ratio
# For --target-resolution 170 -1, original resolution (340, 256) -> target resolution (170, 128)
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 170 -1 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
7. Recognize a video file as input by using a TSN model, then generate an mp4 file with the label in red color and font scale 1.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --font-scale 1 --font-color red \
--out-filename demo/demo_out.mp4
```
8. Recognize a list of rawframes as input by using a TSN model and then generate a gif file with 24 fps.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --fps 24 --out-filename demo/demo_out.gif
```
## SpatioTemporal Action Detection Video Demo
We provide a demo script to predict the SpatioTemporal Action Detection result using a single video.
```shell
python demo/demo_spatiotemporal_det.py --video ${VIDEO_FILE} \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}]
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.5.
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/stdet_demo.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 4.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 6.
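The interplay of `--predict-stepsize`, `--output-stepsize` and `--output-fps` can be sanity-checked with a few lines of arithmetic (illustrative numbers only, not code from the demo script):

```python
# Sanity-check the stepsize arguments described above.
predict_stepsize = 8   # make a prediction every 8 frames
output_stepsize = 4    # keep 1 of every 4 input frames in the output video
output_fps = 6         # playback speed of the output video

# The demo requires the prediction stride to be a multiple of the output stride.
assert predict_stepsize % output_stepsize == 0

# For a hypothetical 30 fps, 10 s input clip:
total_frames = 30 * 10
num_predictions = total_frames // predict_stepsize    # 37 predictions
num_output_frames = total_frames // output_stepsize   # 75 frames written
output_duration = num_output_frames / output_fps      # 12.5 s of output video
print(num_predictions, num_output_frames, output_duration)
```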
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 8 frames, and output 1 frame for every 4 input frames. The FPS of the output video is 6.
```shell
python demo/demo_spatiotemporal_det.py --video demo/demo.mp4 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 8 \
--output-stepsize 4 \
--output-fps 6
```
## Video GradCAM Demo
We provide a demo script to visualize GradCAM results using a single video.
```shell
python demo/demo_gradcam.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--target-layer-name ${TARGET_LAYER_NAME}] [--fps {FPS}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30.
- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated.
- `TARGET_LAYER_NAME`: Layer name to generate GradCAM localization map.
- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio.
- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bilinear`.
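Internally, `demo/demo_gradcam.py` (included later in this commit) builds the input clip with the model's test pipeline and feeds it to `mmaction.utils.GradCAM`. A condensed sketch of those core calls, with placeholder paths:

```python
import torch
from mmcv import Config
from mmcv.parallel import collate, scatter

from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.utils import GradCAM

# Placeholder paths, taken from example 2 above.
config_file = 'configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py'
checkpoint = 'checkpoints/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth'
video = 'demo/demo.mp4'

cfg = Config.fromfile(config_file)
model = init_recognizer(cfg, checkpoint, device=torch.device('cuda:0'))
device = next(model.parameters()).device

# Build the input clip with the model's own test pipeline
# (same logic as build_inputs() in demo/demo_gradcam.py, video-file branch only).
test_pipeline = Compose(cfg.data.test.pipeline)
data = dict(filename=video, label=-1,
            start_index=cfg.data.test.get('start_index', 0), modality='RGB')
data = test_pipeline(data)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
    data = scatter(data, [device])[0]

gradcam = GradCAM(model, 'backbone/layer4/1/relu')
results = gradcam(data)  # results[0]: frames blended with the localization maps
```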
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Get GradCAM results of an I3D model, using a video file as input and then generate a gif file with 10 fps.
```shell
python demo/demo_gradcam.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth demo/demo.mp4 \
--target-layer-name backbone/layer4/1/relu --fps 10 \
--out-filename demo/demo_gradcam.gif
```
2. Get GradCAM results of a TSM model, using a video file as input and then generate a gif file, loading checkpoint from url.
```shell
python demo/demo_gradcam.py configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth \
demo/demo.mp4 --target-layer-name backbone/layer4/1/relu --out-filename demo/demo_gradcam_tsm.gif
```
## Webcam demo
We provide a demo script to implement real-time action recognition from a web camera. In order to get prediction results in the range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file.
```shell
python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \
[--device ${DEVICE_TYPE}] [--camera-id ${CAMERA_ID}] [--threshold ${THRESHOLD}] \
[--average-size ${AVERAGE_SIZE}] [--drawing-fps ${DRAWING_FPS}] [--inference-fps ${INFERENCE_FPS}]
```
Optional arguments:
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `CAMERA_ID`: ID of the camera device. If not specified, it will be set to 0.
- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0.
- `AVERAGE_SIZE`: Number of latest clips to be averaged for prediction. If not specified, it will be set to 1.
- `DRAWING_FPS`: Upper bound FPS value of the output drawing. If not specified, it will be set to 20.
- `INFERENCE_FPS`: Upper bound FPS value of model inference. If not specified, it will be set to 4.
:::{note}
If your hardware is good enough, increasing the values of `DRAWING_FPS` and `INFERENCE_FPS` will give a better experience.
:::
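A rough sketch of the score-averaging idea behind `--average-size` and `--threshold` (a hypothetical helper for illustration, not the actual `demo/webcam_demo.py` code):

```python
from collections import deque

import numpy as np

AVERAGE_SIZE = 5   # number of latest clips to average (--average-size)
THRESHOLD = 0.2    # minimum averaged score to display a label (--threshold)

score_cache = deque(maxlen=AVERAGE_SIZE)


def update(scores, labels):
    """Average the newest clip scores and return labels above the threshold."""
    score_cache.append(np.asarray(scores))
    mean_scores = np.mean(np.stack(list(score_cache)), axis=0)
    order = mean_scores.argsort()[::-1]
    return [(labels[i], mean_scores[i]) for i in order if mean_scores[i] > THRESHOLD]
```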
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2.
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth tools/data/kinetics/label_map_k400.txt --average-size 5 \
--threshold 0.2 --device cpu
```
2. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
tools/data/kinetics/label_map_k400.txt --average-size 5 --threshold 0.2 --device cpu
```
3. Recognize the action from a web camera as input by using an I3D model on gpu by default, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2.
```shell
python demo/webcam_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth tools/data/kinetics/label_map_k400.txt \
--average-size 5 --threshold 0.2
```
:::{note}
Considering the differences in users' hardware, some modifications may be needed to suit each case.
Users can change:
1). The `SampleFrames` step (especially `clip_len` and `num_clips`) of `test_pipeline` in the config file, like `--cfg-options data.test.pipeline.0.num_clips=3`.
2). The crop method in `test_pipeline` of the config file, e.g. `TenCrop`, `ThreeCrop`, `CenterCrop`, like `--cfg-options data.test.pipeline.4.type=CenterCrop`.
3). The value of `--average-size`. The smaller, the faster.
:::
## Long video demo
We provide a demo script to predict different labels using a single long video. In order to get prediction results in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
```shell
python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \
${OUT_FILE} [--input-step ${INPUT_STEP}] [--device ${DEVICE_TYPE}] [--threshold ${THRESHOLD}]
```
Optional arguments:
- `OUT_FILE`: Path to the output file, which can be a video file or a json file.
- `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0.01.
- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames (`sample_length` indicates the size of temporal window from which you sample frames, which equals to `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. The desired value of `STRIDE` is (0, 1\], while it also works for `STRIDE > 1` (the generated predictions will be too sparse). Default: 0.
- `LABEL_COLOR`: Font Color of the labels in (B, G, R). Default is white, that is (256, 256, 256).
- `MSG_COLOR`: Font Color of the messages in (B, G, R). Default is gray, that is (128, 128, 128).
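For the `STRIDE` option above, the prediction interval works out as plain arithmetic (values are illustrative):

```python
# sample_length is the temporal window each prediction looks at.
clip_len, frame_interval = 32, 2
sample_length = clip_len * frame_interval   # 64 frames

stride = 0.5
# STRIDE = 0 means a prediction for every frame.
prediction_interval = int(stride * sample_length) if stride > 0 else 1
print(prediction_interval)  # -> 32, i.e. a prediction every 32 frames
```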
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Predict different labels in a long video by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \
--input-step 3 --device cpu --threshold 0.2
```
2. Predict different labels in a long video by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
3. Predict different labels in a long video from the web by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4 \
tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
4. Predict different labels in a long video by using an I3D model on gpu, with the default input_step=1 and threshold=0.01, and print the labels in cyan.
```shell
python demo/long_video_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \
--label-color 255 255 0
```
5. Predict different labels in a long video by using an I3D model on gpu and save the results as a `json` file.
```shell
python demo/long_video_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt ./results.json
```
## SpatioTemporal Action Detection Webcam Demo
We provide a demo script to implement real-time spatio-temporal action detection from a web camera.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--input-video ${INPUT_VIDEO}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--output-fps ${OUTPUT_FPS}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--show] \
[--display-height ${DISPLAY_HEIGHT}] \
[--display-width ${DISPLAY_WIDTH}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--clip-vis-length ${CLIP_VIS_LENGTH}]
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint path or URL.
- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.4.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `INPUT_VIDEO`: The webcam id or video path of the source. Default: `0`.
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 15.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: None.
- `--show`: Whether to show predictions with `cv2.imshow`.
- `DISPLAY_HEIGHT`: The height of the display frame. Default: 0.
- `DISPLAY_WIDTH`: The width of the display frame. Default: 0. If `DISPLAY_HEIGHT <= 0 and DISPLAY_WIDTH <= 0`, the display frame and input video share the same shape.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `CLIP_VIS_LENGTH`: The number of frames drawn for each clip. In other words, for each clip, at most `CLIP_VIS_LENGTH` frames around the keyframe will be drawn. Default: 8.
Tips to get a better experience with the webcam demo:
- How to choose `--output-fps`?
- `--output-fps` should be almost equal to read thread fps.
- Read thread fps is printed by logger in format `DEBUG:__main__:Read Thread: {duration} ms, {fps} fps`
- How to choose `--predict-stepsize`?
- It's related to the choice of the human detector and the spatio-temporal model.
- Overall, the duration of the read thread for each task should be greater than or equal to that of model inference.
- The durations for read/inference are both printed by logger.
- Larger `--predict-stepsize` leads to larger duration for read thread.
- In order to take full advantage of the computation resources, decrease the value of `--predict-stepsize`.
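Putting the tips above together, a back-of-the-envelope way to pick `--predict-stepsize` (the numbers are illustrative; read them off your own logger output):

```python
import math

# Illustrative numbers: both durations are printed by the demo's logger.
read_fps = 20          # frames per second gathered by the read thread
inference_time = 0.35  # seconds per model inference

# The read thread needs predict_stepsize / read_fps seconds per task, and this
# should be >= inference_time, so the smallest workable stepsize is:
predict_stepsize = math.ceil(read_fps * inference_time)
print(predict_stepsize)  # -> 7, so e.g. --predict-stepsize 8
```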
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 40 frames, and the FPS of the output is 20. Show predictions with `cv2.imshow`.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
--input-video 0 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 40 \
--output-fps 20 \
--show
```
## Skeleton-based Action Recognition Demo
We provide a demo script to predict the skeleton-based action recognition result using a single video.
```shell
python demo/demo_skeleton.py ${VIDEO_FILE} ${OUT_FILENAME} \
[--config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--short-side ${SHORT_SIDE}]
```
Optional arguments:
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint).
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `SHORT_SIDE`: The short side used for frame extraction. Default: 480.
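Internally, the demo converts the detector and pose-estimator outputs into a `PoseDataset`-style annotation before calling the recognizer. A rough sketch of that intermediate structure, with illustrative shapes; the field names follow the pose annotation format used by `PoseDataset` (e.g. the `.pkl` files referenced in the configs above), so treat it as an approximation and check `demo/demo_skeleton.py` for the exact dict it builds:

```python
import numpy as np

# Illustrative sizes: M persons, T frames, V COCO keypoints per person.
M, T, V = 2, 72, 17

fake_anno = dict(
    frame_dir='',                    # unused when keypoints are given directly
    label=-1,                        # unknown at inference time
    img_shape=(480, 854),            # (height, width) after rescaling the short side
    original_shape=(480, 854),
    start_index=0,
    modality='Pose',
    total_frames=T,
    keypoint=np.zeros((M, T, V, 2), dtype=np.float16),    # (x, y) per joint
    keypoint_score=np.ones((M, T, V), dtype=np.float16),  # pose-estimator confidence
)
# The recognizer is then called on this dict, e.g.:
#   results = inference_recognizer(model, fake_anno)
```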
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D-NTURGB+D-120-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint/slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
2. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, STGCN-NTURGB+D-60-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint/stgcn_80e_ntu60_xsub_keypoint-e7bb9653.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
## Video Structuralize Demo
We provide a demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection results using a single video.
```shell
python demo/demo_video_structuralize.py \
[--rgb-stdet-config ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--rgb-stdet-checkpoint ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--skeleton-stdet-checkpoint ${SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--skeleton-config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--skeleton-checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--rgb-config ${RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--rgb-checkpoint ${RGB_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--use-skeleton-stdet ${USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD}] \
[--use-skeleton-recog ${USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRE}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRE}] \
[--video ${VIDEO_FILE}] \
[--label-map-stdet ${LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}] \
[--cfg-options]
```
Optional arguments:
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The rgb-based spatio-temporal action detection config file path.
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The rgb-based spatio-temporal action detection checkpoint path or URL.
- `SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The skeleton-based spatio-temporal action detection checkpoint path or URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint).
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL.
- `RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The rgb-based action recognition config file path.
- `RGB_BASED_ACTION_RECOGNITION_CHECKPOINT`: The rgb-based action recognition checkpoint path or URL.
- `USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD`: Whether to use the skeleton-based spatio-temporal action detection method.
- `USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD`: Whether to use the skeleton-based action recognition method.
- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Default: 0.9.
- `ACTION_DETECTION_SCORE_THRE`: The score threshold for action detection. Default: 0.4.
- `LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION`: The label map used for spatio-temporal action detection. Default: `tools/data/ava/label_map.txt`.
- `LABEL_MAP`: The label map for action recognition. Default: `tools/data/kinetics/label_map_k400.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/test_stdet_recognition_output.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 1.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 24.
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, and PoseC3D as both the skeleton-based action recognizer and the skeleton-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-stdet \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
2. Use the Faster RCNN as the human detector, TSN-R50-1x1x3 as the rgb-based action recognizer, and SlowOnly-8x8-R101 as the rgb-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
3. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer, and SlowOnly-8x8-R101 as the rgb-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
4. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, TSN-R50-1x1x3 as the rgb-based action recognizer, and PoseC3D as the skeleton-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--use-skeleton-stdet \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
## Audio Demo
We provide a demo script to predict the audio-based action recognition result using a single audio feature file.
The script `extract_audio.py` can be used to extract audio from videos, and the script `build_audio_features.py` can be used to extract the audio features.
```shell
python demo/demo_audio.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${AUDIO_FILE} ${LABEL_FILE} [--device ${DEVICE}]
```
Optional arguments:
- `DEVICE`: Type of device to run the demo. Allowed values are cuda devices like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
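Programmatically, the audio demo boils down to calling `inference_recognizer` on a `.npy` feature file, as `demo/demo_audio.py` below shows. A minimal sketch with placeholder paths:

```python
import torch
from mmcv import Config

from mmaction.apis import inference_recognizer, init_recognizer

cfg = Config.fromfile(
    'configs/recognition_audio/resnet/tsn_r18_64x1x1_100e_kinetics400_audio_feature.py')
model = init_recognizer(
    cfg,
    'checkpoints/tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth',
    device=torch.device('cuda:0'))

# Only pre-extracted .npy features are supported, as in demo/demo_audio.py.
results = inference_recognizer(model, 'audio_feature.npy')

labels = [x.strip() for x in open('tools/data/kinetics/label_map_k400.txt')]
for idx, score in results:
    print(f'{labels[idx]}: {score}')
```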
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize an audio feature file as input by using a TSN model on cuda by default.
```shell
python demo/demo_audio.py \
configs/recognition_audio/resnet/tsn_r18_64x1x1_100e_kinetics400_audio_feature.py \
https://download.openmmlab.com/mmaction/recognition/audio_recognition/tsn_r18_64x1x1_100e_kinetics400_audio_feature/tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth \
audio_feature.npy label_map_k400.txt
```
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"from mmaction.apis import init_recognizer, inference_recognizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"config_file = '../configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'\n",
"# download the checkpoint from model zoo and put it in `checkpoints/`\n",
"checkpoint_file = '../checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"# build the model from a config file and a checkpoint file\n",
"model = init_recognizer(config_file, checkpoint_file, device='cpu')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"# test a single video and show the result:\n",
"video = 'demo.mp4'\n",
"label = '../../tools/data/kinetics/label_map_k400.txt'\n",
"results = inference_recognizer(model, video)\n",
"\n",
"labels = open(label).readlines()\n",
"labels = [x.strip() for x in labels]\n",
"results = [(labels[k[0]], k[1]) for k in results]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"arm wrestling: 29.61644\n",
"rock scissors paper: 10.754839\n",
"shaking hands: 9.9084\n",
"clapping: 9.189912\n",
"massaging feet: 8.305307\n"
]
}
],
"source": [
"# show the results\n",
"for result in results:\n",
" print(f'{result[0]}: ', result[1])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import cv2
import numpy as np
import torch
import webcolors
from mmcv import Config, DictAction
from mmaction.apis import inference_recognizer, init_recognizer
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video', help='video file/url or rawframes directory')
parser.add_argument('label', help='label file')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--use-frames',
default=False,
action='store_true',
help='whether to use rawframes as input')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--fps',
default=30,
type=int,
help='specify fps value of the output video when using rawframes to '
'generate file')
parser.add_argument(
'--font-scale',
default=0.5,
type=float,
help='font scale of the label in output video')
parser.add_argument(
'--font-color',
default='white',
help='font color of the label in output video')
parser.add_argument(
'--target-resolution',
nargs=2,
default=None,
type=int,
help='Target resolution (w, h) for resizing the frames when using a '
'video as input. If either dimension is set to -1, the frames are '
'resized by keeping the existing aspect ratio')
parser.add_argument(
'--resize-algorithm',
default='bicubic',
help='resize algorithm applied to generate video')
parser.add_argument('--out-filename', default=None, help='output filename')
args = parser.parse_args()
return args
def get_output(video_path,
out_filename,
label,
fps=30,
font_scale=0.5,
font_color='white',
target_resolution=None,
resize_algorithm='bicubic',
use_frames=False):
"""Get demo output using ``moviepy``.
This function will generate video file or gif file from raw video or
frames, by using ``moviepy``. For more information of some parameters,
you can refer to: https://github.com/Zulko/moviepy.
Args:
video_path (str): The video file path or the rawframes directory path.
If ``use_frames`` is set to True, it should be rawframes directory
path. Otherwise, it should be video file path.
out_filename (str): Output filename for the generated file.
label (str): Predicted label of the generated file.
fps (int): Number of picture frames to read per second. Default: 30.
font_scale (float): Font scale of the label. Default: 0.5.
font_color (str): Font color of the label. Default: 'white'.
target_resolution (None | tuple[int | None]): Set to
(desired_width desired_height) to have resized frames. If either
dimension is None, the frames are resized by keeping the existing
aspect ratio. Default: None.
resize_algorithm (str): Support "bicubic", "bilinear", "neighbor",
"lanczos", etc. Default: 'bicubic'. For more information,
see https://ffmpeg.org/ffmpeg-scaler.html
use_frames (bool): Whether to use rawframes as input. Default: False.
"""
if video_path.startswith(('http://', 'https://')):
raise NotImplementedError
try:
# In case of a segmentation fault when importing decord at the top of the demo
import decord
from moviepy.editor import ImageSequenceClip
except ImportError:
raise ImportError('Please install moviepy to enable output file.')
# Channel Order is BGR
if use_frames:
frame_list = sorted(
[osp.join(video_path, x) for x in os.listdir(video_path)])
frames = [cv2.imread(x) for x in frame_list]
else:
video = decord.VideoReader(video_path)
frames = [x.asnumpy()[..., ::-1] for x in video]
if target_resolution:
w, h = target_resolution
frame_h, frame_w, _ = frames[0].shape
if w == -1:
w = int(h / frame_h * frame_w)
if h == -1:
h = int(w / frame_w * frame_h)
frames = [cv2.resize(f, (w, h)) for f in frames]
textsize = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, font_scale,
1)[0]
textheight = textsize[1]
padding = 10
location = (padding, padding + textheight)
if isinstance(font_color, str):
font_color = webcolors.name_to_rgb(font_color)[::-1]
frames = [np.array(frame) for frame in frames]
for frame in frames:
cv2.putText(frame, label, location, cv2.FONT_HERSHEY_DUPLEX,
font_scale, font_color, 1)
# RGB order
frames = [x[..., ::-1] for x in frames]
video_clips = ImageSequenceClip(frames, fps=fps)
out_type = osp.splitext(out_filename)[1][1:]
if out_type == 'gif':
video_clips.write_gif(out_filename)
else:
video_clips.write_videofile(out_filename, remove_temp=True)
def main():
args = parse_args()
# assign the desired device.
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
# build the recognizer from a config file and checkpoint file/url
model = init_recognizer(cfg, args.checkpoint, device=device)
# e.g. use ('backbone', ) to return backbone feature
output_layer_names = None
# test a single video or rawframes of a single video
if output_layer_names:
results, returned_feature = inference_recognizer(
model, args.video, outputs=output_layer_names)
else:
results = inference_recognizer(model, args.video)
labels = open(args.label).readlines()
labels = [x.strip() for x in labels]
results = [(labels[k[0]], k[1]) for k in results]
print('The top-5 labels with corresponding scores are:')
for result in results:
print(f'{result[0]}: ', result[1])
if args.out_filename is not None:
if args.target_resolution is not None:
if args.target_resolution[0] == -1:
assert isinstance(args.target_resolution[1], int)
assert args.target_resolution[1] > 0
if args.target_resolution[1] == -1:
assert isinstance(args.target_resolution[0], int)
assert args.target_resolution[0] > 0
args.target_resolution = tuple(args.target_resolution)
get_output(
args.video,
args.out_filename,
results[0][0],
fps=args.fps,
font_scale=args.font_scale,
font_color=args.font_color,
target_resolution=args.target_resolution,
resize_algorithm=args.resize_algorithm,
use_frames=args.use_frames)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from mmcv import Config, DictAction
from mmaction.apis import inference_recognizer, init_recognizer
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('audio', help='audio file')
parser.add_argument('label', help='label file')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
args = parser.parse_args()
return args
def main():
args = parse_args()
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
model = init_recognizer(cfg, args.checkpoint, device=device)
if not args.audio.endswith('.npy'):
raise NotImplementedError('Demo works on extracted audio features')
results = inference_recognizer(model, args.audio)
labels = open(args.label).readlines()
labels = [x.strip() for x in labels]
results = [(labels[k[0]], k[1]) for k in results]
print('Scores:')
for result in results:
print(f'{result[0]}: ', result[1])
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.parallel import collate, scatter
from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.utils import GradCAM
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 GradCAM demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video', help='video file/url or rawframes directory')
parser.add_argument(
'--use-frames',
default=False,
action='store_true',
help='whether to use rawframes as input')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--target-layer-name',
type=str,
default='backbone/layer4/1/relu',
help='GradCAM target layer name')
parser.add_argument('--out-filename', default=None, help='output filename')
parser.add_argument('--fps', default=5, type=int)
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--target-resolution',
nargs=2,
default=None,
type=int,
help='Target resolution (w, h) for resizing the frames when using a '
'video as input. If either dimension is set to -1, the frames are '
'resized by keeping the existing aspect ratio')
parser.add_argument(
'--resize-algorithm',
default='bilinear',
help='resize algorithm applied to generate video & gif')
args = parser.parse_args()
return args
def build_inputs(model, video_path, use_frames=False):
"""build inputs for GradCAM.
Note that, building inputs for GradCAM is exactly the same as building
inputs for Recognizer test stage. Codes from `inference_recognizer`.
Args:
model (nn.Module): Recognizer model.
video_path (str): video file/url or rawframes directory.
use_frames (bool): whether to use rawframes as input.
Returns:
dict: Both GradCAM inputs and Recognizer test stage inputs,
including two keys, ``imgs`` and ``label``.
"""
if not (osp.exists(video_path) or video_path.startswith('http')):
raise RuntimeError(f"'{video_path}' is missing")
if osp.isfile(video_path) and use_frames:
raise RuntimeError(
f"'{video_path}' is a video file, not a rawframe directory")
if osp.isdir(video_path) and not use_frames:
raise RuntimeError(
f"'{video_path}' is a rawframe directory, not a video file")
cfg = model.cfg
device = next(model.parameters()).device # model device
# build the data pipeline
test_pipeline = cfg.data.test.pipeline
test_pipeline = Compose(test_pipeline)
# prepare data
if use_frames:
filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
modality = cfg.data.test.get('modality', 'RGB')
start_index = cfg.data.test.get('start_index', 1)
data = dict(
frame_dir=video_path,
total_frames=len(os.listdir(video_path)),
label=-1,
start_index=start_index,
filename_tmpl=filename_tmpl,
modality=modality)
else:
start_index = cfg.data.test.get('start_index', 0)
data = dict(
filename=video_path,
label=-1,
start_index=start_index,
modality='RGB')
data = test_pipeline(data)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
# scatter to specified GPU
data = scatter(data, [device])[0]
return data
def _resize_frames(frame_list,
scale,
keep_ratio=True,
interpolation='bilinear'):
"""resize frames according to given scale.
Codes are modified from `mmaction2/datasets/pipelines/augmentation.py`,
`Resize` class.
Args:
frame_list (list[np.ndarray]): frames to be resized.
scale (tuple[int]): If keep_ratio is True, it serves as scaling
factor or maximum size: the image will be rescaled as large
as possible within the scale. Otherwise, it serves as (w, h)
of output size.
keep_ratio (bool): If set to True, Images will be resized without
changing the aspect ratio. Otherwise, it will resize images to a
given size. Default: True.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
Returns:
        list[np.ndarray]: The resized frames.
"""
if scale is None or (scale[0] == -1 and scale[1] == -1):
return frame_list
scale = tuple(scale)
max_long_edge = max(scale)
max_short_edge = min(scale)
if max_short_edge == -1:
scale = (np.inf, max_long_edge)
img_h, img_w, _ = frame_list[0].shape
if keep_ratio:
new_w, new_h = mmcv.rescale_size((img_w, img_h), scale)
else:
new_w, new_h = scale
frame_list = [
mmcv.imresize(img, (new_w, new_h), interpolation=interpolation)
for img in frame_list
]
return frame_list
def main():
args = parse_args()
# assign the desired device.
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
# build the recognizer from a config file and checkpoint file/url
model = init_recognizer(cfg, args.checkpoint, device=device)
inputs = build_inputs(model, args.video, use_frames=args.use_frames)
gradcam = GradCAM(model, args.target_layer_name)
results = gradcam(inputs)
if args.out_filename is not None:
try:
from moviepy.editor import ImageSequenceClip
except ImportError:
raise ImportError('Please install moviepy to enable output file.')
# frames_batches shape [B, T, H, W, 3], in RGB order
frames_batches = (results[0] * 255.).numpy().astype(np.uint8)
frames = frames_batches.reshape(-1, *frames_batches.shape[-3:])
frame_list = list(frames)
frame_list = _resize_frames(
frame_list,
args.target_resolution,
interpolation=args.resize_algorithm)
video_clips = ImageSequenceClip(frame_list, fps=args.fps)
out_type = osp.splitext(args.out_filename)[1][1:]
if out_type == 'gif':
video_clips.write_gif(args.out_filename)
else:
video_clips.write_videofile(args.out_filename, remove_temp=True)
if __name__ == '__main__':
main()
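# A small sketch (not part of the original demo) of how `_resize_frames`
# treats the `--target-resolution` argument: passing -1 for one side keeps
# the aspect ratio, as described in the help text above. The frames are
# synthetic arrays; only the resulting shapes are of interest.
def _demo_resize_frames():
    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(2)]
    # Short side capped at 160 pixels, aspect ratio preserved.
    resized = _resize_frames(frames, (160, -1))
    print('original:', frames[0].shape, '-> resized:', resized[0].shape)
# _demo_resize_frames()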
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import shutil
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmaction.apis import inference_recognizer, init_recognizer
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_detector` and '
                      '`init_detector` from `mmdet.apis`. These apis are '
                      'required in this demo!')
try:
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
vis_pose_result)
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_top_down_pose_model`, '
                      '`init_pose_model`, and `vis_pose_result` from '
                      '`mmpose.apis`. These apis are required in this demo!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.75
FONTCOLOR = (255, 255, 255) # BGR, white
THICKNESS = 1
LINETYPE = 1
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('video', help='video file/url')
parser.add_argument('out_filename', help='output filename')
parser.add_argument(
'--config',
default=('configs/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'),
help='skeleton model config file path')
parser.add_argument(
'--checkpoint',
default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth'),
help='skeleton model checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--pose-config',
default='demo/hrnet_w32_coco_256x192.py',
help='human pose estimation config file path (from mmpose)')
parser.add_argument(
'--pose-checkpoint',
default=('https://download.openmmlab.com/mmpose/top_down/hrnet/'
'hrnet_w32_coco_256x192-c78dce93_20200708.pth'),
help='human pose estimation checkpoint file/url')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--label-map',
default='tools/data/skeleton/label_map_ntu120.txt',
help='label map file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--short-side',
type=int,
default=480,
help='specify the short-side length of the image')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path, short_side):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
new_h, new_w = None, None
while flag:
if new_h is None:
h, w, _ = frame.shape
new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))
frame = mmcv.imresize(frame, (new_w, new_h))
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def pose_inference(args, frame_paths, det_results):
    """Run top-down pose estimation on the detected boxes of every frame."""
model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
ret = []
print('Performing Human Pose Estimation for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for f, d in zip(frame_paths, det_results):
# Align input format
d = [dict(bbox=x) for x in list(d)]
pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
ret.append(pose)
prog_bar.update()
return ret
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video,
args.short_side)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# Get clip_len, frame_interval and calculate center index of each clip
config = mmcv.Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
for component in config.data.test.pipeline:
if component['type'] == 'PoseNormalize':
component['mean'] = (w // 2, h // 2, .5)
component['max_value'] = (w, h, 1.)
model = init_recognizer(config, args.checkpoint, args.device)
# Load label_map
label_map = [x.strip() for x in open(args.label_map).readlines()]
# Get Human detection results
det_results = detection_inference(args, frame_paths)
torch.cuda.empty_cache()
pose_results = pose_inference(args, frame_paths, det_results)
torch.cuda.empty_cache()
fake_anno = dict(
frame_dir='',
label=-1,
img_shape=(h, w),
original_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = max([len(x) for x in pose_results])
num_keypoint = 17
keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
dtype=np.float16)
keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
dtype=np.float16)
for i, poses in enumerate(pose_results):
for j, pose in enumerate(poses):
pose = pose['keypoints']
keypoint[j, i] = pose[:, :2]
keypoint_score[j, i] = pose[:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
results = inference_recognizer(model, fake_anno)
action_label = label_map[results[0][0]]
pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
vis_frames = [
vis_pose_result(pose_model, frame_paths[i], pose_results[i])
for i in range(num_frame)
]
for frame in vis_frames:
cv2.putText(frame, action_label, (10, 30), FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24)
vid.write_videofile(args.out_filename, remove_temp=True)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
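# A toy sketch (with fabricated pose results, not real model output) of how
# `main()` above packs per-frame pose estimates into the dense arrays fed to
# the skeleton recognizer: `keypoint` has shape (M, T, V, 2) and
# `keypoint_score` has shape (M, T, V), where M is the largest number of
# people seen in any frame, T the number of frames and V the 17 COCO joints.
def _demo_pack_keypoints(pose_results, num_frame, num_keypoint=17):
    num_person = max(len(x) for x in pose_results)
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for t, poses in enumerate(pose_results):
        for m, pose in enumerate(poses):
            kpts = pose['keypoints']  # (17, 3) array of x, y, score
            keypoint[m, t] = kpts[:, :2]
            keypoint_score[m, t] = kpts[:, 2]
    return keypoint, keypoint_score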
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy as cp
import os
import os.path as osp
import shutil
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmcv.runner import load_checkpoint
from mmaction.models import build_detector
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_detector` and '
                      '`init_detector` from `mmdet.apis`. These apis are '
                      'required in this demo!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]
def visualize(frames, annotations, plate=plate_blue, max_num=5):
"""Visualize frames with predicted annotations.
Args:
frames (list[np.ndarray]): Frames for visualization, note that
len(frames) % len(annotations) should be 0.
annotations (list[list[tuple]]): The predicted results.
        plate (list[tuple]): The color plate used for visualization.
            Default: plate_blue.
max_num (int): Max number of labels to visualize for a person box.
Default: 5.
Returns:
list[np.ndarray]: Visualized frames.
"""
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_ = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
scale_ratio = np.array([w, h, w, h])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_[ind]
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, str(score[k])])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument(
'--config',
default=('configs/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py'),
help='spatio temporal detection config file path')
parser.add_argument(
'--checkpoint',
default=('https://download.openmmlab.com/mmaction/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb'
'_20201217-16378594.pth'),
help='spatio temporal detection checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--action-score-thr',
type=float,
default=0.5,
help='the threshold of human action score')
parser.add_argument('--video', help='video file/url')
parser.add_argument(
'--label-map',
default='tools/data/ava/label_map.txt',
help='label map file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--out-filename',
default='demo/stdet_demo.mp4',
help='output filename')
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a prediction per n frames')
parser.add_argument(
'--output-stepsize',
default=4,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
parser.add_argument(
'--output-fps',
default=6,
type=int,
help='the fps of demo video output')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
while flag:
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def load_label_map(file_path):
"""Load Label Map.
Args:
file_path (str): The file path of label map.
Returns:
dict: The label map (int -> label name).
"""
lines = open(file_path).readlines()
lines = [x.strip().split(': ') for x in lines]
return {int(x[0]): x[1] for x in lines}
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
def pack_result(human_detection, result, img_h, img_w):
"""Short summary.
Args:
human_detection (np.ndarray): Human detection result.
result (type): The predicted label of each human proposal.
img_h (int): The image height.
img_w (int): The image width.
Returns:
tuple: Tuple of human proposal, label name and label score.
"""
human_detection[:, 0::2] /= img_w
human_detection[:, 1::2] /= img_h
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append(
(prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
for x in res]))
return results
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# resize frames to shortside 256
new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h
# Get clip_len, frame_interval and calculate center index of each clip
config = mmcv.Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
val_pipeline = config.data.val.pipeline
sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
# Note that it's 1 based here
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
# Load label_map
label_map = load_label_map(args.label_map)
try:
if config['data']['train']['custom_classes'] is not None:
label_map = {
id + 1: label_map[cls]
for id, cls in enumerate(config['data']['train']
['custom_classes'])
}
except KeyError:
pass
# Get Human detection results
center_frames = [frame_paths[ind - 1] for ind in timestamps]
human_detections = detection_inference(args, center_frames)
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
# Get img_norm_cfg
img_norm_cfg = config['img_norm_cfg']
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
to_bgr = img_norm_cfg.pop('to_bgr')
img_norm_cfg['to_rgb'] = to_bgr
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
# Build STDET model
try:
# In our spatiotemporal detection demo, different actions should have
# the same number of bboxes.
config['model']['test_cfg']['rcnn']['action_thr'] = .0
except KeyError:
pass
config.model.backbone.pretrained = None
model = build_detector(config.model, test_cfg=config.get('test_cfg'))
load_checkpoint(model, args.checkpoint, map_location='cpu')
model.to(args.device)
model.eval()
predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
assert len(timestamps) == len(human_detections)
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp, proposal in zip(timestamps, human_detections):
if proposal.shape[0] == 0:
predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
_ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
# THWC -> CTHW -> 1CTHW
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(args.device)
with torch.no_grad():
result = model(
return_loss=False,
img=[input_tensor],
img_metas=[[dict(img_shape=(new_h, new_w))]],
proposals=[[proposal]])
result = result[0]
prediction = []
# N proposals
for i in range(proposal.shape[0]):
prediction.append([])
# Perform action score thr
for i in range(len(result)):
if i + 1 not in label_map:
continue
for j in range(proposal.shape[0]):
if result[i][j, 4] > args.action_score_thr:
prediction[j].append((label_map[i + 1], result[i][j,
4]))
predictions.append(prediction)
prog_bar.update()
results = []
for human_detection, prediction in zip(human_detections, predictions):
results.append(pack_result(human_detection, prediction, new_h, new_w))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
frames = [
cv2.imread(frame_paths[i - 1])
for i in dense_timestamps(timestamps, dense_n)
]
print('Performing visualization')
vis_frames = visualize(frames, results)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
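# A standalone sketch of the clip-sampling arithmetic used in `main()` above:
# for every prediction timestamp (a 1-based centre frame), a window of
# `clip_len * frame_interval` frames is gathered around it. The default
# numbers below are illustrative, not tied to a specific config.
def _demo_clip_sampling(num_frame=100, clip_len=8, frame_interval=8,
                        predict_stepsize=8):
    window_size = clip_len * frame_interval
    # Centre timestamps that leave a full half-window on both sides.
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)
    clips = []
    for timestamp in timestamps:
        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        clips.append(list(frame_inds - 1))  # back to 0-based frame indices
    return timestamps, clips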
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy as cp
import os
import os.path as osp
import shutil
import warnings
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmcv.runner import load_checkpoint
from mmaction.apis import inference_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.models import build_detector, build_model, build_recognizer
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
warnings.warn('Failed to import `inference_detector` and `init_detector` '
                  'from `mmdet.apis`. These apis are required in '
                  'skeleton-based applications!')
try:
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
vis_pose_result)
except (ImportError, ModuleNotFoundError):
warnings.warn('Failed to import `inference_top_down_pose_model`, '
                  '`init_pose_model`, and `vis_pose_result` from '
                  '`mmpose.apis`. These apis are required in skeleton-based '
                  'applications!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
PLATEBLUE = PLATEBLUE.split('-')
PLATEBLUE = [hex2color(h) for h in PLATEBLUE]
PLATEGREEN = '004b23-006400-007200-008000-38b000-70e000'
PLATEGREEN = PLATEGREEN.split('-')
PLATEGREEN = [hex2color(h) for h in PLATEGREEN]
def visualize(frames,
annotations,
pose_results,
action_result,
pose_model,
plate=PLATEBLUE,
max_num=5):
"""Visualize frames with predicted annotations.
Args:
frames (list[np.ndarray]): Frames for visualization, note that
len(frames) % len(annotations) should be 0.
annotations (list[list[tuple]]): The predicted spatio-temporal
detection results.
        pose_results (list[list[tuple]]): The pose results.
        action_result (str): The predicted action recognition result.
        pose_model (nn.Module): The constructed pose model.
        plate (list[tuple]): The color plate used for visualization.
            Default: PLATEBLUE.
max_num (int): Max number of labels to visualize for a person box.
Default: 5.
Returns:
list[np.ndarray]: Visualized frames.
"""
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_ = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
scale_ratio = np.array([w, h, w, h])
# add pose results
if pose_results:
for i in range(nf):
frames_[i] = vis_pose_result(pose_model, frames_[i],
pose_results[i])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_[ind]
# add action result for whole video
cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
# add spatio-temporal action detection results
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
if not pose_results:
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, str(score[k])])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument(
'--rgb-stdet-config',
default=('configs/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py'),
help='rgb-based spatio temporal detection config file path')
parser.add_argument(
'--rgb-stdet-checkpoint',
default=('https://download.openmmlab.com/mmaction/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb'
'_20201217-16378594.pth'),
help='rgb-based spatio temporal detection checkpoint file/url')
parser.add_argument(
'--skeleton-stdet-checkpoint',
default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'posec3d_ava.pth'),
help='skeleton-based spatio temporal detection checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/'
'faster_rcnn/faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--pose-config',
default='demo/hrnet_w32_coco_256x192.py',
help='human pose estimation config file path (from mmpose)')
parser.add_argument(
'--pose-checkpoint',
default=('https://download.openmmlab.com/mmpose/top_down/hrnet/'
'hrnet_w32_coco_256x192-c78dce93_20200708.pth'),
help='human pose estimation checkpoint file/url')
parser.add_argument(
'--skeleton-config',
default='configs/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py',
help='skeleton-based action recognition config file path')
parser.add_argument(
'--skeleton-checkpoint',
default='https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'posec3d_k400.pth',
help='skeleton-based action recognition checkpoint file/url')
parser.add_argument(
'--rgb-config',
default='configs/recognition/tsn/'
'tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py',
help='rgb-based action recognition config file path')
parser.add_argument(
'--rgb-checkpoint',
default='https://download.openmmlab.com/mmaction/recognition/'
'tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/'
'tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth',
help='rgb-based action recognition checkpoint file/url')
parser.add_argument(
'--use-skeleton-stdet',
action='store_true',
help='use skeleton-based spatio temporal detection method')
parser.add_argument(
'--use-skeleton-recog',
action='store_true',
help='use skeleton-based action recognition method')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--action-score-thr',
type=float,
default=0.4,
help='the threshold of action prediction score')
parser.add_argument(
'--video',
default='demo/test_video_structuralize.mp4',
help='video file/url')
parser.add_argument(
'--label-map-stdet',
default='tools/data/ava/label_map.txt',
help='label map file for spatio-temporal action detection')
parser.add_argument(
'--label-map',
default='tools/data/kinetics/label_map_k400.txt',
help='label map file for action recognition')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--out-filename',
default='demo/test_stdet_recognition_output.mp4',
help='output filename')
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a spatio-temporal detection prediction per n frames')
parser.add_argument(
'--output-stepsize',
default=1,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
parser.add_argument(
'--output-fps',
default=24,
type=int,
help='the fps of demo video output')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
# target_dir = osp.join('./tmp','spatial_skeleton_dir')
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
while flag:
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def pose_inference(args, frame_paths, det_results):
    """Run top-down pose estimation on the detected boxes of every frame."""
model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
ret = []
print('Performing Human Pose Estimation for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for f, d in zip(frame_paths, det_results):
# Align input format
d = [dict(bbox=x) for x in list(d)]
pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
ret.append(pose)
prog_bar.update()
return ret
def load_label_map(file_path):
"""Load Label Map.
Args:
file_path (str): The file path of label map.
Returns:
dict: The label map (int -> label name).
"""
lines = open(file_path).readlines()
lines = [x.strip().split(': ') for x in lines]
return {int(x[0]): x[1] for x in lines}
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
def pack_result(human_detection, result, img_h, img_w):
"""Short summary.
Args:
human_detection (np.ndarray): Human detection result.
result (type): The predicted label of each human proposal.
img_h (int): The image height.
img_w (int): The image width.
Returns:
tuple: Tuple of human proposal, label name and label score.
"""
human_detection[:, 0::2] /= img_w
human_detection[:, 1::2] /= img_h
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append(
(prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
for x in res]))
return results
def expand_bbox(bbox, h, w, ratio=1.25):
    """Expand a box around its centre to a square scaled by `ratio`, clipped
    to the image boundary."""
x1, y1, x2, y2 = bbox
center_x = (x1 + x2) // 2
center_y = (y1 + y2) // 2
width = x2 - x1
height = y2 - y1
square_l = max(width, height)
new_width = new_height = square_l * ratio
new_x1 = max(0, int(center_x - new_width / 2))
new_x2 = min(int(center_x + new_width / 2), w)
new_y1 = max(0, int(center_y - new_height / 2))
new_y2 = min(int(center_y + new_height / 2), h)
return (new_x1, new_y1, new_x2, new_y2)
def cal_iou(box1, box2):
    """Compute the IoU of two boxes in (x1, y1, x2, y2) format."""
xmin1, ymin1, xmax1, ymax1 = box1
xmin2, ymin2, xmax2, ymax2 = box2
s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
s2 = (xmax2 - xmin2) * (ymax2 - ymin2)
xmin = max(xmin1, xmin2)
ymin = max(ymin1, ymin2)
xmax = min(xmax1, xmax2)
ymax = min(ymax1, ymax2)
w = max(0, xmax - xmin)
h = max(0, ymax - ymin)
intersect = w * h
union = s1 + s2 - intersect
iou = intersect / union
return iou
def skeleton_based_action_recognition(args, pose_results, num_frame, h, w):
    """Recognize the action of the whole video with a skeleton-based model."""
fake_anno = dict(
frame_dict='',
label=-1,
img_shape=(h, w),
origin_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = max([len(x) for x in pose_results])
num_keypoint = 17
keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
dtype=np.float16)
keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
dtype=np.float16)
for i, poses in enumerate(pose_results):
for j, pose in enumerate(poses):
pose = pose['keypoints']
keypoint[j, i] = pose[:, :2]
keypoint_score[j, i] = pose[:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
label_map = [x.strip() for x in open(args.label_map).readlines()]
num_class = len(label_map)
skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
skeleton_config.model.cls_head.num_classes = num_class # for K400 dataset
skeleton_pipeline = Compose(skeleton_config.test_pipeline)
skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
skeleton_imgs = skeleton_imgs.to(args.device)
# Build skeleton-based recognition model
skeleton_model = build_model(skeleton_config.model)
load_checkpoint(
skeleton_model, args.skeleton_checkpoint, map_location='cpu')
skeleton_model.to(args.device)
skeleton_model.eval()
with torch.no_grad():
output = skeleton_model(return_loss=False, imgs=skeleton_imgs)
action_idx = np.argmax(output)
skeleton_action_result = label_map[
action_idx] # skeleton-based action result for the whole video
return skeleton_action_result
def rgb_based_action_recognition(args):
    """Recognize the action of the whole video with an RGB-based model."""
rgb_config = mmcv.Config.fromfile(args.rgb_config)
rgb_config.model.backbone.pretrained = None
rgb_model = build_recognizer(
rgb_config.model, test_cfg=rgb_config.get('test_cfg'))
load_checkpoint(rgb_model, args.rgb_checkpoint, map_location='cpu')
rgb_model.cfg = rgb_config
rgb_model.to(args.device)
rgb_model.eval()
action_results = inference_recognizer(
rgb_model, args.video, label_path=args.label_map)
rgb_action_result = action_results[0][0]
label_map = [x.strip() for x in open(args.label_map).readlines()]
return label_map[rgb_action_result]
def skeleton_based_stdet(args, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    """Run skeleton-based spatio-temporal action detection clip by clip."""
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
num_class = max(label_map.keys()) + 1 # for AVA dataset (81)
skeleton_config.model.cls_head.num_classes = num_class
skeleton_pipeline = Compose(skeleton_config.test_pipeline)
skeleton_stdet_model = build_model(skeleton_config.model)
load_checkpoint(
skeleton_stdet_model,
args.skeleton_stdet_checkpoint,
map_location='cpu')
skeleton_stdet_model.to(args.device)
skeleton_stdet_model.eval()
skeleton_predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp in timestamps:
proposal = human_detections[timestamp - 1]
if proposal.shape[0] == 0: # no people detected
skeleton_predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
num_frame = len(frame_inds) # 30
pose_result = [pose_results[ind] for ind in frame_inds]
skeleton_prediction = []
for i in range(proposal.shape[0]): # num_person
skeleton_prediction.append([])
fake_anno = dict(
frame_dict='',
label=-1,
img_shape=(h, w),
origin_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = 1
num_keypoint = 17
keypoint = np.zeros(
(num_person, num_frame, num_keypoint, 2)) # M T V 2
keypoint_score = np.zeros(
(num_person, num_frame, num_keypoint)) # M T V
# pose matching
person_bbox = proposal[i][:4]
area = expand_bbox(person_bbox, h, w)
for j, poses in enumerate(pose_result): # num_frame
max_iou = float('-inf')
index = -1
if len(poses) == 0:
continue
for k, per_pose in enumerate(poses):
iou = cal_iou(per_pose['bbox'][:4], area)
if max_iou < iou:
index = k
max_iou = iou
keypoint[0, j] = poses[index]['keypoints'][:, :2]
keypoint_score[0, j] = poses[index]['keypoints'][:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
skeleton_imgs = skeleton_imgs.to(args.device)
with torch.no_grad():
output = skeleton_stdet_model(
return_loss=False, imgs=skeleton_imgs)
output = output[0]
for k in range(len(output)): # 81
if k not in label_map:
continue
if output[k] > args.action_score_thr:
skeleton_prediction[i].append(
(label_map[k], output[k]))
skeleton_predictions.append(skeleton_prediction)
prog_bar.update()
return timestamps, skeleton_predictions
def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):
    """Run RGB-based spatio-temporal action detection clip by clip."""
rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
rgb_stdet_config.merge_from_dict(args.cfg_options)
val_pipeline = rgb_stdet_config.data.val.pipeline
sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
window_size = clip_len * frame_interval
num_frame = len(frames)
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
# Get img_norm_cfg
img_norm_cfg = rgb_stdet_config['img_norm_cfg']
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
to_bgr = img_norm_cfg.pop('to_bgr')
img_norm_cfg['to_rgb'] = to_bgr
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
# Build STDET model
try:
# In our spatiotemporal detection demo, different actions should have
# the same number of bboxes.
rgb_stdet_config['model']['test_cfg']['rcnn']['action_thr'] = .0
except KeyError:
pass
rgb_stdet_config.model.backbone.pretrained = None
rgb_stdet_model = build_detector(
rgb_stdet_config.model, test_cfg=rgb_stdet_config.get('test_cfg'))
load_checkpoint(
rgb_stdet_model, args.rgb_stdet_checkpoint, map_location='cpu')
rgb_stdet_model.to(args.device)
rgb_stdet_model.eval()
predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp in timestamps:
proposal = human_detections[timestamp - 1]
if proposal.shape[0] == 0:
predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
_ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
# THWC -> CTHW -> 1CTHW
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(args.device)
with torch.no_grad():
result = rgb_stdet_model(
return_loss=False,
img=[input_tensor],
img_metas=[[dict(img_shape=(new_h, new_w))]],
proposals=[[proposal]])
result = result[0]
prediction = []
# N proposals
for i in range(proposal.shape[0]):
prediction.append([])
# Perform action score thr
for i in range(len(result)): # 80
if i + 1 not in label_map:
continue
for j in range(proposal.shape[0]):
if result[i][j, 4] > args.action_score_thr:
prediction[j].append((label_map[i + 1], result[i][j,
4]))
predictions.append(prediction)
prog_bar.update()
return timestamps, predictions
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# Get Human detection results and pose results
human_detections = detection_inference(args, frame_paths)
pose_results = None
if args.use_skeleton_recog or args.use_skeleton_stdet:
pose_results = pose_inference(args, frame_paths, human_detections)
# resize frames to shortside 256
new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h
# Load spatio-temporal detection label_map
stdet_label_map = load_label_map(args.label_map_stdet)
rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
rgb_stdet_config.merge_from_dict(args.cfg_options)
try:
if rgb_stdet_config['data']['train']['custom_classes'] is not None:
stdet_label_map = {
id + 1: stdet_label_map[cls]
for id, cls in enumerate(rgb_stdet_config['data']['train']
['custom_classes'])
}
except KeyError:
pass
action_result = None
if args.use_skeleton_recog:
print('Use skeleton-based recognition')
action_result = skeleton_based_action_recognition(
args, pose_results, num_frame, h, w)
else:
print('Use rgb-based recognition')
action_result = rgb_based_action_recognition(args)
stdet_preds = None
if args.use_skeleton_stdet:
print('Use skeleton-based SpatioTemporal Action Detection')
clip_len, frame_interval = 30, 1
timestamps, stdet_preds = skeleton_based_stdet(args, stdet_label_map,
human_detections,
pose_results, num_frame,
clip_len,
frame_interval, h, w)
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
else:
print('Use rgb-based SpatioTemporal Action Detection')
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
timestamps, stdet_preds = rgb_based_stdet(args, frames,
stdet_label_map,
human_detections, w, h,
new_w, new_h, w_ratio,
h_ratio)
stdet_results = []
for timestamp, prediction in zip(timestamps, stdet_preds):
human_detection = human_detections[timestamp - 1]
stdet_results.append(
pack_result(human_detection, prediction, new_h, new_w))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
output_timestamps = dense_timestamps(timestamps, dense_n)
frames = [
cv2.imread(frame_paths[timestamp - 1])
for timestamp in output_timestamps
]
print('Performing visualization')
pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
if args.use_skeleton_recog or args.use_skeleton_stdet:
pose_results = [
pose_results[timestamp - 1] for timestamp in output_timestamps
]
vis_frames = visualize(frames, stdet_results, pose_results, action_result,
pose_model)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
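# A small sketch (synthetic boxes, no real detections) of the pose-matching
# step inside `skeleton_based_stdet` above: each person proposal is expanded
# to a square region, and for every frame the pose whose box has the highest
# IoU with that region is assigned to the person.
def _demo_pose_matching():
    person_bbox = (40, 60, 120, 200)       # x1, y1, x2, y2 of one proposal
    area = expand_bbox(person_bbox, h=480, w=640)
    candidate_poses = [
        dict(bbox=(35, 55, 125, 205)),     # likely the same person
        dict(bbox=(400, 100, 480, 300)),   # someone else in the frame
    ]
    best_index, best_iou = -1, float('-inf')
    for k, pose in enumerate(candidate_poses):
        iou = cal_iou(pose['bbox'][:4], area)
        if iou > best_iou:
            best_index, best_iou = k, iou
    return best_index, best_iou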
# Copyright (c) OpenMMLab. All rights reserved.
# model config
model = dict(
type='FasterRCNN',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)),
test_cfg=dict(
rpn=dict(
nms_pre=1000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100)))
# dataset config
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')
# Schedule
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[16, 22])
total_epochs = 24
# runtime
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
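# Usage note (kept as comments so that `Config.fromfile` does not pick up any
# extra keys): this detection config is consumed by the demos above through
# `init_detector`. A minimal sketch, assuming a locally downloaded checkpoint
# file and an arbitrary test image (both paths are placeholders):
#
#     from mmdet.apis import inference_detector, init_detector
#     det_model = init_detector('demo/faster_rcnn_r50_fpn_2x_coco.py',
#                               'faster_rcnn_r50_fpn_2x_coco.pth',
#                               device='cuda:0')
#     det_result = inference_detector(det_model, 'test_frame.jpg')
#     person_bboxes = det_result[0]  # class 0 of COCO is 'person'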
jf7RDuUTrsQ 300 325
JTlatknwOrY 301 233
8UxlDNur-Z0 300 262
y9r115bgfNk 300 320
ZnIDviwA8CE 300 244
c8ln_nWYMyM 300 333
9GFfKVeoGm0 300 98
F5Y_gGsg4x8 300 193
AuqIu3x_lhY 300 36
1Hi5GMotrjs 300 26
czhL0iDbNT8 300 46
DYpTE_n-Wvk 177 208
R-xmgefs-M4 300 101
KPP2qRzMdos 300 131
PmgfU9ocx5A 300 193
GI7nIyMEQi4 300 173
A8TIWMvJVDU 300 72
ustVqWMM56c 300 289
03dk7mneDU0 300 254
jqkyelS4GJk 300 279
a58tBGuDIg0 231 382
5l1ajLjqaPo 300 226
-5wLopwbGX0 300 132
NUG7kwJ-614 300 103
wHUvw_R2iv8 300 97
44Mak5_s6Fk 300 256
y5vsk8Mj-3w 300 77
TEj_A_BC-aU 300 393
fUdu6hpMt_c 299 40
C5Z1sRArUR0 300 254
-orecnYvpNw 300 284
Urmbp1ulIXI 300 319
bLgdi4w7OAk 299 36
cVv_XMw4W2U 300 27
dV8JmKwDUzM 300 312
yZ9hIqW4bRc 300 239
9ykbMdR9Jss 213 257
G8fEnqIOkiA 300 158
6P2eVJ-Qp1g 300 131
Y-acp_jXG1Q 302 315
xthWPdx21r8 301 62
LExCUx4STW0 300 9
p2UMwzWsY0U 300 248
c0UI7f3Plro 300 383
1MmjE51PeIE 300 93
OU5dJpNHATk 300 342
38Uv6dbQkWc 281 44
5ZNdkbmv274 300 59
DrSL3Uddj6s 300 283
aNJ1-bvRox8 175 384
b5U7A_crvE0 194 377
xeWO9Bl9aWA 300 86
Zy8Ta83mrXo 300 223
AXnDRH7o2DQ 300 146
fTPDXmcygjw 300 11
EhRxb8-cNzQ 164 325
iO8RYYQzNiE 299 191
XbCncZcXuTI 300 55
pSCunaRn45A 300 265
UqI--TBQRgg 300 165
yD42KW6cm-A 300 186
VseX7hoxhbM 300 61
1FEcfy-moBM 300 8
BUT8oefH9Nw 300 120
-49tMSUTnZg 300 227
cZKPTt_FcFs 300 85
fiKJm0eavfw 300 323
gJcVljRRxGE 302 87
de1rSoht9t4 300 253
UAIJnI7fQYo 300 284
c4eIDxmVmCw 300 95
3LGce3efz7M 300 332
EC8iyn_q-NM 300 92
eo15donXwmY 300 351
NsG31u7Pd2Q 300 87
ILkPWpZYlPE 300 137
n5ZHSJRZl1U 300 338
UoQE44FEqLQ 300 260
5I-4meP_5wY 300 185
udLMOf77S3U 300 209
a4Ye18Mnblk 262 172
QbDMgHWwt_s 236 395
S6iAYBBMnwk 300 267
DNMfmNV8Uug 300 131
AJdp07pp43c 300 293
tVuop87KbDY 300 103
o79s5eOAF-c 300 246
dMt_nuBNdeY 300 168
RJU9NV1R4Fw 300 128
Zhux7Vy-hHc 300 82
47Cj6jwQKjo 300 228
a7Mc-0lwAuE 300 129
taZtEzvkg3M 300 264
bVDZohQJhBI 240 129
sBJk5li0O5o 216 154
DQUNZmbQI_g 300 29
-zpKHNrNsn4 300 244
Dcz0r8q-sx0 300 249
hfRKTH9pOMA 165 116
8CdUbOHDtes 300 222
# Copyright (c) OpenMMLab. All rights reserved.
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(interval=10, metric='mAP', key_indicator='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w32-36af842e.pth',
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256))),
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=64,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline),
)
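# Usage note (kept as comments so that the config dict stays clean): the
# skeleton demos above load this file through `init_pose_model` and run
# per-frame top-down inference on the detected person boxes. A rough sketch,
# with placeholder paths and a fabricated detection box:
#
#     from mmpose.apis import inference_top_down_pose_model, init_pose_model
#     pose_model = init_pose_model('demo/hrnet_w32_coco_256x192.py',
#                                  'hrnet_w32_coco_256x192.pth',
#                                  device='cuda:0')
#     person_results = [dict(bbox=(10, 20, 200, 400, 0.98))]
#     pose_results, _ = inference_top_down_pose_model(
#         pose_model, 'frame_000001.jpg', person_results, format='xyxy')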
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import random
from collections import deque
from operator import itemgetter
import cv2
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.parallel import collate, scatter
from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL
FONTSCALE = 1
THICKNESS = 1
LINETYPE = 1
EXCLUED_STEPS = [
'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit',
'PyAVDecode', 'RawFrameDecode'
]
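# Frames are read and decoded with OpenCV below, so pipeline steps that open or
# decode the raw video/frames must be stripped from the test pipeline.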
def parse_args():
parser = argparse.ArgumentParser(
        description='MMAction2 demo: predict labels in a long video')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video_path', help='video file/url')
parser.add_argument('label', help='label file')
    parser.add_argument('out_file', help='output result file (video or json)')
parser.add_argument(
'--input-step',
type=int,
default=1,
help='input step for sampling frames')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--threshold',
type=float,
default=0.01,
help='recognition score threshold')
parser.add_argument(
'--stride',
type=float,
default=0,
        help=('the prediction stride equals stride * sample_length '
              '(sample_length is the size of the temporal window from '
              'which frames are sampled, i.e. clip_len x frame_interval); '
              'if set to 0, the prediction stride is 1'))
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
        help='override some settings in the used config; key-value pairs '
        'in xxx=yyy format will be merged into the config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--label-color',
nargs='+',
type=int,
default=(255, 255, 255),
help='font color (B, G, R) of the labels in output video')
parser.add_argument(
'--msg-color',
nargs='+',
type=int,
default=(128, 128, 128),
help='font color (B, G, R) of the messages in output video')
args = parser.parse_args()
return args
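# Overlay the latest recognition results (label: score pairs above the
# threshold) on the current frame and write the frame to the output video.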
def show_results_video(result_queue,
text_info,
thr,
msg,
frame,
video_writer,
label_color=(255, 255, 255),
msg_color=(128, 128, 128)):
if len(result_queue) != 0:
text_info = {}
results = result_queue.popleft()
for i, result in enumerate(results):
selected_label, score = result
if score < thr:
break
location = (0, 40 + i * 20)
text = selected_label + ': ' + str(round(score, 2))
text_info[location] = text
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
label_color, THICKNESS, LINETYPE)
elif len(text_info):
for location, text in text_info.items():
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
label_color, THICKNESS, LINETYPE)
else:
cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, msg_color,
THICKNESS, LINETYPE)
video_writer.write(frame)
return text_info
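# Store the latest recognition results for the current frame index in the JSON
# output dict instead of drawing them on video frames.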
def get_results_json(result_queue, text_info, thr, msg, ind, out_json):
if len(result_queue) != 0:
text_info = {}
results = result_queue.popleft()
for i, result in enumerate(results):
selected_label, score = result
if score < thr:
break
text_info[i + 1] = selected_label + ': ' + str(round(score, 2))
out_json[ind] = text_info
elif len(text_info):
out_json[ind] = text_info
else:
out_json[ind] = msg
return text_info, out_json
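# Main loop: read the video frame by frame; after an initial window of
# ``sample_length`` frames, sample one frame out of every ``input_step`` frames
# into a sliding window, run inference on the window, and write the predictions
# either onto the output video or into a JSON file.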
def show_results(model, data, label, args):
frame_queue = deque(maxlen=args.sample_length)
result_queue = deque(maxlen=1)
cap = cv2.VideoCapture(args.video_path)
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
msg = 'Preparing action recognition ...'
text_info = {}
out_json = {}
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
frame_size = (frame_width, frame_height)
ind = 0
video_writer = None if args.out_file.endswith('.json') \
else cv2.VideoWriter(args.out_file, fourcc, fps, frame_size)
prog_bar = mmcv.ProgressBar(num_frames)
backup_frames = []
while ind < num_frames:
ind += 1
prog_bar.update()
ret, frame = cap.read()
if frame is None:
            # drop the frame when the capture returns None
continue
backup_frames.append(np.array(frame)[:, :, ::-1])
if ind == args.sample_length:
            # fill the whole window at once so the first result shows up early
frame_queue.extend(backup_frames)
backup_frames = []
elif ((len(backup_frames) == args.input_step
and ind > args.sample_length) or ind == num_frames):
            # pick one frame at random from the backup once it holds
            # ``input_step`` frames or the last frame has been reached
chosen_frame = random.choice(backup_frames)
backup_frames = []
frame_queue.append(chosen_frame)
ret, scores = inference(model, data, args, frame_queue)
if ret:
num_selected_labels = min(len(label), 5)
scores_tuples = tuple(zip(label, scores))
scores_sorted = sorted(
scores_tuples, key=itemgetter(1), reverse=True)
results = scores_sorted[:num_selected_labels]
result_queue.append(results)
if args.out_file.endswith('.json'):
text_info, out_json = get_results_json(result_queue, text_info,
args.threshold, msg, ind,
out_json)
else:
text_info = show_results_video(result_queue, text_info,
args.threshold, msg, frame,
video_writer, args.label_color,
args.msg_color)
cap.release()
cv2.destroyAllWindows()
if args.out_file.endswith('.json'):
with open(args.out_file, 'w') as js:
json.dump(out_json, js)
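# Run the recognizer on the current window: push the buffered frames through
# the decode-free test pipeline, forward the model, and pop
# ``stride * sample_length`` frames so the next window starts at the right
# position.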
def inference(model, data, args, frame_queue):
if len(frame_queue) != args.sample_length:
        # do not run inference until enough frames are buffered
return False, None
cur_windows = list(np.array(frame_queue))
if data['img_shape'] is None:
data['img_shape'] = frame_queue[0].shape[:2]
cur_data = data.copy()
cur_data['imgs'] = cur_windows
cur_data = args.test_pipeline(cur_data)
cur_data = collate([cur_data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
cur_data = scatter(cur_data, [args.device])[0]
with torch.no_grad():
scores = model(return_loss=False, **cur_data)[0]
if args.stride > 0:
pred_stride = int(args.sample_length * args.stride)
for _ in range(pred_stride):
frame_queue.popleft()
    # when ``args.stride == 0`` the deque (maxlen=sample_length) drops the
    # oldest frame automatically on the next append
return True, scores
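# Entry point: build the recognizer, read the label map, and rebuild the test
# pipeline without frame sampling/decoding steps, since frames are supplied by
# the capture loop in ``show_results``.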
def main():
args = parse_args()
args.device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
model = init_recognizer(cfg, args.checkpoint, device=args.device)
data = dict(img_shape=None, modality='RGB', label=-1)
with open(args.label, 'r') as f:
label = [line.strip() for line in f]
    # build the test pipeline from the config, dropping sampling/decoding steps
cfg = model.cfg
sample_length = 0
pipeline = cfg.data.test.pipeline
pipeline_ = pipeline.copy()
for step in pipeline:
if 'SampleFrames' in step['type']:
sample_length = step['clip_len'] * step['num_clips']
data['num_clips'] = step['num_clips']
data['clip_len'] = step['clip_len']
pipeline_.remove(step)
if step['type'] in EXCLUED_STEPS:
            # remove steps that open or decode raw frames
pipeline_.remove(step)
test_pipeline = Compose(pipeline_)
assert sample_length > 0
args.sample_length = sample_length
args.test_pipeline = test_pipeline
show_results(model, data, label, args)
if __name__ == '__main__':
main()
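# Example invocation (all file names below are placeholders; pass a ``.json``
# output file to get JSON instead of a rendered video):
#   python this_demo.py CONFIG.py CHECKPOINT.pth VIDEO.mp4 LABEL_MAP.txt \
#       OUT.mp4 --input-step 3 --threshold 0.2 --stride 0.5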