model = dict(
type='SkeletonGCN',
backbone=dict(
type='STGCN',
in_channels=3,
edge_importance_weighting=True,
graph_cfg=dict(layout='coco', strategy='spatial')),
cls_head=dict(
type='STGCNHead',
num_classes=60,
in_channels=256,
loss_cls=dict(type='CrossEntropyLoss')),
train_cfg=None,
test_cfg=None)
dataset_type = 'PoseDataset'
ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl'
ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl'
train_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
val_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
test_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix='',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[10, 50])
total_epochs = 80
checkpoint_config = dict(interval=5)
evaluation = dict(interval=5, metrics=['top_k_accuracy'])
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/stgcn_80e_ntu60_xsub_keypoint/'
load_from = None
resume_from = None
workflow = [('train', 1)]
model = dict(
type='SkeletonGCN',
backbone=dict(
type='STGCN',
in_channels=3,
edge_importance_weighting=True,
graph_cfg=dict(layout='ntu-rgb+d', strategy='spatial')),
cls_head=dict(
type='STGCNHead',
num_classes=60,
in_channels=256,
loss_cls=dict(type='CrossEntropyLoss')),
train_cfg=None,
test_cfg=None)
dataset_type = 'PoseDataset'
ann_file_train = 'data/ntu/nturgb+d_skeletons_60_3d_nmtvc/xsub/train.pkl'
ann_file_val = 'data/ntu/nturgb+d_skeletons_60_3d_nmtvc/xsub/val.pkl'
train_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
val_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
test_pipeline = [
dict(type='PaddingWithLoop', clip_len=300),
dict(type='PoseDecode'),
dict(type='FormatGCNInput', input_format='NCTVM'),
dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix='',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix='',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[10, 50])
total_epochs = 80
checkpoint_config = dict(interval=3)
evaluation = dict(interval=3, metrics=['top_k_accuracy'])
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/stgcn_80e_ntu60_xsub_keypoint_3d/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# Demo
## Outline
- [Modify configs through script arguments](#modify-configs-through-script-arguments): Tricks to directly modify configs through script arguments.
- [Video demo](#video-demo): A demo script to predict the recognition result using a single video.
- [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the SpatioTemporal Action Detection result using a single video.
- [Video GradCAM Demo](#video-gradcam-demo): A demo script to visualize GradCAM results using a single video.
- [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera.
- [Long Video demo](#long-video-demo): A demo script to predict different labels using a single long video.
- [SpatioTemporal Action Detection Webcam Demo](#spatiotemporal-action-detection-webcam-demo): A demo script to implement real-time spatio-temporal action detection from a web camera.
- [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video.
- [Video Structuralize Demo](#video-structuralize-demo): A demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection results using a single video.
- [Audio Demo](#audio-demo): A demo script to predict the recognition result using a single audio file.
## Modify configs through script arguments
When running demos using our provided scripts, you may specify `--cfg-options` to modify the config in place.
- Update config keys of dict.
The config options can be specified following the order of the dict keys in the original config.
For example, `--cfg-options model.backbone.norm_eval=False` changes all the BN modules in the model backbone to `train` mode.
- Update keys inside a list of configs.
Some config dicts are composed as a list in your config. For example, the training pipeline `data.train.pipeline` is normally a list
e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline,
you may specify `--cfg-options data.train.pipeline.0.type=DenseSampleFrames`.
- Update values of list/tuples.
If the value to be updated is a list or a tuple, e.g. the config file normally sets `workflow=[('train', 1)]`, and you want to
change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation mark " is necessary to
support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value.
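Under the hood, these `--cfg-options` pairs are parsed by `mmcv.DictAction` and merged into the loaded config with `Config.merge_from_dict`, as `demo/demo.py` (included further below) does. A minimal sketch of the same mechanism, assuming the ST-GCN config from this commit lives at `configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py` and an mmcv version whose `merge_from_dict` accepts list indices (which the `data.train.pipeline.0.type` example above relies on):

```python
from mmcv import Config

# Load the ST-GCN config shown earlier in this commit (path is an assumption).
cfg = Config.fromfile('configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py')

# Equivalent to passing on the command line:
#   --cfg-options data.videos_per_gpu=8 data.train.pipeline.0.clip_len=150
cfg.merge_from_dict({
    'data.videos_per_gpu': 8,               # update a plain dict key
    'data.train.pipeline.0.clip_len': 150,  # update a key inside a list of configs
})

print(cfg.data.videos_per_gpu)     # -> 8
print(cfg.data.train.pipeline[0])  # -> {'type': 'PaddingWithLoop', 'clip_len': 150}
```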
## Video demo
We provide a demo script to predict the recognition result using a single video. In order to get prediction results in the range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file. A minimal Python sketch of the underlying API call is given after the argument list below.
```shell
python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--fps {FPS}] [--font-scale {FONT_SCALE}] [--font-color {FONT_COLOR}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
Optional arguments:
- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30.
- `FONT_SCALE`: Font scale of the label added in the video. If not specified, it will be 0.5.
- `FONT_COLOR`: Font color of the label added in the video. If not specified, it will be `white`.
- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio.
- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bicubic`.
- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated.
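The demo is a thin wrapper around the Python API (see `demo/demo.py` further below). A minimal programmatic sketch with placeholder paths, which also sets `average_clips='prob'` so the printed scores fall in `[0, 1]`:

```python
import torch
from mmcv import Config

from mmaction.apis import inference_recognizer, init_recognizer

# Placeholder paths: any recognition config/checkpoint/label map works the same way.
config_file = 'configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'
checkpoint_file = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'

cfg = Config.fromfile(config_file)
cfg.model['test_cfg'] = dict(average_clips='prob')  # scores in [0, 1]

model = init_recognizer(cfg, checkpoint_file, device=torch.device('cuda:0'))
results = inference_recognizer(model, 'demo/demo.mp4')  # list of (label index, score)

labels = [x.strip() for x in open('tools/data/kinetics/label_map_k400.txt')]
for idx, score in results:
    print(f'{labels[idx]}: {score:.4f}')
```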
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize a video file as input by using a TSN model on cuda by default.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
2. Recognize a video file as input by using a TSN model on cuda by default, loading checkpoint from url.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
3. Recognize a list of rawframes as input by using a TSN model on cpu.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --device cpu
```
4. Recognize a video file as input by using a TSN model and then generate an mp4 file.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --out-filename demo/demo_out.mp4
```
5. Recognize a list of rawframes as input by using a TSN model and then generate a gif file.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --out-filename demo/demo_out.gif
```
6. Recognize a video file as input by using a TSN model, then generate an mp4 file with a given resolution and resize algorithm.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 340 256 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
# If either dimension is set to -1, the frames are resized by keeping the existing aspect ratio
# For --target-resolution 170 -1, original resolution (340, 256) -> target resolution (170, 128)
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 170 -1 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
7. Recognize a video file as input by using a TSN model, then generate an mp4 file with the label in red color and font scale 1.
```shell
# The demo.mp4 and label_map_k400.txt are both from Kinetics-400
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --font-scale 1 --font-color red \
--out-filename demo/demo_out.mp4
```
8. Recognize a list of rawframes as input by using a TSN model and then generate a gif file with 24 fps.
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --fps 24 --out-filename demo/demo_out.gif
```
## SpatioTemporal Action Detection Video Demo
We provide a demo script to predict the SpatioTemporal Action Detection result using a single video.
```shell
python demo/demo_spatiotemporal_det.py --video ${VIDEO_FILE} \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}]
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.5.
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/stdet_demo.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 4.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 6.
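The interplay of `--predict-stepsize`, `--output-stepsize` and `--output-fps` can be sanity-checked with a few lines of arithmetic (illustrative numbers only, not code from the demo script):

```python
# Sanity-check the stepsize arguments described above.
predict_stepsize = 8   # make a prediction every 8 frames
output_stepsize = 4    # keep 1 of every 4 input frames in the output video
output_fps = 6         # playback speed of the output video

# The demo requires the prediction stride to be a multiple of the output stride.
assert predict_stepsize % output_stepsize == 0

# For a hypothetical 30 fps, 10 s input clip:
total_frames = 30 * 10
num_predictions = total_frames // predict_stepsize    # 37 predictions
num_output_frames = total_frames // output_stepsize   # 75 frames written
output_duration = num_output_frames / output_fps      # 12.5 s of output video
print(num_predictions, num_output_frames, output_duration)
```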
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 8 frames, and output 1 frame for every 4 input frames. The FPS of the output video is 6.
```shell
python demo/demo_spatiotemporal_det.py --video demo/demo.mp4 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 8 \
--output-stepsize 4 \
--output-fps 6
```
## Video GradCAM Demo
We provide a demo script to visualize GradCAM results using a single video.
```shell
python demo/demo_gradcam.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--target-layer-name ${TARGET_LAYER_NAME}] [--fps {FPS}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
- `--use-frames`: If specified, the demo will take rawframes as input. Otherwise, it will take a video as input.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `FPS`: FPS value of the output video when using rawframes as input. If not specified, it will be set to 30.
- `OUT_FILE`: Path to the output file, which can be in video or gif format. If not specified, it will be set to `None` and no output file will be generated.
- `TARGET_LAYER_NAME`: Layer name to generate GradCAM localization map.
- `TARGET_RESOLUTION`: Resolution (desired_width, desired_height) for resizing the frames before output when using a video as input. If not specified, it will be None and the frames are resized by keeping the existing aspect ratio.
- `RESIZE_ALGORITHM`: Resize algorithm used for resizing. If not specified, it will be set to `bilinear`.
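Internally, `demo/demo_gradcam.py` (included later in this commit) builds the input clip with the model's test pipeline and feeds it to `mmaction.utils.GradCAM`. A condensed sketch of those core calls, with placeholder paths:

```python
import torch
from mmcv import Config
from mmcv.parallel import collate, scatter

from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.utils import GradCAM

# Placeholder paths, taken from example 2 above.
config_file = 'configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py'
checkpoint = 'checkpoints/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth'
video = 'demo/demo.mp4'

cfg = Config.fromfile(config_file)
model = init_recognizer(cfg, checkpoint, device=torch.device('cuda:0'))
device = next(model.parameters()).device

# Build the input clip with the model's own test pipeline
# (same logic as build_inputs() in demo/demo_gradcam.py, video-file branch only).
test_pipeline = Compose(cfg.data.test.pipeline)
data = dict(filename=video, label=-1,
            start_index=cfg.data.test.get('start_index', 0), modality='RGB')
data = test_pipeline(data)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
    data = scatter(data, [device])[0]

gradcam = GradCAM(model, 'backbone/layer4/1/relu')
results = gradcam(data)  # results[0]: frames blended with the localization maps
```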
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Get GradCAM results of an I3D model, using a video file as input and then generate a gif file with 10 fps.
```shell
python demo/demo_gradcam.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth demo/demo.mp4 \
--target-layer-name backbone/layer4/1/relu --fps 10 \
--out-filename demo/demo_gradcam.gif
```
2. Get GradCAM results of a TSM model, using a video file as input and then generate a gif file, loading checkpoint from url.
```shell
python demo/demo_gradcam.py configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth \
demo/demo.mp4 --target-layer-name backbone/layer4/1/relu --out-filename demo/demo_gradcam_tsm.gif
```
## Webcam demo
We provide a demo script to implement real-time action recognition from a web camera. In order to get prediction results in the range `[0, 1]`, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file.
```shell
python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \
[--device ${DEVICE_TYPE}] [--camera-id ${CAMERA_ID}] [--threshold ${THRESHOLD}] \
[--average-size ${AVERAGE_SIZE}] [--drawing-fps ${DRAWING_FPS}] [--inference-fps ${INFERENCE_FPS}]
```
Optional arguments:
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `CAMERA_ID`: ID of the camera device. If not specified, it will be set to 0.
- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0.
- `AVERAGE_SIZE`: Number of latest clips to be averaged for prediction. If not specified, it will be set to 1.
- `DRAWING_FPS`: Upper bound FPS value of the output drawing. If not specified, it will be set to 20.
- `INFERENCE_FPS`: Upper bound FPS value of model inference. If not specified, it will be set to 4.
:::{note}
If your hardware is good enough, increasing the values of `DRAWING_FPS` and `INFERENCE_FPS` will give a better experience.
:::
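A rough sketch of the score-averaging idea behind `--average-size` and `--threshold` (a hypothetical helper for illustration, not the actual `demo/webcam_demo.py` code):

```python
from collections import deque

import numpy as np

AVERAGE_SIZE = 5   # number of latest clips to average (--average-size)
THRESHOLD = 0.2    # minimum averaged score to display a label (--threshold)

score_cache = deque(maxlen=AVERAGE_SIZE)


def update(scores, labels):
    """Average the newest clip scores and return labels above the threshold."""
    score_cache.append(np.asarray(scores))
    mean_scores = np.mean(np.stack(list(score_cache)), axis=0)
    order = mean_scores.argsort()[::-1]
    return [(labels[i], mean_scores[i]) for i in order if mean_scores[i] > THRESHOLD]
```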
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2.
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth tools/data/kinetics/label_map_k400.txt --average-size 5 \
--threshold 0.2 --device cpu
```
2. Recognize the action from a web camera as input by using a TSN model on cpu, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
tools/data/kinetics/label_map_k400.txt --average-size 5 --threshold 0.2 --device cpu
```
3. Recognize the action from a web camera as input by using an I3D model on gpu by default, averaging the scores of the latest 5 clips
and outputting result labels with scores higher than 0.2.
```shell
python demo/webcam_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth tools/data/kinetics/label_map_k400.txt \
--average-size 5 --threshold 0.2
```
:::{note}
Considering the differences in users' hardware, some modifications may be needed to suit each case.
Users can change:
1). The `SampleFrames` step (especially `clip_len` and `num_clips`) of `test_pipeline` in the config file, like `--cfg-options data.test.pipeline.0.num_clips=3`.
2). The crop method in `test_pipeline` of the config file, e.g. `TenCrop`, `ThreeCrop`, `CenterCrop`, like `--cfg-options data.test.pipeline.4.type=CenterCrop`.
3). The value of `--average-size`. The smaller, the faster.
:::
## Long video demo
We provide a demo script to predict different labels using a single long video. In order to get prediction results in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
```shell
python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \
${OUT_FILE} [--input-step ${INPUT_STEP}] [--device ${DEVICE_TYPE}] [--threshold ${THRESHOLD}]
```
Optional arguments:
- `OUT_FILE`: Path to the output file, which can be a video file or a json file.
- `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
- `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
- `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with scores higher than the threshold will be shown. If not specified, it will be set to 0.01.
- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames (`sample_length` indicates the size of temporal window from which you sample frames, which equals to `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. The desired value of `STRIDE` is (0, 1\], while it also works for `STRIDE > 1` (the generated predictions will be too sparse). Default: 0.
- `LABEL_COLOR`: Font Color of the labels in (B, G, R). Default is white, that is (256, 256, 256).
- `MSG_COLOR`: Font Color of the messages in (B, G, R). Default is gray, that is (128, 128, 128).
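For the `STRIDE` option above, the prediction interval works out as plain arithmetic (values are illustrative):

```python
# sample_length is the temporal window each prediction looks at.
clip_len, frame_interval = 32, 2
sample_length = clip_len * frame_interval   # 64 frames

stride = 0.5
# STRIDE = 0 means a prediction for every frame.
prediction_interval = int(stride * sample_length) if stride > 0 else 1
print(prediction_interval)  # -> 32, i.e. a prediction every 32 frames
```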
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Predict different labels in a long video by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \
--input-step 3 --device cpu --threshold 0.2
```
2. Predict different labels in a long video by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
3. Predict different labels in a long video from the web by using a TSN model on cpu, with an input step of 3 frames (that is, randomly sample one frame from every 3 frames)
and outputting result labels with scores higher than 0.2, loading checkpoint from url.
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4 \
tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
4. Predict different labels in a long video by using an I3D model on gpu, with the default input_step=1 and threshold=0.01, and print the labels in cyan.
```shell
python demo/long_video_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \
--label-color 255 255 0
```
5. Predict different labels in a long video by using an I3D model on gpu and save the results as a `json` file.
```shell
python demo/long_video_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt ./results.json
```
## SpatioTemporal Action Detection Webcam Demo
We provide a demo script to implement real-time spatio-temporal action detection from a web camera.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--input-video ${INPUT_VIDEO}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--output-fps ${OUTPUT_FPS}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--show] \
[--display-height ${DISPLAY_HEIGHT}] \
[--display-width ${DISPLAY_WIDTH}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--clip-vis-length ${CLIP_VIS_LENGTH}]
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: The spatiotemporal action detection checkpoint path or URL.
- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Default: 0.4.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `INPUT_VIDEO`: The webcam id or video path of the source. Default: `0`.
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 15.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: None.
- `--show`: Whether to show predictions with `cv2.imshow`.
- `DISPLAY_HEIGHT`: The height of the display frame. Default: 0.
- `DISPLAY_WIDTH`: The width of the display frame. Default: 0. If `DISPLAY_HEIGHT <= 0 and DISPLAY_WIDTH <= 0`, the display frame and input video share the same shape.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `CLIP_VIS_LENGTH`: The number of frames drawn for each clip. In other words, for each clip, at most `CLIP_VIS_LENGTH` frames around the keyframe will be drawn. Default: 8.
Tips to get a better experience with the webcam demo:
- How to choose `--output-fps`?
- `--output-fps` should be almost equal to read thread fps.
- Read thread fps is printed by logger in format `DEBUG:__main__:Read Thread: {duration} ms, {fps} fps`
- How to choose `--predict-stepsize`?
- It's related to the choice of the human detector and the spatio-temporal model.
- Overall, the duration of the read thread for each task should be greater than or equal to that of model inference.
- The durations for read/inference are both printed by logger.
- Larger `--predict-stepsize` leads to larger duration for read thread.
- In order to take full advantage of the computation resources, decrease the value of `--predict-stepsize`.
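Putting the tips above together, a back-of-the-envelope way to pick `--predict-stepsize` (the numbers are illustrative; read them off your own logger output):

```python
import math

# Illustrative numbers: both durations are printed by the demo's logger.
read_fps = 20          # frames per second gathered by the read thread
inference_time = 0.35  # seconds per model inference

# The read thread needs predict_stepsize / read_fps seconds per task, and this
# should be >= inference_time, so the smallest workable stepsize is:
predict_stepsize = math.ceil(read_fps * inference_time)
print(predict_stepsize)  # -> 7, so e.g. --predict-stepsize 8
```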
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make predictions every 40 frames, and the FPS of the output is 20. Show predictions with `cv2.imshow`.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
--input-video 0 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 40 \
--output-fps 20 \
--show
```
## Skeleton-based Action Recognition Demo
We provide a demo script to predict the skeleton-based action recognition result using a single video.
```shell
python demo/demo_skeleton.py ${VIDEO_FILE} ${OUT_FILENAME} \
[--config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--short-side ${SHORT_SIDE}]
```
Optional arguments:
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Default: 0.9.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint).
- `LABEL_MAP`: The label map used. Default: `tools/data/ava/label_map.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `SHORT_SIDE`: The short side used for frame extraction. Default: 480.
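Internally, the demo converts the detector and pose-estimator outputs into a `PoseDataset`-style annotation before calling the recognizer. A rough sketch of that intermediate structure, with illustrative shapes; the field names follow the pose annotation format used by `PoseDataset` (e.g. the `.pkl` files referenced in the configs above), so treat it as an approximation and check `demo/demo_skeleton.py` for the exact dict it builds:

```python
import numpy as np

# Illustrative sizes: M persons, T frames, V COCO keypoints per person.
M, T, V = 2, 72, 17

fake_anno = dict(
    frame_dir='',                    # unused when keypoints are given directly
    label=-1,                        # unknown at inference time
    img_shape=(480, 854),            # (height, width) after rescaling the short side
    original_shape=(480, 854),
    start_index=0,
    modality='Pose',
    total_frames=T,
    keypoint=np.zeros((M, T, V, 2), dtype=np.float16),    # (x, y) per joint
    keypoint_score=np.ones((M, T, V), dtype=np.float16),  # pose-estimator confidence
)
# The recognizer is then called on this dict, e.g.:
#   results = inference_recognizer(model, fake_anno)
```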
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D-NTURGB+D-120-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint/slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
2. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, STGCN-NTURGB+D-60-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint/stgcn_80e_ntu60_xsub_keypoint-e7bb9653.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
## Video Structuralize Demo
We provide a demo script to predict the skeleton-based and rgb-based action recognition and spatio-temporal action detection results using a single video.
```shell
python demo/demo_video_structuralize.py \
[--rgb-stdet-config ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--rgb-stdet-checkpoint ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--skeleton-stdet-checkpoint ${SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--skeleton-config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--skeleton-checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--rgb-config ${RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--rgb-checkpoint ${RGB_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--use-skeleton-stdet ${USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD}] \
[--use-skeleton-recog ${USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRE}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRE}] \
[--video ${VIDEO_FILE}] \
[--label-map-stdet ${LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}] \
[--cfg-options]
```
Optional arguments:
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The rgb-based spatio-temporal action detection config file path.
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The rgb-based spatio-temporal action detection checkpoint path or URL.
- `SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: The skeleton-based spatio-temporal action detection checkpoint path or URL.
- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: The human pose estimation config file path (trained on COCO-Keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: The human pose estimation checkpoint URL (trained on COCO-Keypoint).
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The skeleton-based action recognition config file path.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: The skeleton-based action recognition checkpoint path or URL.
- `RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE`: The rgb-based action recognition config file path.
- `RGB_BASED_ACTION_RECOGNITION_CHECKPOINT`: The rgb-based action recognition checkpoint path or URL.
- `USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD`: Whether to use the skeleton-based spatio-temporal action detection method.
- `USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD`: Whether to use the skeleton-based action recognition method.
- `HUMAN_DETECTION_SCORE_THRE`: The score threshold for human detection. Default: 0.9.
- `ACTION_DETECTION_SCORE_THRE`: The score threshold for action detection. Default: 0.4.
- `LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION`: The label map used for spatio-temporal action detection. Default: `tools/data/ava/label_map.txt`.
- `LABEL_MAP`: The label map for action recognition. Default: `tools/data/kinetics/label_map_k400.txt`.
- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Default: `cuda:0`.
- `OUTPUT_FILENAME`: Path to the output file which is a video format. Default: `demo/test_stdet_recognition_output.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction per N frames. Default: 8.
- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Default: 1.
- `OUTPUT_FPS`: The FPS of demo video output. Default: 24.
Examples:
Assume that you are located at `$MMACTION2`.
1. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, and PoseC3D as both the skeleton-based action recognizer and the skeleton-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-stdet \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
2. Use the Faster RCNN as the human detector, TSN-R50-1x1x3 as the rgb-based action recognizer, and SlowOnly-8x8-R101 as the rgb-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
3. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer, and SlowOnly-8x8-R101 as the rgb-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
4. Use the Faster RCNN as the human detector, HRNetw32 as the pose estimator, TSN-R50-1x1x3 as the rgb-based action recognizer, and PoseC3D as the skeleton-based spatio-temporal action detector. Make action detection predictions every 8 frames and output every frame to the output video. The FPS of the output video is 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--use-skeleton-stdet \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
## Audio Demo
We provide a demo script to predict the audio-based action recognition result using a single audio feature file.
The script `extract_audio.py` can be used to extract audio from videos, and the script `build_audio_features.py` can be used to extract the audio features.
```shell
python demo/demo_audio.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${AUDIO_FILE} ${LABEL_FILE} [--device ${DEVICE}]
```
Optional arguments:
- `DEVICE`: Type of device to run the demo. Allowed values are cuda devices like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
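Programmatically, the audio demo boils down to calling `inference_recognizer` on a `.npy` feature file, as `demo/demo_audio.py` below shows. A minimal sketch with placeholder paths:

```python
import torch
from mmcv import Config

from mmaction.apis import inference_recognizer, init_recognizer

cfg = Config.fromfile(
    'configs/recognition_audio/resnet/tsn_r18_64x1x1_100e_kinetics400_audio_feature.py')
model = init_recognizer(
    cfg,
    'checkpoints/tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth',
    device=torch.device('cuda:0'))

# Only pre-extracted .npy features are supported, as in demo/demo_audio.py.
results = inference_recognizer(model, 'audio_feature.npy')

labels = [x.strip() for x in open('tools/data/kinetics/label_map_k400.txt')]
for idx, score in results:
    print(f'{labels[idx]}: {score}')
```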
Examples:
Assume that you are located at `$MMACTION2` and have already downloaded the checkpoints to the directory `checkpoints/`,
or use a checkpoint URL from `configs/` to directly load the corresponding checkpoint, which will be automatically saved in `$HOME/.cache/torch/checkpoints`.
1. Recognize an audio feature file as input by using a TSN model on cuda by default.
```shell
python demo/demo_audio.py \
configs/recognition_audio/resnet/tsn_r18_64x1x1_100e_kinetics400_audio_feature.py \
https://download.openmmlab.com/mmaction/recognition/audio_recognition/tsn_r18_64x1x1_100e_kinetics400_audio_feature/tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth \
audio_feature.npy label_map_k400.txt
```
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"from mmaction.apis import init_recognizer, inference_recognizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"config_file = '../configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'\n",
"# download the checkpoint from model zoo and put it in `checkpoints/`\n",
"checkpoint_file = '../checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"# build the model from a config file and a checkpoint file\n",
"model = init_recognizer(config_file, checkpoint_file, device='cpu')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"# test a single video and show the result:\n",
"video = 'demo.mp4'\n",
"label = '../../tools/data/kinetics/label_map_k400.txt'\n",
"results = inference_recognizer(model, video)\n",
"\n",
"labels = open(label).readlines()\n",
"labels = [x.strip() for x in labels]\n",
"results = [(labels[k[0]], k[1]) for k in results]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"arm wrestling: 29.61644\n",
"rock scissors paper: 10.754839\n",
"shaking hands: 9.9084\n",
"clapping: 9.189912\n",
"massaging feet: 8.305307\n"
]
}
],
"source": [
"# show the results\n",
"for result in results:\n",
" print(f'{result[0]}: ', result[1])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import cv2
import numpy as np
import torch
import webcolors
from mmcv import Config, DictAction
from mmaction.apis import inference_recognizer, init_recognizer
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video', help='video file/url or rawframes directory')
parser.add_argument('label', help='label file')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--use-frames',
default=False,
action='store_true',
help='whether to use rawframes as input')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--fps',
default=30,
type=int,
help='specify fps value of the output video when using rawframes to '
'generate file')
parser.add_argument(
'--font-scale',
default=0.5,
type=float,
help='font scale of the label in output video')
parser.add_argument(
'--font-color',
default='white',
help='font color of the label in output video')
parser.add_argument(
'--target-resolution',
nargs=2,
default=None,
type=int,
help='Target resolution (w, h) for resizing the frames when using a '
'video as input. If either dimension is set to -1, the frames are '
'resized by keeping the existing aspect ratio')
parser.add_argument(
'--resize-algorithm',
default='bicubic',
help='resize algorithm applied to generate video')
parser.add_argument('--out-filename', default=None, help='output filename')
args = parser.parse_args()
return args
def get_output(video_path,
out_filename,
label,
fps=30,
font_scale=0.5,
font_color='white',
target_resolution=None,
resize_algorithm='bicubic',
use_frames=False):
"""Get demo output using ``moviepy``.
This function will generate video file or gif file from raw video or
frames, by using ``moviepy``. For more information of some parameters,
you can refer to: https://github.com/Zulko/moviepy.
Args:
video_path (str): The video file path or the rawframes directory path.
If ``use_frames`` is set to True, it should be rawframes directory
path. Otherwise, it should be video file path.
out_filename (str): Output filename for the generated file.
label (str): Predicted label of the generated file.
fps (int): Number of picture frames to read per second. Default: 30.
font_scale (float): Font scale of the label. Default: 0.5.
font_color (str): Font color of the label. Default: 'white'.
target_resolution (None | tuple[int | None]): Set to
(desired_width desired_height) to have resized frames. If either
dimension is None, the frames are resized by keeping the existing
aspect ratio. Default: None.
resize_algorithm (str): Support "bicubic", "bilinear", "neighbor",
"lanczos", etc. Default: 'bicubic'. For more information,
see https://ffmpeg.org/ffmpeg-scaler.html
use_frames (bool): Whether to use rawframes as input. Default: False.
"""
if video_path.startswith(('http://', 'https://')):
raise NotImplementedError
try:
# In case of a segmentation fault when importing decord at the top of the demo
import decord
from moviepy.editor import ImageSequenceClip
except ImportError:
raise ImportError('Please install moviepy to enable output file.')
# Channel Order is BGR
if use_frames:
frame_list = sorted(
[osp.join(video_path, x) for x in os.listdir(video_path)])
frames = [cv2.imread(x) for x in frame_list]
else:
video = decord.VideoReader(video_path)
frames = [x.asnumpy()[..., ::-1] for x in video]
if target_resolution:
w, h = target_resolution
frame_h, frame_w, _ = frames[0].shape
if w == -1:
w = int(h / frame_h * frame_w)
if h == -1:
h = int(w / frame_w * frame_h)
frames = [cv2.resize(f, (w, h)) for f in frames]
textsize = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, font_scale,
1)[0]
textheight = textsize[1]
padding = 10
location = (padding, padding + textheight)
if isinstance(font_color, str):
font_color = webcolors.name_to_rgb(font_color)[::-1]
frames = [np.array(frame) for frame in frames]
for frame in frames:
cv2.putText(frame, label, location, cv2.FONT_HERSHEY_DUPLEX,
font_scale, font_color, 1)
# RGB order
frames = [x[..., ::-1] for x in frames]
video_clips = ImageSequenceClip(frames, fps=fps)
out_type = osp.splitext(out_filename)[1][1:]
if out_type == 'gif':
video_clips.write_gif(out_filename)
else:
video_clips.write_videofile(out_filename, remove_temp=True)
def main():
args = parse_args()
# assign the desired device.
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
# build the recognizer from a config file and checkpoint file/url
model = init_recognizer(cfg, args.checkpoint, device=device)
# e.g. use ('backbone', ) to return backbone feature
output_layer_names = None
# test a single video or rawframes of a single video
if output_layer_names:
results, returned_feature = inference_recognizer(
model, args.video, outputs=output_layer_names)
else:
results = inference_recognizer(model, args.video)
labels = open(args.label).readlines()
labels = [x.strip() for x in labels]
results = [(labels[k[0]], k[1]) for k in results]
print('The top-5 labels with corresponding scores are:')
for result in results:
print(f'{result[0]}: ', result[1])
if args.out_filename is not None:
if args.target_resolution is not None:
if args.target_resolution[0] == -1:
assert isinstance(args.target_resolution[1], int)
assert args.target_resolution[1] > 0
if args.target_resolution[1] == -1:
assert isinstance(args.target_resolution[0], int)
assert args.target_resolution[0] > 0
args.target_resolution = tuple(args.target_resolution)
get_output(
args.video,
args.out_filename,
results[0][0],
fps=args.fps,
font_scale=args.font_scale,
font_color=args.font_color,
target_resolution=args.target_resolution,
resize_algorithm=args.resize_algorithm,
use_frames=args.use_frames)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from mmcv import Config, DictAction
from mmaction.apis import inference_recognizer, init_recognizer
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('audio', help='audio file')
parser.add_argument('label', help='label file')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
args = parser.parse_args()
return args
def main():
args = parse_args()
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
model = init_recognizer(cfg, args.checkpoint, device=device)
if not args.audio.endswith('.npy'):
raise NotImplementedError('Demo works on extracted audio features')
results = inference_recognizer(model, args.audio)
labels = open(args.label).readlines()
labels = [x.strip() for x in labels]
results = [(labels[k[0]], k[1]) for k in results]
print('Scores:')
for result in results:
print(f'{result[0]}: ', result[1])
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.parallel import collate, scatter
from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.utils import GradCAM
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 GradCAM demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video', help='video file/url or rawframes directory')
parser.add_argument(
'--use-frames',
default=False,
action='store_true',
help='whether to use rawframes as input')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--target-layer-name',
type=str,
default='backbone/layer4/1/relu',
help='GradCAM target layer name')
parser.add_argument('--out-filename', default=None, help='output filename')
parser.add_argument('--fps', default=5, type=int)
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--target-resolution',
nargs=2,
default=None,
type=int,
help='Target resolution (w, h) for resizing the frames when using a '
'video as input. If either dimension is set to -1, the frames are '
'resized by keeping the existing aspect ratio')
parser.add_argument(
'--resize-algorithm',
default='bilinear',
help='resize algorithm applied to generate video & gif')
args = parser.parse_args()
return args
def build_inputs(model, video_path, use_frames=False):
"""build inputs for GradCAM.
Note that, building inputs for GradCAM is exactly the same as building
inputs for Recognizer test stage. Codes from `inference_recognizer`.
Args:
model (nn.Module): Recognizer model.
video_path (str): video file/url or rawframes directory.
use_frames (bool): whether to use rawframes as input.
Returns:
dict: Both GradCAM inputs and Recognizer test stage inputs,
including two keys, ``imgs`` and ``label``.
"""
if not (osp.exists(video_path) or video_path.startswith('http')):
raise RuntimeError(f"'{video_path}' is missing")
if osp.isfile(video_path) and use_frames:
raise RuntimeError(
f"'{video_path}' is a video file, not a rawframe directory")
if osp.isdir(video_path) and not use_frames:
raise RuntimeError(
f"'{video_path}' is a rawframe directory, not a video file")
cfg = model.cfg
device = next(model.parameters()).device # model device
# build the data pipeline
test_pipeline = cfg.data.test.pipeline
test_pipeline = Compose(test_pipeline)
# prepare data
if use_frames:
filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
modality = cfg.data.test.get('modality', 'RGB')
start_index = cfg.data.test.get('start_index', 1)
data = dict(
frame_dir=video_path,
total_frames=len(os.listdir(video_path)),
label=-1,
start_index=start_index,
filename_tmpl=filename_tmpl,
modality=modality)
else:
start_index = cfg.data.test.get('start_index', 0)
data = dict(
filename=video_path,
label=-1,
start_index=start_index,
modality='RGB')
data = test_pipeline(data)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
# scatter to specified GPU
data = scatter(data, [device])[0]
return data
def _resize_frames(frame_list,
scale,
keep_ratio=True,
interpolation='bilinear'):
"""resize frames according to given scale.
Codes are modified from `mmaction2/datasets/pipelines/augmentation.py`,
`Resize` class.
Args:
frame_list (list[np.ndarray]): frames to be resized.
scale (tuple[int]): If keep_ratio is True, it serves as scaling
factor or maximum size: the image will be rescaled as large
as possible within the scale. Otherwise, it serves as (w, h)
of output size.
keep_ratio (bool): If set to True, Images will be resized without
changing the aspect ratio. Otherwise, it will resize images to a
given size. Default: True.
interpolation (str): Algorithm used for interpolation:
"nearest" | "bilinear". Default: "bilinear".
Returns:
        list[np.ndarray]: The resized frames.
"""
if scale is None or (scale[0] == -1 and scale[1] == -1):
return frame_list
scale = tuple(scale)
max_long_edge = max(scale)
max_short_edge = min(scale)
if max_short_edge == -1:
scale = (np.inf, max_long_edge)
img_h, img_w, _ = frame_list[0].shape
if keep_ratio:
new_w, new_h = mmcv.rescale_size((img_w, img_h), scale)
else:
new_w, new_h = scale
frame_list = [
mmcv.imresize(img, (new_w, new_h), interpolation=interpolation)
for img in frame_list
]
return frame_list
def main():
args = parse_args()
# assign the desired device.
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
# build the recognizer from a config file and checkpoint file/url
model = init_recognizer(cfg, args.checkpoint, device=device)
inputs = build_inputs(model, args.video, use_frames=args.use_frames)
gradcam = GradCAM(model, args.target_layer_name)
results = gradcam(inputs)
if args.out_filename is not None:
try:
from moviepy.editor import ImageSequenceClip
except ImportError:
raise ImportError('Please install moviepy to enable output file.')
# frames_batches shape [B, T, H, W, 3], in RGB order
frames_batches = (results[0] * 255.).numpy().astype(np.uint8)
frames = frames_batches.reshape(-1, *frames_batches.shape[-3:])
frame_list = list(frames)
frame_list = _resize_frames(
frame_list,
args.target_resolution,
interpolation=args.resize_algorithm)
video_clips = ImageSequenceClip(frame_list, fps=args.fps)
out_type = osp.splitext(args.out_filename)[1][1:]
if out_type == 'gif':
video_clips.write_gif(args.out_filename)
else:
video_clips.write_videofile(args.out_filename, remove_temp=True)
if __name__ == '__main__':
main()
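# A small sketch (not part of the original demo) of how `_resize_frames`
# treats the `--target-resolution` argument: passing -1 for one side keeps
# the aspect ratio, as described in the help text above. The frames are
# synthetic arrays; only the resulting shapes are of interest.
def _demo_resize_frames():
    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(2)]
    # Short side capped at 160 pixels, aspect ratio preserved.
    resized = _resize_frames(frames, (160, -1))
    print('original:', frames[0].shape, '-> resized:', resized[0].shape)
# _demo_resize_frames()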
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import shutil
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmaction.apis import inference_recognizer, init_recognizer
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_detector` and '
                      '`init_detector` from `mmdet.apis`. These apis are '
                      'required in this demo!')
try:
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
vis_pose_result)
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_top_down_pose_model`, '
                      '`init_pose_model`, and `vis_pose_result` from '
                      '`mmpose.apis`. These apis are required in this demo!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.75
FONTCOLOR = (255, 255, 255) # BGR, white
THICKNESS = 1
LINETYPE = 1
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument('video', help='video file/url')
parser.add_argument('out_filename', help='output filename')
parser.add_argument(
'--config',
default=('configs/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'),
help='skeleton model config file path')
parser.add_argument(
'--checkpoint',
default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth'),
help='skeleton model checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--pose-config',
default='demo/hrnet_w32_coco_256x192.py',
help='human pose estimation config file path (from mmpose)')
parser.add_argument(
'--pose-checkpoint',
default=('https://download.openmmlab.com/mmpose/top_down/hrnet/'
'hrnet_w32_coco_256x192-c78dce93_20200708.pth'),
help='human pose estimation checkpoint file/url')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--label-map',
default='tools/data/skeleton/label_map_ntu120.txt',
help='label map file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--short-side',
type=int,
default=480,
help='specify the short-side length of the image')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path, short_side):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
new_h, new_w = None, None
while flag:
if new_h is None:
h, w, _ = frame.shape
new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))
frame = mmcv.imresize(frame, (new_w, new_h))
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def pose_inference(args, frame_paths, det_results):
    """Run top-down pose estimation on the detected boxes of every frame."""
model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
ret = []
print('Performing Human Pose Estimation for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for f, d in zip(frame_paths, det_results):
# Align input format
d = [dict(bbox=x) for x in list(d)]
pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
ret.append(pose)
prog_bar.update()
return ret
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video,
args.short_side)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# Get clip_len, frame_interval and calculate center index of each clip
config = mmcv.Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
for component in config.data.test.pipeline:
if component['type'] == 'PoseNormalize':
component['mean'] = (w // 2, h // 2, .5)
component['max_value'] = (w, h, 1.)
model = init_recognizer(config, args.checkpoint, args.device)
# Load label_map
label_map = [x.strip() for x in open(args.label_map).readlines()]
# Get Human detection results
det_results = detection_inference(args, frame_paths)
torch.cuda.empty_cache()
pose_results = pose_inference(args, frame_paths, det_results)
torch.cuda.empty_cache()
fake_anno = dict(
frame_dir='',
label=-1,
img_shape=(h, w),
original_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = max([len(x) for x in pose_results])
num_keypoint = 17
keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
dtype=np.float16)
keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
dtype=np.float16)
for i, poses in enumerate(pose_results):
for j, pose in enumerate(poses):
pose = pose['keypoints']
keypoint[j, i] = pose[:, :2]
keypoint_score[j, i] = pose[:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
results = inference_recognizer(model, fake_anno)
action_label = label_map[results[0][0]]
pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
vis_frames = [
vis_pose_result(pose_model, frame_paths[i], pose_results[i])
for i in range(num_frame)
]
for frame in vis_frames:
cv2.putText(frame, action_label, (10, 30), FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24)
vid.write_videofile(args.out_filename, remove_temp=True)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
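# A toy sketch (with fabricated pose results, not real model output) of how
# `main()` above packs per-frame pose estimates into the dense arrays fed to
# the skeleton recognizer: `keypoint` has shape (M, T, V, 2) and
# `keypoint_score` has shape (M, T, V), where M is the largest number of
# people seen in any frame, T the number of frames and V the 17 COCO joints.
def _demo_pack_keypoints(pose_results, num_frame, num_keypoint=17):
    num_person = max(len(x) for x in pose_results)
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for t, poses in enumerate(pose_results):
        for m, pose in enumerate(poses):
            kpts = pose['keypoints']  # (17, 3) array of x, y, score
            keypoint[m, t] = kpts[:, :2]
            keypoint_score[m, t] = kpts[:, 2]
    return keypoint, keypoint_score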
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy as cp
import os
import os.path as osp
import shutil
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmcv.runner import load_checkpoint
from mmaction.models import build_detector
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
raise ImportError('Failed to import `inference_detector` and '
                      '`init_detector` from `mmdet.apis`. These apis are '
                      'required in this demo!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]
def visualize(frames, annotations, plate=plate_blue, max_num=5):
"""Visualize frames with predicted annotations.
Args:
frames (list[np.ndarray]): Frames for visualization, note that
len(frames) % len(annotations) should be 0.
annotations (list[list[tuple]]): The predicted results.
        plate (list[tuple]): The color plate used for visualization.
            Default: plate_blue.
max_num (int): Max number of labels to visualize for a person box.
Default: 5.
Returns:
list[np.ndarray]: Visualized frames.
"""
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_ = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
scale_ratio = np.array([w, h, w, h])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_[ind]
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, str(score[k])])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument(
'--config',
default=('configs/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py'),
help='spatio temporal detection config file path')
parser.add_argument(
'--checkpoint',
default=('https://download.openmmlab.com/mmaction/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb'
'_20201217-16378594.pth'),
help='spatio temporal detection checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--action-score-thr',
type=float,
default=0.5,
help='the threshold of human action score')
parser.add_argument('--video', help='video file/url')
parser.add_argument(
'--label-map',
default='tools/data/ava/label_map.txt',
help='label map file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--out-filename',
default='demo/stdet_demo.mp4',
help='output filename')
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a prediction per n frames')
parser.add_argument(
'--output-stepsize',
default=4,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
parser.add_argument(
'--output-fps',
default=6,
type=int,
help='the fps of demo video output')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
while flag:
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def load_label_map(file_path):
"""Load Label Map.
Args:
file_path (str): The file path of label map.
Returns:
dict: The label map (int -> label name).
"""
lines = open(file_path).readlines()
lines = [x.strip().split(': ') for x in lines]
return {int(x[0]): x[1] for x in lines}
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
def pack_result(human_detection, result, img_h, img_w):
"""Short summary.
Args:
human_detection (np.ndarray): Human detection result.
result (type): The predicted label of each human proposal.
img_h (int): The image height.
img_w (int): The image width.
Returns:
tuple: Tuple of human proposal, label name and label score.
"""
human_detection[:, 0::2] /= img_w
human_detection[:, 1::2] /= img_h
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append(
(prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
for x in res]))
return results
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# resize frames to shortside 256
new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h
# Get clip_len, frame_interval and calculate center index of each clip
config = mmcv.Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
val_pipeline = config.data.val.pipeline
sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
# Note that it's 1 based here
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
# Load label_map
label_map = load_label_map(args.label_map)
try:
if config['data']['train']['custom_classes'] is not None:
label_map = {
id + 1: label_map[cls]
for id, cls in enumerate(config['data']['train']
['custom_classes'])
}
except KeyError:
pass
# Get Human detection results
center_frames = [frame_paths[ind - 1] for ind in timestamps]
human_detections = detection_inference(args, center_frames)
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
# Get img_norm_cfg
img_norm_cfg = config['img_norm_cfg']
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
to_bgr = img_norm_cfg.pop('to_bgr')
img_norm_cfg['to_rgb'] = to_bgr
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
# Build STDET model
try:
# In our spatiotemporal detection demo, different actions should have
# the same number of bboxes.
config['model']['test_cfg']['rcnn']['action_thr'] = .0
except KeyError:
pass
config.model.backbone.pretrained = None
model = build_detector(config.model, test_cfg=config.get('test_cfg'))
load_checkpoint(model, args.checkpoint, map_location='cpu')
model.to(args.device)
model.eval()
predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
assert len(timestamps) == len(human_detections)
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp, proposal in zip(timestamps, human_detections):
if proposal.shape[0] == 0:
predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
_ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
# THWC -> CTHW -> 1CTHW
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(args.device)
with torch.no_grad():
result = model(
return_loss=False,
img=[input_tensor],
img_metas=[[dict(img_shape=(new_h, new_w))]],
proposals=[[proposal]])
result = result[0]
prediction = []
# N proposals
for i in range(proposal.shape[0]):
prediction.append([])
# Perform action score thr
for i in range(len(result)):
if i + 1 not in label_map:
continue
for j in range(proposal.shape[0]):
if result[i][j, 4] > args.action_score_thr:
prediction[j].append((label_map[i + 1], result[i][j,
4]))
predictions.append(prediction)
prog_bar.update()
results = []
for human_detection, prediction in zip(human_detections, predictions):
results.append(pack_result(human_detection, prediction, new_h, new_w))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
frames = [
cv2.imread(frame_paths[i - 1])
for i in dense_timestamps(timestamps, dense_n)
]
print('Performing visualization')
vis_frames = visualize(frames, results)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
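# A standalone sketch of the clip-sampling arithmetic used in `main()` above:
# for every prediction timestamp (a 1-based centre frame), a window of
# `clip_len * frame_interval` frames is gathered around it. The default
# numbers below are illustrative, not tied to a specific config.
def _demo_clip_sampling(num_frame=100, clip_len=8, frame_interval=8,
                        predict_stepsize=8):
    window_size = clip_len * frame_interval
    # Centre timestamps that leave a full half-window on both sides.
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           predict_stepsize)
    clips = []
    for timestamp in timestamps:
        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        clips.append(list(frame_inds - 1))  # back to 0-based frame indices
    return timestamps, clips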
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy as cp
import os
import os.path as osp
import shutil
import warnings
import cv2
import mmcv
import numpy as np
import torch
from mmcv import DictAction
from mmcv.runner import load_checkpoint
from mmaction.apis import inference_recognizer
from mmaction.datasets.pipelines import Compose
from mmaction.models import build_detector, build_model, build_recognizer
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
warnings.warn('Failed to import `inference_detector` and `init_detector` '
                  'from `mmdet.apis`. These apis are required in '
                  'skeleton-based applications!')
try:
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
vis_pose_result)
except (ImportError, ModuleNotFoundError):
warnings.warn('Failed to import `inference_top_down_pose_model`, '
                  '`init_pose_model`, and `vis_pose_result` from '
                  '`mmpose.apis`. These apis are required in skeleton-based '
                  'applications!')
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
PLATEBLUE = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
PLATEBLUE = PLATEBLUE.split('-')
PLATEBLUE = [hex2color(h) for h in PLATEBLUE]
PLATEGREEN = '004b23-006400-007200-008000-38b000-70e000'
PLATEGREEN = PLATEGREEN.split('-')
PLATEGREEN = [hex2color(h) for h in PLATEGREEN]
def visualize(frames,
annotations,
pose_results,
action_result,
pose_model,
plate=PLATEBLUE,
max_num=5):
"""Visualize frames with predicted annotations.
Args:
frames (list[np.ndarray]): Frames for visualization, note that
len(frames) % len(annotations) should be 0.
annotations (list[list[tuple]]): The predicted spatio-temporal
detection results.
        pose_results (list[list[tuple]]): The pose results.
        action_result (str): The predicted action recognition result.
        pose_model (nn.Module): The constructed pose model.
        plate (list[tuple]): The color plate used for visualization.
            Default: PLATEBLUE.
max_num (int): Max number of labels to visualize for a person box.
Default: 5.
Returns:
list[np.ndarray]: Visualized frames.
"""
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_ = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
scale_ratio = np.array([w, h, w, h])
# add pose results
if pose_results:
for i in range(nf):
frames_[i] = vis_pose_result(pose_model, frames_[i],
pose_results[i])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_[ind]
# add action result for whole video
cv2.putText(frame, action_result, (10, 30), FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
# add spatio-temporal action detection results
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
if not pose_results:
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, str(score[k])])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
parser.add_argument(
'--rgb-stdet-config',
default=('configs/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py'),
help='rgb-based spatio temporal detection config file path')
parser.add_argument(
'--rgb-stdet-checkpoint',
default=('https://download.openmmlab.com/mmaction/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb'
'_20201217-16378594.pth'),
help='rgb-based spatio temporal detection checkpoint file/url')
parser.add_argument(
'--skeleton-stdet-checkpoint',
default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'posec3d_ava.pth'),
help='skeleton-based spatio temporal detection checkpoint file/url')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/'
'faster_rcnn/faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--pose-config',
default='demo/hrnet_w32_coco_256x192.py',
help='human pose estimation config file path (from mmpose)')
parser.add_argument(
'--pose-checkpoint',
default=('https://download.openmmlab.com/mmpose/top_down/hrnet/'
'hrnet_w32_coco_256x192-c78dce93_20200708.pth'),
help='human pose estimation checkpoint file/url')
parser.add_argument(
'--skeleton-config',
default='configs/skeleton/posec3d/'
'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py',
help='skeleton-based action recognition config file path')
parser.add_argument(
'--skeleton-checkpoint',
default='https://download.openmmlab.com/mmaction/skeleton/posec3d/'
'posec3d_k400.pth',
help='skeleton-based action recognition checkpoint file/url')
parser.add_argument(
'--rgb-config',
default='configs/recognition/tsn/'
'tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py',
help='rgb-based action recognition config file path')
parser.add_argument(
'--rgb-checkpoint',
default='https://download.openmmlab.com/mmaction/recognition/'
'tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/'
'tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth',
help='rgb-based action recognition checkpoint file/url')
parser.add_argument(
'--use-skeleton-stdet',
action='store_true',
help='use skeleton-based spatio temporal detection method')
parser.add_argument(
'--use-skeleton-recog',
action='store_true',
help='use skeleton-based action recognition method')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--action-score-thr',
type=float,
default=0.4,
help='the threshold of action prediction score')
parser.add_argument(
'--video',
default='demo/test_video_structuralize.mp4',
help='video file/url')
parser.add_argument(
'--label-map-stdet',
default='tools/data/ava/label_map.txt',
help='label map file for spatio-temporal action detection')
parser.add_argument(
'--label-map',
default='tools/data/kinetics/label_map_k400.txt',
help='label map file for action recognition')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--out-filename',
default='demo/test_stdet_recognition_output.mp4',
help='output filename')
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a spatio-temporal detection prediction per n frames')
parser.add_argument(
'--output-stepsize',
default=1,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
parser.add_argument(
'--output-fps',
default=24,
type=int,
help='the fps of demo video output')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def frame_extraction(video_path):
"""Extract frames given video_path.
Args:
video_path (str): The video_path.
"""
# Load the video, extract frames into ./tmp/video_name
target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
# target_dir = osp.join('./tmp','spatial_skeleton_dir')
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
flag, frame = vid.read()
cnt = 0
while flag:
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
frame_paths.append(frame_path)
cv2.imwrite(frame_path, frame)
cnt += 1
flag, frame = vid.read()
return frame_paths, frames
def detection_inference(args, frame_paths):
"""Detect human boxes given frame paths.
Args:
args (argparse.Namespace): The arguments.
frame_paths (list[str]): The paths of frames to do detection inference.
Returns:
list[np.ndarray]: The human detection results.
"""
model = init_detector(args.det_config, args.det_checkpoint, args.device)
assert model.CLASSES[0] == 'person', ('We require you to use a detector '
'trained on COCO')
results = []
print('Performing Human Detection for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for frame_path in frame_paths:
result = inference_detector(model, frame_path)
# We only keep human detections with score larger than det_score_thr
result = result[0][result[0][:, 4] >= args.det_score_thr]
results.append(result)
prog_bar.update()
return results
def pose_inference(args, frame_paths, det_results):
    """Run top-down pose estimation on the detected boxes of every frame."""
model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
ret = []
print('Performing Human Pose Estimation for each frame')
prog_bar = mmcv.ProgressBar(len(frame_paths))
for f, d in zip(frame_paths, det_results):
# Align input format
d = [dict(bbox=x) for x in list(d)]
pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
ret.append(pose)
prog_bar.update()
return ret
def load_label_map(file_path):
"""Load Label Map.
Args:
file_path (str): The file path of label map.
Returns:
dict: The label map (int -> label name).
"""
lines = open(file_path).readlines()
lines = [x.strip().split(': ') for x in lines]
return {int(x[0]): x[1] for x in lines}
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
def pack_result(human_detection, result, img_h, img_w):
"""Short summary.
Args:
human_detection (np.ndarray): Human detection result.
result (type): The predicted label of each human proposal.
img_h (int): The image height.
img_w (int): The image width.
Returns:
tuple: Tuple of human proposal, label name and label score.
"""
human_detection[:, 0::2] /= img_w
human_detection[:, 1::2] /= img_h
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append(
(prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
for x in res]))
return results
def expand_bbox(bbox, h, w, ratio=1.25):
    """Expand a box around its centre to a square scaled by `ratio`, clipped
    to the image boundary."""
x1, y1, x2, y2 = bbox
center_x = (x1 + x2) // 2
center_y = (y1 + y2) // 2
width = x2 - x1
height = y2 - y1
square_l = max(width, height)
new_width = new_height = square_l * ratio
new_x1 = max(0, int(center_x - new_width / 2))
new_x2 = min(int(center_x + new_width / 2), w)
new_y1 = max(0, int(center_y - new_height / 2))
new_y2 = min(int(center_y + new_height / 2), h)
return (new_x1, new_y1, new_x2, new_y2)
def cal_iou(box1, box2):
    """Compute the IoU of two boxes in (x1, y1, x2, y2) format."""
xmin1, ymin1, xmax1, ymax1 = box1
xmin2, ymin2, xmax2, ymax2 = box2
s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
s2 = (xmax2 - xmin2) * (ymax2 - ymin2)
xmin = max(xmin1, xmin2)
ymin = max(ymin1, ymin2)
xmax = min(xmax1, xmax2)
ymax = min(ymax1, ymax2)
w = max(0, xmax - xmin)
h = max(0, ymax - ymin)
intersect = w * h
union = s1 + s2 - intersect
iou = intersect / union
return iou
def skeleton_based_action_recognition(args, pose_results, num_frame, h, w):
    """Recognize the action of the whole video with a skeleton-based model."""
fake_anno = dict(
frame_dict='',
label=-1,
img_shape=(h, w),
origin_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = max([len(x) for x in pose_results])
num_keypoint = 17
keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
dtype=np.float16)
keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
dtype=np.float16)
for i, poses in enumerate(pose_results):
for j, pose in enumerate(poses):
pose = pose['keypoints']
keypoint[j, i] = pose[:, :2]
keypoint_score[j, i] = pose[:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
label_map = [x.strip() for x in open(args.label_map).readlines()]
num_class = len(label_map)
skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
skeleton_config.model.cls_head.num_classes = num_class # for K400 dataset
skeleton_pipeline = Compose(skeleton_config.test_pipeline)
skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
skeleton_imgs = skeleton_imgs.to(args.device)
# Build skeleton-based recognition model
skeleton_model = build_model(skeleton_config.model)
load_checkpoint(
skeleton_model, args.skeleton_checkpoint, map_location='cpu')
skeleton_model.to(args.device)
skeleton_model.eval()
with torch.no_grad():
output = skeleton_model(return_loss=False, imgs=skeleton_imgs)
action_idx = np.argmax(output)
skeleton_action_result = label_map[
action_idx] # skeleton-based action result for the whole video
return skeleton_action_result
def rgb_based_action_recognition(args):
    """Recognize the action of the whole video with an RGB-based model."""
rgb_config = mmcv.Config.fromfile(args.rgb_config)
rgb_config.model.backbone.pretrained = None
rgb_model = build_recognizer(
rgb_config.model, test_cfg=rgb_config.get('test_cfg'))
load_checkpoint(rgb_model, args.rgb_checkpoint, map_location='cpu')
rgb_model.cfg = rgb_config
rgb_model.to(args.device)
rgb_model.eval()
action_results = inference_recognizer(
rgb_model, args.video, label_path=args.label_map)
rgb_action_result = action_results[0][0]
label_map = [x.strip() for x in open(args.label_map).readlines()]
return label_map[rgb_action_result]
def skeleton_based_stdet(args, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    """Run skeleton-based spatio-temporal action detection clip by clip."""
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
num_class = max(label_map.keys()) + 1 # for AVA dataset (81)
skeleton_config.model.cls_head.num_classes = num_class
skeleton_pipeline = Compose(skeleton_config.test_pipeline)
skeleton_stdet_model = build_model(skeleton_config.model)
load_checkpoint(
skeleton_stdet_model,
args.skeleton_stdet_checkpoint,
map_location='cpu')
skeleton_stdet_model.to(args.device)
skeleton_stdet_model.eval()
skeleton_predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp in timestamps:
proposal = human_detections[timestamp - 1]
if proposal.shape[0] == 0: # no people detected
skeleton_predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
num_frame = len(frame_inds) # 30
pose_result = [pose_results[ind] for ind in frame_inds]
skeleton_prediction = []
for i in range(proposal.shape[0]): # num_person
skeleton_prediction.append([])
fake_anno = dict(
frame_dict='',
label=-1,
img_shape=(h, w),
origin_shape=(h, w),
start_index=0,
modality='Pose',
total_frames=num_frame)
num_person = 1
num_keypoint = 17
keypoint = np.zeros(
(num_person, num_frame, num_keypoint, 2)) # M T V 2
keypoint_score = np.zeros(
(num_person, num_frame, num_keypoint)) # M T V
# pose matching
person_bbox = proposal[i][:4]
area = expand_bbox(person_bbox, h, w)
for j, poses in enumerate(pose_result): # num_frame
max_iou = float('-inf')
index = -1
if len(poses) == 0:
continue
for k, per_pose in enumerate(poses):
iou = cal_iou(per_pose['bbox'][:4], area)
if max_iou < iou:
index = k
max_iou = iou
keypoint[0, j] = poses[index]['keypoints'][:, :2]
keypoint_score[0, j] = poses[index]['keypoints'][:, 2]
fake_anno['keypoint'] = keypoint
fake_anno['keypoint_score'] = keypoint_score
skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
skeleton_imgs = skeleton_imgs.to(args.device)
with torch.no_grad():
output = skeleton_stdet_model(
return_loss=False, imgs=skeleton_imgs)
output = output[0]
for k in range(len(output)): # 81
if k not in label_map:
continue
if output[k] > args.action_score_thr:
skeleton_prediction[i].append(
(label_map[k], output[k]))
skeleton_predictions.append(skeleton_prediction)
prog_bar.update()
return timestamps, skeleton_predictions
def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):
    """Run RGB-based spatio-temporal action detection clip by clip."""
rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
rgb_stdet_config.merge_from_dict(args.cfg_options)
val_pipeline = rgb_stdet_config.data.val.pipeline
sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
window_size = clip_len * frame_interval
num_frame = len(frames)
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
# Get img_norm_cfg
img_norm_cfg = rgb_stdet_config['img_norm_cfg']
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
to_bgr = img_norm_cfg.pop('to_bgr')
img_norm_cfg['to_rgb'] = to_bgr
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
# Build STDET model
try:
# In our spatiotemporal detection demo, different actions should have
# the same number of bboxes.
rgb_stdet_config['model']['test_cfg']['rcnn']['action_thr'] = .0
except KeyError:
pass
rgb_stdet_config.model.backbone.pretrained = None
rgb_stdet_model = build_detector(
rgb_stdet_config.model, test_cfg=rgb_stdet_config.get('test_cfg'))
load_checkpoint(
rgb_stdet_model, args.rgb_stdet_checkpoint, map_location='cpu')
rgb_stdet_model.to(args.device)
rgb_stdet_model.eval()
predictions = []
print('Performing SpatioTemporal Action Detection for each clip')
prog_bar = mmcv.ProgressBar(len(timestamps))
for timestamp in timestamps:
proposal = human_detections[timestamp - 1]
if proposal.shape[0] == 0:
predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
_ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
# THWC -> CTHW -> 1CTHW
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(args.device)
with torch.no_grad():
result = rgb_stdet_model(
return_loss=False,
img=[input_tensor],
img_metas=[[dict(img_shape=(new_h, new_w))]],
proposals=[[proposal]])
result = result[0]
prediction = []
# N proposals
for i in range(proposal.shape[0]):
prediction.append([])
# Perform action score thr
for i in range(len(result)): # 80
if i + 1 not in label_map:
continue
for j in range(proposal.shape[0]):
if result[i][j, 4] > args.action_score_thr:
prediction[j].append((label_map[i + 1], result[i][j,
4]))
predictions.append(prediction)
prog_bar.update()
return timestamps, predictions
def main():
args = parse_args()
frame_paths, original_frames = frame_extraction(args.video)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# Get Human detection results and pose results
human_detections = detection_inference(args, frame_paths)
pose_results = None
if args.use_skeleton_recog or args.use_skeleton_stdet:
pose_results = pose_inference(args, frame_paths, human_detections)
# resize frames to shortside 256
new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h
# Load spatio-temporal detection label_map
stdet_label_map = load_label_map(args.label_map_stdet)
rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
rgb_stdet_config.merge_from_dict(args.cfg_options)
try:
if rgb_stdet_config['data']['train']['custom_classes'] is not None:
stdet_label_map = {
id + 1: stdet_label_map[cls]
for id, cls in enumerate(rgb_stdet_config['data']['train']
['custom_classes'])
}
except KeyError:
pass
action_result = None
if args.use_skeleton_recog:
print('Use skeleton-based recognition')
action_result = skeleton_based_action_recognition(
args, pose_results, num_frame, h, w)
else:
print('Use rgb-based recognition')
action_result = rgb_based_action_recognition(args)
stdet_preds = None
if args.use_skeleton_stdet:
print('Use skeleton-based SpatioTemporal Action Detection')
clip_len, frame_interval = 30, 1
timestamps, stdet_preds = skeleton_based_stdet(args, stdet_label_map,
human_detections,
pose_results, num_frame,
clip_len,
frame_interval, h, w)
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
else:
print('Use rgb-based SpatioTemporal Action Detection')
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
timestamps, stdet_preds = rgb_based_stdet(args, frames,
stdet_label_map,
human_detections, w, h,
new_w, new_h, w_ratio,
h_ratio)
stdet_results = []
for timestamp, prediction in zip(timestamps, stdet_preds):
human_detection = human_detections[timestamp - 1]
stdet_results.append(
pack_result(human_detection, prediction, new_h, new_w))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
output_timestamps = dense_timestamps(timestamps, dense_n)
frames = [
cv2.imread(frame_paths[timestamp - 1])
for timestamp in output_timestamps
]
print('Performing visualization')
pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
args.device)
if args.use_skeleton_recog or args.use_skeleton_stdet:
pose_results = [
pose_results[timestamp - 1] for timestamp in output_timestamps
]
vis_frames = visualize(frames, stdet_results, pose_results, action_result,
pose_model)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
tmp_frame_dir = osp.dirname(frame_paths[0])
shutil.rmtree(tmp_frame_dir)
if __name__ == '__main__':
main()
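# A small sketch (synthetic boxes, no real detections) of the pose-matching
# step inside `skeleton_based_stdet` above: each person proposal is expanded
# to a square region, and for every frame the pose whose box has the highest
# IoU with that region is assigned to the person.
def _demo_pose_matching():
    person_bbox = (40, 60, 120, 200)       # x1, y1, x2, y2 of one proposal
    area = expand_bbox(person_bbox, h=480, w=640)
    candidate_poses = [
        dict(bbox=(35, 55, 125, 205)),     # likely the same person
        dict(bbox=(400, 100, 480, 300)),   # someone else in the frame
    ]
    best_index, best_iou = -1, float('-inf')
    for k, pose in enumerate(candidate_poses):
        iou = cal_iou(pose['bbox'][:4], area)
        if iou > best_iou:
            best_index, best_iou = k, iou
    return best_index, best_iou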
# Copyright (c) OpenMMLab. All rights reserved.
# model config
model = dict(
type='FasterRCNN',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)),
test_cfg=dict(
rpn=dict(
nms_pre=1000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100)))
# dataset config
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')
# Schedule
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[16, 22])
total_epochs = 24
# runtime
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
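# Usage note (kept as comments so that `Config.fromfile` does not pick up any
# extra keys): this detection config is consumed by the demos above through
# `init_detector`. A minimal sketch, assuming a locally downloaded checkpoint
# file and an arbitrary test image (both paths are placeholders):
#
#     from mmdet.apis import inference_detector, init_detector
#     det_model = init_detector('demo/faster_rcnn_r50_fpn_2x_coco.py',
#                               'faster_rcnn_r50_fpn_2x_coco.pth',
#                               device='cuda:0')
#     det_result = inference_detector(det_model, 'test_frame.jpg')
#     person_bboxes = det_result[0]  # class 0 of COCO is 'person'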
jf7RDuUTrsQ 300 325
JTlatknwOrY 301 233
8UxlDNur-Z0 300 262
y9r115bgfNk 300 320
ZnIDviwA8CE 300 244
c8ln_nWYMyM 300 333
9GFfKVeoGm0 300 98
F5Y_gGsg4x8 300 193
AuqIu3x_lhY 300 36
1Hi5GMotrjs 300 26
czhL0iDbNT8 300 46
DYpTE_n-Wvk 177 208
R-xmgefs-M4 300 101
KPP2qRzMdos 300 131
PmgfU9ocx5A 300 193
GI7nIyMEQi4 300 173
A8TIWMvJVDU 300 72
ustVqWMM56c 300 289
03dk7mneDU0 300 254
jqkyelS4GJk 300 279
a58tBGuDIg0 231 382
5l1ajLjqaPo 300 226
-5wLopwbGX0 300 132
NUG7kwJ-614 300 103
wHUvw_R2iv8 300 97
44Mak5_s6Fk 300 256
y5vsk8Mj-3w 300 77
TEj_A_BC-aU 300 393
fUdu6hpMt_c 299 40
C5Z1sRArUR0 300 254
-orecnYvpNw 300 284
Urmbp1ulIXI 300 319
bLgdi4w7OAk 299 36
cVv_XMw4W2U 300 27
dV8JmKwDUzM 300 312
yZ9hIqW4bRc 300 239
9ykbMdR9Jss 213 257
G8fEnqIOkiA 300 158
6P2eVJ-Qp1g 300 131
Y-acp_jXG1Q 302 315
xthWPdx21r8 301 62
LExCUx4STW0 300 9
p2UMwzWsY0U 300 248
c0UI7f3Plro 300 383
1MmjE51PeIE 300 93
OU5dJpNHATk 300 342
38Uv6dbQkWc 281 44
5ZNdkbmv274 300 59
DrSL3Uddj6s 300 283
aNJ1-bvRox8 175 384
b5U7A_crvE0 194 377
xeWO9Bl9aWA 300 86
Zy8Ta83mrXo 300 223
AXnDRH7o2DQ 300 146
fTPDXmcygjw 300 11
EhRxb8-cNzQ 164 325
iO8RYYQzNiE 299 191
XbCncZcXuTI 300 55
pSCunaRn45A 300 265
UqI--TBQRgg 300 165
yD42KW6cm-A 300 186
VseX7hoxhbM 300 61
1FEcfy-moBM 300 8
BUT8oefH9Nw 300 120
-49tMSUTnZg 300 227
cZKPTt_FcFs 300 85
fiKJm0eavfw 300 323
gJcVljRRxGE 302 87
de1rSoht9t4 300 253
UAIJnI7fQYo 300 284
c4eIDxmVmCw 300 95
3LGce3efz7M 300 332
EC8iyn_q-NM 300 92
eo15donXwmY 300 351
NsG31u7Pd2Q 300 87
ILkPWpZYlPE 300 137
n5ZHSJRZl1U 300 338
UoQE44FEqLQ 300 260
5I-4meP_5wY 300 185
udLMOf77S3U 300 209
a4Ye18Mnblk 262 172
QbDMgHWwt_s 236 395
S6iAYBBMnwk 300 267
DNMfmNV8Uug 300 131
AJdp07pp43c 300 293
tVuop87KbDY 300 103
o79s5eOAF-c 300 246
dMt_nuBNdeY 300 168
RJU9NV1R4Fw 300 128
Zhux7Vy-hHc 300 82
47Cj6jwQKjo 300 228
a7Mc-0lwAuE 300 129
taZtEzvkg3M 300 264
bVDZohQJhBI 240 129
sBJk5li0O5o 216 154
DQUNZmbQI_g 300 29
-zpKHNrNsn4 300 244
Dcz0r8q-sx0 300 249
hfRKTH9pOMA 165 116
8CdUbOHDtes 300 222
# Copyright (c) OpenMMLab. All rights reserved.
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(interval=10, metric='mAP', key_indicator='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = dict(
type='TopDown',
pretrained='https://download.openmmlab.com/mmpose/'
'pretrain_models/hrnet_w32-36af842e.pth',
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256))),
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
data = dict(
samples_per_gpu=64,
workers_per_gpu=2,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
train=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline),
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline),
)
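# Usage note (kept as comments so that the config dict stays clean): the
# skeleton demos above load this file through `init_pose_model` and run
# per-frame top-down inference on the detected person boxes. A rough sketch,
# with placeholder paths and a fabricated detection box:
#
#     from mmpose.apis import inference_top_down_pose_model, init_pose_model
#     pose_model = init_pose_model('demo/hrnet_w32_coco_256x192.py',
#                                  'hrnet_w32_coco_256x192.pth',
#                                  device='cuda:0')
#     person_results = [dict(bbox=(10, 20, 200, 400, 0.98))]
#     pose_results, _ = inference_top_down_pose_model(
#         pose_model, 'frame_000001.jpg', person_results, format='xyxy')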
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import random
from collections import deque
from operator import itemgetter
import cv2
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.parallel import collate, scatter
from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL
FONTSCALE = 1
THICKNESS = 1
LINETYPE = 1
EXCLUED_STEPS = [
'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit',
'PyAVDecode', 'RawFrameDecode'
]
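# Frames are read and decoded with OpenCV below, so pipeline steps that open or
# decode the raw video/frames must be stripped from the test pipeline.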
def parse_args():
parser = argparse.ArgumentParser(
        description='MMAction2 demo: predict labels in a long video')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file/url')
parser.add_argument('video_path', help='video file/url')
parser.add_argument('label', help='label file')
    parser.add_argument('out_file', help='output result file (video or json)')
parser.add_argument(
'--input-step',
type=int,
default=1,
help='input step for sampling frames')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--threshold',
type=float,
default=0.01,
help='recognition score threshold')
parser.add_argument(
'--stride',
type=float,
default=0,
        help=('the prediction stride equals stride * sample_length '
              '(sample_length is the size of the temporal window from '
              'which frames are sampled, i.e. clip_len x frame_interval); '
              'if set to 0, the prediction stride is 1'))
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
        help='override some settings in the used config; key-value pairs '
        'in xxx=yyy format will be merged into the config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--label-color',
nargs='+',
type=int,
default=(255, 255, 255),
help='font color (B, G, R) of the labels in output video')
parser.add_argument(
'--msg-color',
nargs='+',
type=int,
default=(128, 128, 128),
help='font color (B, G, R) of the messages in output video')
args = parser.parse_args()
return args
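# Overlay the latest recognition results (label: score pairs above the
# threshold) on the current frame and write the frame to the output video.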
def show_results_video(result_queue,
text_info,
thr,
msg,
frame,
video_writer,
label_color=(255, 255, 255),
msg_color=(128, 128, 128)):
if len(result_queue) != 0:
text_info = {}
results = result_queue.popleft()
for i, result in enumerate(results):
selected_label, score = result
if score < thr:
break
location = (0, 40 + i * 20)
text = selected_label + ': ' + str(round(score, 2))
text_info[location] = text
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
label_color, THICKNESS, LINETYPE)
elif len(text_info):
for location, text in text_info.items():
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
label_color, THICKNESS, LINETYPE)
else:
cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, msg_color,
THICKNESS, LINETYPE)
video_writer.write(frame)
return text_info
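# Store the latest recognition results for the current frame index in the JSON
# output dict instead of drawing them on video frames.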
def get_results_json(result_queue, text_info, thr, msg, ind, out_json):
if len(result_queue) != 0:
text_info = {}
results = result_queue.popleft()
for i, result in enumerate(results):
selected_label, score = result
if score < thr:
break
text_info[i + 1] = selected_label + ': ' + str(round(score, 2))
out_json[ind] = text_info
elif len(text_info):
out_json[ind] = text_info
else:
out_json[ind] = msg
return text_info, out_json
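# Main loop: read the video frame by frame; after an initial window of
# ``sample_length`` frames, sample one frame out of every ``input_step`` frames
# into a sliding window, run inference on the window, and write the predictions
# either onto the output video or into a JSON file.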
def show_results(model, data, label, args):
frame_queue = deque(maxlen=args.sample_length)
result_queue = deque(maxlen=1)
cap = cv2.VideoCapture(args.video_path)
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
msg = 'Preparing action recognition ...'
text_info = {}
out_json = {}
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
frame_size = (frame_width, frame_height)
ind = 0
video_writer = None if args.out_file.endswith('.json') \
else cv2.VideoWriter(args.out_file, fourcc, fps, frame_size)
prog_bar = mmcv.ProgressBar(num_frames)
backup_frames = []
while ind < num_frames:
ind += 1
prog_bar.update()
ret, frame = cap.read()
if frame is None:
            # drop the frame when the capture returns None
continue
backup_frames.append(np.array(frame)[:, :, ::-1])
if ind == args.sample_length:
            # fill the whole window at once so the first result shows up early
frame_queue.extend(backup_frames)
backup_frames = []
elif ((len(backup_frames) == args.input_step
and ind > args.sample_length) or ind == num_frames):
            # pick one frame at random from the backup once it holds
            # ``input_step`` frames or the last frame has been reached
chosen_frame = random.choice(backup_frames)
backup_frames = []
frame_queue.append(chosen_frame)
ret, scores = inference(model, data, args, frame_queue)
if ret:
num_selected_labels = min(len(label), 5)
scores_tuples = tuple(zip(label, scores))
scores_sorted = sorted(
scores_tuples, key=itemgetter(1), reverse=True)
results = scores_sorted[:num_selected_labels]
result_queue.append(results)
if args.out_file.endswith('.json'):
text_info, out_json = get_results_json(result_queue, text_info,
args.threshold, msg, ind,
out_json)
else:
text_info = show_results_video(result_queue, text_info,
args.threshold, msg, frame,
video_writer, args.label_color,
args.msg_color)
cap.release()
cv2.destroyAllWindows()
if args.out_file.endswith('.json'):
with open(args.out_file, 'w') as js:
json.dump(out_json, js)
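# Run the recognizer on the current window: push the buffered frames through
# the decode-free test pipeline, forward the model, and pop
# ``stride * sample_length`` frames so the next window starts at the right
# position.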
def inference(model, data, args, frame_queue):
if len(frame_queue) != args.sample_length:
        # do not run inference until enough frames are buffered
return False, None
cur_windows = list(np.array(frame_queue))
if data['img_shape'] is None:
data['img_shape'] = frame_queue[0].shape[:2]
cur_data = data.copy()
cur_data['imgs'] = cur_windows
cur_data = args.test_pipeline(cur_data)
cur_data = collate([cur_data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
cur_data = scatter(cur_data, [args.device])[0]
with torch.no_grad():
scores = model(return_loss=False, **cur_data)[0]
if args.stride > 0:
pred_stride = int(args.sample_length * args.stride)
for _ in range(pred_stride):
frame_queue.popleft()
    # when ``args.stride == 0`` the deque (maxlen=sample_length) drops the
    # oldest frame automatically on the next append
return True, scores
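# Entry point: build the recognizer, read the label map, and rebuild the test
# pipeline without frame sampling/decoding steps, since frames are supplied by
# the capture loop in ``show_results``.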
def main():
args = parse_args()
args.device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
model = init_recognizer(cfg, args.checkpoint, device=args.device)
data = dict(img_shape=None, modality='RGB', label=-1)
with open(args.label, 'r') as f:
label = [line.strip() for line in f]
    # build the test pipeline from the config, dropping sampling/decoding steps
cfg = model.cfg
sample_length = 0
pipeline = cfg.data.test.pipeline
pipeline_ = pipeline.copy()
for step in pipeline:
if 'SampleFrames' in step['type']:
sample_length = step['clip_len'] * step['num_clips']
data['num_clips'] = step['num_clips']
data['clip_len'] = step['clip_len']
pipeline_.remove(step)
if step['type'] in EXCLUED_STEPS:
            # remove steps that open or decode raw frames
pipeline_.remove(step)
test_pipeline = Compose(pipeline_)
assert sample_length > 0
args.sample_length = sample_length
args.test_pipeline = test_pipeline
show_results(model, data, label, args)
if __name__ == '__main__':
main()
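# Example invocation (all file names below are placeholders; pass a ``.json``
# output file to get JSON instead of a rendered video):
#   python this_demo.py CONFIG.py CHECKPOINT.pth VIDEO.mp4 LABEL_MAP.txt \
#       OUT.mp4 --input-step 3 --threshold 0.2 --stride 0.5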