Commit 76ccaa54 authored by unknown

Add mmaction2 test cases

parent 44c28b2b
#!/usr/bin/env python
# Copyright (c) OpenMMLab. All rights reserved.
import functools as func
import glob
import re
from os.path import basename, splitext
import numpy as np
import titlecase
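# Generate documentation summary pages (modelzoo.md and datasets.md) by scanning
# the *_models.md and supported_datasets.md docs for papers, configs and checkpoints.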
def anchor(name):
return re.sub(r'-+', '-', re.sub(r'[^a-zA-Z0-9]', '-',
name.strip().lower())).strip('-')
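# Example: anchor('Action Recognition Models') -> 'action-recognition-models'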
# Count algorithms
files = sorted(glob.glob('*_models.md'))
# files = sorted(glob.glob('docs/*_models.md'))
stats = []
for f in files:
with open(f, 'r') as content_file:
content = content_file.read()
# title
title = content.split('\n')[0].replace('#', '')
# skip IMAGE and ABSTRACT tags
content = [
x for x in content.split('\n')
if 'IMAGE' not in x and 'ABSTRACT' not in x
]
content = '\n'.join(content)
# count papers
papers = set(
(papertype, titlecase.titlecase(paper.lower().strip()))
for (papertype, paper) in re.findall(
r'<!--\s*\[([A-Z]*?)\]\s*-->\s*\n.*?\btitle\s*=\s*{(.*?)}',
content, re.DOTALL))
# paper links
revcontent = '\n'.join(list(reversed(content.splitlines())))
paperlinks = {}
for _, p in papers:
print(p)
q = p.replace('\\', '\\\\').replace('?', '\\?')
paperlinks[p] = ' '.join(
(f'[->]({splitext(basename(f))[0]}.html#{anchor(paperlink)})'
for paperlink in re.findall(
rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
revcontent, re.DOTALL | re.IGNORECASE)))
print(' ', paperlinks[p])
paperlist = '\n'.join(
sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
# count configs
configs = set(x.lower().strip()
for x in re.findall(r'https.*configs/.*\.py', content))
# count ckpts
ckpts = set(x.lower().strip()
for x in re.findall(r'https://download.*\.pth', content)
if 'mmaction' in x)
statsmsg = f"""
## [{title}]({f})
* Number of checkpoints: {len(ckpts)}
* Number of configs: {len(configs)}
* Number of papers: {len(papers)}
{paperlist}
"""
stats.append((papers, configs, ckpts, statsmsg))
allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _, _ in stats])
allconfigs = func.reduce(lambda a, b: a.union(b), [c for _, c, _, _ in stats])
allckpts = func.reduce(lambda a, b: a.union(b), [c for _, _, c, _ in stats])
msglist = '\n'.join(x for _, _, _, x in stats)
papertypes, papercounts = np.unique([t for t, _ in allpapers],
return_counts=True)
countstr = '\n'.join(
[f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
modelzoo = f"""
# Overview
* Number of checkpoints: {len(allckpts)}
* Number of configs: {len(allconfigs)}
* Number of papers: {len(allpapers)}
{countstr}
For supported datasets, see [datasets overview](datasets.md).
{msglist}
"""
with open('modelzoo.md', 'w') as f:
f.write(modelzoo)
# Count datasets
files = ['supported_datasets.md']
# files = sorted(glob.glob('docs/tasks/*.md'))
datastats = []
for f in files:
with open(f, 'r') as content_file:
content = content_file.read()
# title
title = content.split('\n')[0].replace('#', '')
# count papers
papers = set(
(papertype, titlecase.titlecase(paper.lower().strip()))
for (papertype, paper) in re.findall(
r'<!--\s*\[([A-Z]*?)\]\s*-->\s*\n.*?\btitle\s*=\s*{(.*?)}',
content, re.DOTALL))
# paper links
revcontent = '\n'.join(list(reversed(content.splitlines())))
paperlinks = {}
for _, p in papers:
print(p)
q = p.replace('\\', '\\\\').replace('?', '\\?')
paperlinks[p] = ', '.join(
(f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})'
for p in re.findall(
rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
revcontent, re.DOTALL | re.IGNORECASE)))
print(' ', paperlinks[p])
paperlist = '\n'.join(
sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
statsmsg = f"""
## [{title}]({f})
* Number of papers: {len(papers)}
{paperlist}
"""
datastats.append((papers, configs, ckpts, statsmsg))
alldatapapers = func.reduce(lambda a, b: a.union(b),
[p for p, _, _, _ in datastats])
# Summarize
msglist = '\n'.join(x for _, _, _, x in stats)
datamsglist = '\n'.join(x for _, _, _, x in datastats)
papertypes, papercounts = np.unique([t for t, _ in alldatapapers],
return_counts=True)
countstr = '\n'.join(
[f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
modelzoo = f"""
# Overview
* Number of papers: {len(alldatapapers)}
{countstr}
For supported action algorithms, see [modelzoo overview](modelzoo.md).
{datamsglist}
"""
with open('datasets.md', 'w') as f:
f.write(modelzoo)
# Supported Datasets
- Action Recognition
- [UCF101](/tools/data/ucf101/README.md) \[ [Homepage](https://www.crcv.ucf.edu/research/data-sets/ucf101/) \].
- [HMDB51](/tools/data/hmdb51/README.md) \[ [Homepage](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) \].
- [Kinetics-\[400/600/700\]](/tools/data/kinetics/README.md) \[ [Homepage](https://deepmind.com/research/open-source/kinetics) \]
- [Something-Something V1](/tools/data/sthv1/README.md) \[ [Homepage](https://20bn.com/datasets/something-something/v1) \]
- [Something-Something V2](/tools/data/sthv2/README.md) \[ [Homepage](https://20bn.com/datasets/something-something) \]
- [Moments in Time](/tools/data/mit/README.md) \[ [Homepage](http://moments.csail.mit.edu/) \]
- [Multi-Moments in Time](/tools/data/mmit/README.md) \[ [Homepage](http://moments.csail.mit.edu/challenge_iccv_2019.html) \]
- [HVU](/tools/data/hvu/README.md) \[ [Homepage](https://github.com/holistic-video-understanding/HVU-Dataset) \]
- [Jester](/tools/data/jester/README.md) \[ [Homepage](https://20bn.com/datasets/jester/v1) \]
- [GYM](/tools/data/gym/README.md) \[ [Homepage](https://sdolivia.github.io/FineGym/) \]
- [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
- [Diving48](/tools/data/diving48/README.md) \[ [Homepage](http://www.svcl.ucsd.edu/projects/resound/dataset.html) \]
- [OmniSource](/tools/data/omnisource/README.md) \[ [Homepage](https://kennymckormick.github.io/omnisource/) \]
- Temporal Action Detection
- [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
- [THUMOS14](/tools/data/thumos14/README.md) \[ [Homepage](https://www.crcv.ucf.edu/THUMOS14/download.html) \]
- Spatial Temporal Action Detection
- [AVA](/tools/data/ava/README.md) \[ [Homepage](https://research.google.com/ava/index.html) \]
- [UCF101-24](/tools/data/ucf101_24/README.md) \[ [Homepage](http://www.thumos.info/download.html) \]
- [JHMDB](/tools/data/jhmdb/README.md) \[ [Homepage](http://jhmdb.is.tue.mpg.de/) \]
- Skeleton-based Action Recognition
- [PoseC3D Skeleton Dataset](/tools/data/skeleton/README.md) \[ [Homepage](https://kennymckormick.github.io/posec3d/) \]
The supported datasets are listed above.
We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`.
Below are detailed tutorials on data preparation for each dataset.
## <a href='https://mmaction2.readthedocs.io/en/latest/'>English</a>
## <a href='https://mmaction2.readthedocs.io/zh_CN/latest/'>简体中文</a>
# Tutorial 1: Learn about Configs
We use Python files as configs and incorporate modular and inheritance design into our config system, which makes it convenient to conduct various experiments.
You can find all the provided configs under `$MMACTION2/configs`. If you wish to inspect a config file,
you may run `python tools/analysis/print_config.py /PATH/TO/CONFIG` to see the complete config.
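If you prefer to do this programmatically, here is a minimal sketch using the `mmcv.Config` API (assuming `mmcv` is installed; the config path below is only an example):

```python
from mmcv import Config

# Load a config file and print the fully merged result, similar to
# what tools/analysis/print_config.py does.
cfg = Config.fromfile('configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py')
print(cfg.pretty_text)
```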
<!-- TOC -->
- [Modify config through script arguments](#modify-config-through-script-arguments)
- [Config File Structure](#config-file-structure)
- [Config File Naming Convention](#config-file-naming-convention)
- [Config System for Action Localization](#config-system-for-action-localization)
- [Config System for Action Recognition](#config-system-for-action-recognition)
- [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection)
- [FAQ](#faq)
- [Use intermediate variables in configs](#use-intermediate-variables-in-configs)
<!-- TOC -->
## Modify config through script arguments
When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to modify the config in place; a sketch of how these options are merged is shown after the list below.
- Update config keys of dict.
The config options can be specified following the order of the dict keys in the original config.
For example, `--cfg-options model.backbone.norm_eval=False` changes all BN modules in the model backbone to `train` mode.
- Update keys inside a list of configs.
Some config dicts are composed as a list in your config. For example, the training pipeline `data.train.pipeline` is normally a list
e.g. `[dict(type='SampleFrames'), ...]`. If you want to change `'SampleFrames'` to `'DenseSampleFrames'` in the pipeline,
you may specify `--cfg-options data.train.pipeline.0.type=DenseSampleFrames`.
- Update values of lists/tuples.
Some values in a config are lists or tuples. For example, the config file normally sets `workflow=[('train', 1)]`. If you want to
change this key, you may specify `--cfg-options workflow="[(train,1),(val,1)]"`. Note that the quotation marks " are necessary to
support list/tuple data types, and that **NO** whitespace is allowed inside the quotation marks in the specified value.
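Under the hood, these dotted options are merged into the loaded config before training starts. Here is a minimal sketch of that behaviour using `mmcv.Config.merge_from_dict` (the config path and option value are only illustrative):

```python
from mmcv import Config

# Roughly what --cfg-options model.backbone.norm_eval=False does:
# the dotted key is merged into the nested config dict.
cfg = Config.fromfile('configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py')
cfg.merge_from_dict({'model.backbone.norm_eval': False})
assert cfg.model.backbone.norm_eval is False
```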
## Config File Structure
There are 3 basic component types under `configs/_base_`: model, schedule, default_runtime.
Many methods, such as TSN, I3D and SlowOnly, can be easily constructed with one component of each type.
The configs that are composed of components from `_base_` are called _primitive_.
For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum inheritance level is 3.
For ease of understanding, we recommend contributors inherit from existing methods.
For example, if some modification is made based on TSN, users may first inherit the basic TSN structure by specifying `_base_ = ../tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py`, then modify the necessary fields in the config file.
If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder under `configs/TASK`.
Please refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html) for detailed documentation.
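As a sketch of this inheritance mechanism, a derived config only declares `_base_` and the fields it overrides (the file name and the overridden values below are hypothetical):

```python
# configs/recognition/tsn/my_tsn_ucf101_config.py (hypothetical file name)
_base_ = ['./tsn_r50_1x1x3_100e_kinetics400_rgb.py']

# Override only the fields that differ from the inherited config.
model = dict(cls_head=dict(num_classes=101))
total_epochs = 50
```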
## Config File Naming Convention
We follow the style below to name config files. Contributors are advised to follow the same style.
```
{model}_[model setting]_{backbone}_[misc]_{data setting}_[gpu x batch_per_gpu]_{schedule}_{dataset}_{modality}
```
`{xxx}` is a required field and `[yyy]` is optional.
- `{model}`: model type, e.g. `tsn`, `i3d`, etc.
- `[model setting]`: specific setting for some models.
- `{backbone}`: backbone type, e.g. `r50` (ResNet-50), etc.
- `[misc]`: miscellaneous setting/plugins of model, e.g. `dense`, `320p`, `video`, etc.
- `{data setting}`: frame sample setting in `{clip_len}x{frame_interval}x{num_clips}` format.
- `[gpu x batch_per_gpu]`: GPUs and samples per GPU.
- `{schedule}`: training schedule, e.g. `20e` means 20 epochs.
- `{dataset}`: dataset name, e.g. `kinetics400`, `mmit`, etc.
- `{modality}`: frame modality, e.g. `rgb`, `flow`, etc.
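For example, `tsn_r50_1x1x3_100e_kinetics400_rgb` denotes a TSN model with a ResNet-50 backbone, a `1x1x3` frame sampling setting (`clip_len` x `frame_interval` x `num_clips`), a 100-epoch schedule, trained on Kinetics-400 with RGB input.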
### Config System for Action Localization
We incorporate modular design into our config system,
which is convenient to conduct various experiments.
- An Example of BMN
To help users have a basic idea of a complete config structure and the modules in an action localization system,
we add brief comments to the config of BMN below.
For more detailed usage and alternatives for each parameter in each module, please refer to the [API documentation](https://mmaction2.readthedocs.io/en/latest/api.html).
```python
# model settings
model = dict( # Config of the model
type='BMN', # Type of the localizer
temporal_dim=100, # Total frames selected for each video
boundary_ratio=0.5, # Ratio for determining video boundaries
num_samples=32, # Number of samples for each proposal
num_samples_per_bin=3, # Number of bin samples for each sample
feat_dim=400, # Dimension of feature
soft_nms_alpha=0.4, # Soft NMS alpha
soft_nms_low_threshold=0.5, # Soft NMS low threshold
soft_nms_high_threshold=0.9, # Soft NMS high threshold
post_process_top_k=100) # Top k proposals in post process
# model training and testing settings
train_cfg = None # Config of training hyperparameters for BMN
test_cfg = dict(average_clips='score') # Config for testing hyperparameters for BMN
# dataset settings
dataset_type = 'ActivityNetDataset' # Type of dataset for training, validation and testing
data_root = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for training
data_root_val = 'data/activitynet_feature_cuhk/csv_mean_100/' # Root path to data for validation and testing
ann_file_train = 'data/ActivityNet/anet_anno_train.json' # Path to the annotation file for training
ann_file_val = 'data/ActivityNet/anet_anno_val.json' # Path to the annotation file for validation
ann_file_test = 'data/ActivityNet/anet_anno_test.json' # Path to the annotation file for testing
train_pipeline = [ # List of training pipeline steps
dict(type='LoadLocalizationFeature'), # Load localization feature pipeline
dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the localizer
keys=['raw_feature', 'gt_bbox'], # Keys of input
meta_name='video_meta', # Meta name
meta_keys=['video_name']), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['raw_feature']), # Keys to be converted from image to tensor
dict( # Config of ToDataContainer
type='ToDataContainer', # Pipeline to convert the data to DataContainer
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)]) # Required fields to be converted with keys and attributes
]
val_pipeline = [ # List of validation pipeline steps
dict(type='LoadLocalizationFeature'), # Load localization feature pipeline
dict(type='GenerateLocalizationLabels'), # Generate localization labels pipeline
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the localizer
keys=['raw_feature', 'gt_bbox'], # Keys of input
meta_name='video_meta', # Meta name
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['raw_feature']), # Keys to be converted from image to tensor
dict( # Config of ToDataContainer
type='ToDataContainer', # Pipeline to convert the data to DataContainer
fields=[dict(key='gt_bbox', stack=False, cpu_only=True)]) # Required fields to be converted with keys and attributes
]
test_pipeline = [ # List of testing pipeline steps
dict(type='LoadLocalizationFeature'), # Load localization feature pipeline
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the localizer
keys=['raw_feature'], # Keys of input
meta_name='video_meta', # Meta name
meta_keys=[
'video_name', 'duration_second', 'duration_frame', 'annotations',
'feature_frame'
]), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['raw_feature']), # Keys to be converted from image to tensor
]
data = dict( # Config of data
videos_per_gpu=8, # Batch size of each single GPU
workers_per_gpu=8, # Workers to pre-fetch data for each single GPU
train_dataloader=dict( # Additional config of train dataloader
drop_last=True), # Whether to drop out the last batch of data in training
val_dataloader=dict( # Additional config of validation dataloader
videos_per_gpu=1), # Batch size of each single GPU during evaluation
test_dataloader=dict( # Additional config of test dataloader
videos_per_gpu=2), # Batch size of each single GPU during testing
test=dict( # Testing dataset config
type=dataset_type,
ann_file=ann_file_test,
pipeline=test_pipeline,
data_prefix=data_root_val),
val=dict( # Validation dataset config
type=dataset_type,
ann_file=ann_file_val,
pipeline=val_pipeline,
data_prefix=data_root_val),
train=dict( # Training dataset config
type=dataset_type,
ann_file=ann_file_train,
pipeline=train_pipeline,
data_prefix=data_root))
# optimizer
optimizer = dict(
# Config used to build optimizer, support (1). All the optimizers in PyTorch
# whose arguments are also the same as those in PyTorch. (2). Custom optimizers
# which are built on `constructor`, referring to "tutorials/5_new_modules.md"
# for implementation.
type='Adam', # Type of optimizer, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13 for more details
lr=0.001, # Learning rate, see detail usages of the parameters in the documentation of PyTorch
weight_decay=0.0001) # Weight decay of Adam
optimizer_config = dict( # Config used to build the optimizer hook
grad_clip=None) # Most of the methods do not use gradient clip
# learning policy
lr_config = dict( # Learning rate scheduler config used to register LrUpdater hook
policy='step', # Policy of scheduler, also support CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9
step=7) # Steps to decay the learning rate
total_epochs = 9 # Total epochs to train the model
checkpoint_config = dict( # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation
interval=1) # Interval to save checkpoint
evaluation = dict( # Config of evaluation during training
interval=1, # Interval to perform evaluation
metrics=['AR@AN']) # Metrics to be performed
log_config = dict( # Config to register logger hook
interval=50, # Interval to print the log
hooks=[ # Hooks to be implemented during training
dict(type='TextLoggerHook'), # The logger used to record the training process
# dict(type='TensorboardLoggerHook'), # The Tensorboard logger is also supported
])
# runtime settings
dist_params = dict(backend='nccl') # Parameters to setup distributed training, the port can also be set
log_level = 'INFO' # The level of logging
work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/' # Directory to save the model checkpoints and logs for the current experiments
load_from = None # Load a model from the given path as a pre-trained model. This will not resume training
resume_from = None # Resume checkpoints from a given path, the training will be resumed from the epoch when the checkpoint was saved
workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once
output_config = dict( # Config of localization output
out=f'{work_dir}/results.json', # Path to output file
output_format='json') # File format of output file
```
### Config System for Action Recognition
We incorporate modular design into our config system,
which is convenient to conduct various experiments.
- An Example of TSN
To help users have a basic idea of a complete config structure and the modules in an action recognition system,
we add brief comments to the config of TSN below.
For more detailed usage and alternatives for each parameter in each module, please refer to the API documentation.
```python
# model settings
model = dict( # Config of the model
type='Recognizer2D', # Type of the recognizer
backbone=dict( # Dict for backbone
type='ResNet', # Name of the backbone
pretrained='torchvision://resnet50', # The url/site of the pretrained model
depth=50, # Depth of ResNet model
norm_eval=False), # Whether to set BN layers to eval mode when training
cls_head=dict( # Dict for classification head
type='TSNHead', # Name of classification head
num_classes=400, # Number of classes to be classified.
in_channels=2048, # The input channels of classification head.
spatial_type='avg', # Type of pooling in spatial dimension
consensus=dict(type='AvgConsensus', dim=1), # Config of consensus module
dropout_ratio=0.4, # Probability in dropout layer
init_std=0.01), # Std value for linear layer initialization
# model training and testing settings
train_cfg=None, # Config of training hyperparameters for TSN
test_cfg=dict(average_clips=None)) # Config for testing hyperparameters for TSN.
# dataset settings
dataset_type = 'RawframeDataset' # Type of dataset for training, validation and testing
data_root = 'data/kinetics400/rawframes_train/' # Root path to data for training
data_root_val = 'data/kinetics400/rawframes_val/' # Root path to data for validation and testing
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' # Path to the annotation file for training
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for validation
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' # Path to the annotation file for testing
img_norm_cfg = dict( # Config of image normalization used in data pipeline
mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize
std=[58.395, 57.12, 57.375], # Std values of different channels to normalize
to_bgr=False) # Whether to convert channels from RGB to BGR
train_pipeline = [ # List of training pipeline steps
dict( # Config of SampleFrames
type='SampleFrames', # Sample frames pipeline, sampling frames from video
clip_len=1, # Frames of each sampled output clip
frame_interval=1, # Temporal interval of adjacent sampled frames
num_clips=3), # Number of clips to be sampled
dict( # Config of RawFrameDecode
type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices
dict( # Config of Resize
type='Resize', # Resize pipeline
scale=(-1, 256)), # The scale to resize images
dict( # Config of MultiScaleCrop
type='MultiScaleCrop', # Multi scale crop pipeline, cropping images with a list of randomly selected scales
input_size=224, # Input size of the network
scales=(1, 0.875, 0.75, 0.66), # Scales of width and height to be selected
random_crop=False, # Whether to randomly sample cropping bbox
max_wh_scale_gap=1), # Maximum gap of w and h scale levels
dict( # Config of Resize
type='Resize', # Resize pipeline
scale=(224, 224), # The scale to resize images
keep_ratio=False), # Whether to resize with changing the aspect ratio
dict( # Config of Flip
type='Flip', # Flip Pipeline
flip_ratio=0.5), # Probability of implementing flip
dict( # Config of Normalize
type='Normalize', # Normalize pipeline
**img_norm_cfg), # Config of image normalization
dict( # Config of FormatShape
type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format
input_format='NCHW'), # Final image shape format
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the recognizer
keys=['imgs', 'label'], # Keys of input
meta_keys=[]), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['imgs', 'label']) # Keys to be converted from image to tensor
]
val_pipeline = [ # List of validation pipeline steps
dict( # Config of SampleFrames
type='SampleFrames', # Sample frames pipeline, sampling frames from video
clip_len=1, # Frames of each sampled output clip
frame_interval=1, # Temporal interval of adjacent sampled frames
num_clips=3, # Number of clips to be sampled
test_mode=True), # Whether to set test mode in sampling
dict( # Config of RawFrameDecode
type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices
dict( # Config of Resize
type='Resize', # Resize pipeline
scale=(-1, 256)), # The scale to resize images
dict( # Config of CenterCrop
type='CenterCrop', # Center crop pipeline, cropping the center area from images
crop_size=224), # The size to crop images
dict( # Config of Flip
type='Flip', # Flip pipeline
flip_ratio=0), # Probability of implementing flip
dict( # Config of Normalize
type='Normalize', # Normalize pipeline
**img_norm_cfg), # Config of image normalization
dict( # Config of FormatShape
type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format
input_format='NCHW'), # Final image shape format
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the recognizer
keys=['imgs', 'label'], # Keys of input
meta_keys=[]), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['imgs']) # Keys to be converted from image to tensor
]
test_pipeline = [ # List of testing pipeline steps
dict( # Config of SampleFrames
type='SampleFrames', # Sample frames pipeline, sampling frames from video
clip_len=1, # Frames of each sampled output clip
frame_interval=1, # Temporal interval of adjacent sampled frames
num_clips=25, # Number of clips to be sampled
test_mode=True), # Whether to set test mode in sampling
dict( # Config of RawFrameDecode
type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices
dict( # Config of Resize
type='Resize', # Resize pipeline
scale=(-1, 256)), # The scale to resize images
dict( # Config of TenCrop
type='TenCrop', # Ten crop pipeline, cropping ten area from images
crop_size=224), # The size to crop images
dict( # Config of Flip
type='Flip', # Flip pipeline
flip_ratio=0), # Probability of implementing flip
dict( # Config of Normalize
type='Normalize', # Normalize pipeline
**img_norm_cfg), # Config of image normalization
dict( # Config of FormatShape
type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format
input_format='NCHW'), # Final image shape format
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the recognizer
keys=['imgs', 'label'], # Keys of input
meta_keys=[]), # Meta keys of input
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['imgs']) # Keys to be converted from image to tensor
]
data = dict( # Config of data
videos_per_gpu=32, # Batch size of each single GPU
workers_per_gpu=2, # Workers to pre-fetch data for each single GPU
train_dataloader=dict( # Additional config of train dataloader
drop_last=True), # Whether to drop out the last batch of data in training
val_dataloader=dict( # Additional config of validation dataloader
videos_per_gpu=1), # Batch size of each single GPU during evaluation
test_dataloader=dict( # Additional config of test dataloader
videos_per_gpu=2), # Batch size of each single GPU during testing
train=dict( # Training dataset config
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict( # Validation dataset config
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict( # Testing dataset config
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
# Config used to build optimizer, support (1). All the optimizers in PyTorch
# whose arguments are also the same as those in PyTorch. (2). Custom optimizers
# which are built on `constructor`, referring to "tutorials/5_new_modules.md"
# for implementation.
type='SGD', # Type of optimizer, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13 for more details
lr=0.01, # Learning rate, see detail usages of the parameters in the documentation of PyTorch
momentum=0.9, # Momentum,
weight_decay=0.0001) # Weight decay of SGD
optimizer_config = dict( # Config used to build the optimizer hook
grad_clip=dict(max_norm=40, norm_type=2)) # Use gradient clip
# learning policy
lr_config = dict( # Learning rate scheduler config used to register LrUpdater hook
policy='step', # Policy of scheduler, also support CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9
step=[40, 80]) # Steps to decay the learning rate
total_epochs = 100 # Total epochs to train the model
checkpoint_config = dict( # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation
interval=5) # Interval to save checkpoint
evaluation = dict( # Config of evaluation during training
interval=5, # Interval to perform evaluation
metrics=['top_k_accuracy', 'mean_class_accuracy'], # Metrics to be performed
metric_options=dict(top_k_accuracy=dict(topk=(1, 3))), # Set top-k accuracy to 1 and 3 during validation
save_best='top_k_accuracy') # set `top_k_accuracy` as key indicator to save best checkpoint
eval_config = dict(
metric_options=dict(top_k_accuracy=dict(topk=(1, 3)))) # Set top-k accuracy to 1 and 3 during testing. You can also use `--eval top_k_accuracy` to assign evaluation metrics
log_config = dict( # Config to register logger hook
interval=20, # Interval to print the log
hooks=[ # Hooks to be implemented during training
dict(type='TextLoggerHook'), # The logger used to record the training process
# dict(type='TensorboardLoggerHook'), # The Tensorboard logger is also supported
])
# runtime settings
dist_params = dict(backend='nccl') # Parameters to setup distributed training, the port can also be set
log_level = 'INFO' # The level of logging
work_dir = './work_dirs/tsn_r50_1x1x3_100e_kinetics400_rgb/' # Directory to save the model checkpoints and logs for the current experiments
load_from = None # Load a model from the given path as a pre-trained model. This will not resume training
resume_from = None # Resume checkpoints from a given path, the training will be resumed from the epoch when the checkpoint was saved
workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once
```
### Config System for Spatio-Temporal Action Detection
We incorporate modular design into our config system, which is convenient to conduct various experiments.
- An Example of FastRCNN
To help users have a basic idea of a complete config structure and the modules in a spatio-temporal action detection system,
we add brief comments to the config of FastRCNN below.
For more detailed usage and alternatives for each parameter in each module, please refer to the API documentation.
```python
# model setting
model = dict( # Config of the model
type='FastRCNN', # Type of the detector
backbone=dict( # Dict for backbone
type='ResNet3dSlowOnly', # Name of the backbone
depth=50, # Depth of ResNet model
pretrained=None, # The url/site of the pretrained model
pretrained2d=False, # If the pretrained model is 2D
lateral=False, # If the backbone is with lateral connections
num_stages=4, # Stages of ResNet model
conv1_kernel=(1, 7, 7), # Conv1 kernel size
conv1_stride_t=1, # Conv1 temporal stride
pool1_stride_t=1, # Pool1 temporal stride
spatial_strides=(1, 2, 2, 1)), # The spatial stride for each ResNet stage
roi_head=dict( # Dict for roi_head
type='AVARoIHead', # Name of the roi_head
bbox_roi_extractor=dict( # Dict for bbox_roi_extractor
type='SingleRoIExtractor3D', # Name of the bbox_roi_extractor
roi_layer_type='RoIAlign', # Type of the RoI op
output_size=8, # Output feature size of the RoI op
with_temporal_pool=True), # If temporal dim is pooled
bbox_head=dict( # Dict for bbox_head
type='BBoxHeadAVA', # Name of the bbox_head
in_channels=2048, # Number of channels of the input feature
num_classes=81, # Number of action classes + 1
multilabel=True, # If the dataset is multilabel
dropout_ratio=0.5)), # The dropout ratio used
# model training and testing settings
train_cfg=dict( # Training config of FastRCNN
rcnn=dict( # Dict for rcnn training config
assigner=dict( # Dict for assigner
type='MaxIoUAssignerAVA', # Name of the assigner
pos_iou_thr=0.9, # IoU threshold for positive examples, > pos_iou_thr -> positive
neg_iou_thr=0.9, # IoU threshold for negative examples, < neg_iou_thr -> negative
min_pos_iou=0.9), # Minimum acceptable IoU for positive examples
sampler=dict( # Dict for sample
type='RandomSampler', # Name of the sampler
num=32, # Batch Size of the sampler
pos_fraction=1, # Positive bbox fraction of the sampler
neg_pos_ub=-1, # Upper bound of the ratio of num negative to num positive
add_gt_as_proposals=True), # Add gt bboxes as proposals
pos_weight=1.0, # Loss weight of positive examples
debug=False)), # Debug mode
test_cfg=dict( # Testing config of FastRCNN
rcnn=dict( # Dict for rcnn testing config
action_thr=0.002))) # The threshold of an action
# dataset settings
dataset_type = 'AVADataset' # Type of dataset for training, validation and testing
data_root = 'data/ava/rawframes' # Root path to data
anno_root = 'data/ava/annotations' # Root path to annotations
ann_file_train = f'{anno_root}/ava_train_v2.1.csv' # Path to the annotation file for training
ann_file_val = f'{anno_root}/ava_val_v2.1.csv' # Path to the annotation file for validation
exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for training
exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' # Path to the exclude annotation file for validation
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' # Path to the label file
proposal_file_train = f'{anno_root}/ava_dense_proposals_train.FAIR.recall_93.9.pkl' # Path to the human detection proposals for training examples
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' # Path to the human detection proposals for validation examples
img_norm_cfg = dict( # Config of image normalization used in data pipeline
mean=[123.675, 116.28, 103.53], # Mean values of different channels to normalize
std=[58.395, 57.12, 57.375], # Std values of different channels to normalize
to_bgr=False) # Whether to convert channels from RGB to BGR
train_pipeline = [ # List of training pipeline steps
dict( # Config of SampleFrames
type='AVASampleFrames', # Sample frames pipeline, sampling frames from video
clip_len=4, # Frames of each sampled output clip
frame_interval=16), # Temporal interval of adjacent sampled frames
dict( # Config of RawFrameDecode
type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices
dict( # Config of RandomRescale
type='RandomRescale', # Randomly rescale the short edge within a given range
scale_range=(256, 320)), # The short-edge size range of RandomRescale
dict( # Config of RandomCrop
type='RandomCrop', # Randomly crop a patch with the given size
size=256), # The size of the cropped patch
dict( # Config of Flip
type='Flip', # Flip Pipeline
flip_ratio=0.5), # Probability of implementing flip
dict( # Config of Normalize
type='Normalize', # Normalize pipeline
**img_norm_cfg), # Config of image normalization
dict( # Config of FormatShape
type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format
input_format='NCTHW', # Final image shape format
collapse=True), # Collapse the dim N if N == 1
dict( # Config of Rename
type='Rename', # Rename keys
mapping=dict(imgs='img')), # The old name to new name mapping
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']), # Keys to be converted from image to tensor
dict( # Config of ToDataContainer
type='ToDataContainer', # Convert other types to DataContainer type pipeline
fields=[ # Fields to convert to DataContainer
dict( # Dict of fields
key=['proposals', 'gt_bboxes', 'gt_labels'], # Keys to Convert to DataContainer
stack=False)]), # Whether to stack these tensor
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the detector
keys=['img', 'proposals', 'gt_bboxes', 'gt_labels'], # Keys of input
meta_keys=['scores', 'entity_ids']), # Meta keys of input
]
val_pipeline = [ # List of validation pipeline steps
dict( # Config of SampleFrames
type='AVASampleFrames', # Sample frames pipeline, sampling frames from video
clip_len=4, # Frames of each sampled output clip
frame_interval=16), # Temporal interval of adjacent sampled frames
dict( # Config of RawFrameDecode
type='RawFrameDecode'), # Load and decode Frames pipeline, picking raw frames with given indices
dict( # Config of Resize
type='Resize', # Resize pipeline
scale=(-1, 256)), # The scale to resize images
dict( # Config of Normalize
type='Normalize', # Normalize pipeline
**img_norm_cfg), # Config of image normalization
dict( # Config of FormatShape
type='FormatShape', # Format shape pipeline, Format final image shape to the given input_format
input_format='NCTHW', # Final image shape format
collapse=True), # Collapse the dim N if N == 1
dict( # Config of Rename
type='Rename', # Rename keys
mapping=dict(imgs='img')), # The old name to new name mapping
dict( # Config of ToTensor
type='ToTensor', # Convert other types to tensor type pipeline
keys=['img', 'proposals']), # Keys to be converted from image to tensor
dict( # Config of ToDataContainer
type='ToDataContainer', # Convert other types to DataContainer type pipeline
fields=[ # Fields to convert to DataContainer
dict( # Dict of fields
key=['proposals'], # Keys to Convert to DataContainer
stack=False)]), # Whether to stack these tensor
dict( # Config of Collect
type='Collect', # Collect pipeline that decides which keys in the data should be passed to the detector
keys=['img', 'proposals'], # Keys of input
meta_keys=['scores', 'entity_ids'], # Meta keys of input
nested=True) # Whether to wrap the data in a nested list
]
data = dict( # Config of data
videos_per_gpu=16, # Batch size of each single GPU
workers_per_gpu=2, # Workers to pre-fetch data for each single GPU
val_dataloader=dict( # Additional config of validation dataloader
videos_per_gpu=1), # Batch size of each single GPU during evaluation
train=dict( # Training dataset config
type=dataset_type,
ann_file=ann_file_train,
exclude_file=exclude_file_train,
pipeline=train_pipeline,
label_file=label_file,
proposal_file=proposal_file_train,
person_det_score_thr=0.9,
data_prefix=data_root),
val=dict( # Validation dataset config
type=dataset_type,
ann_file=ann_file_val,
exclude_file=exclude_file_val,
pipeline=val_pipeline,
label_file=label_file,
proposal_file=proposal_file_val,
person_det_score_thr=0.9,
data_prefix=data_root))
data['test'] = data['val'] # Set test_dataset as val_dataset
# optimizer
optimizer = dict(
# Config used to build optimizer, support (1). All the optimizers in PyTorch
# whose arguments are also the same as those in PyTorch. (2). Custom optimizers
# which are built on `constructor`, referring to "tutorials/5_new_modules.md"
# for implementation.
type='SGD', # Type of optimizer, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13 for more details
lr=0.2, # Learning rate, see detail usages of the parameters in the documentation of PyTorch (for 8gpu)
momentum=0.9, # Momentum,
weight_decay=0.00001) # Weight decay of SGD
optimizer_config = dict( # Config used to build the optimizer hook
grad_clip=dict(max_norm=40, norm_type=2)) # Use gradient clip
lr_config = dict( # Learning rate scheduler config used to register LrUpdater hook
policy='step', # Policy of scheduler, also support CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9
step=[40, 80], # Steps to decay the learning rate
warmup='linear', # Warmup strategy
warmup_by_epoch=True, # Whether warmup_iters counts epochs (True) or iterations (False)
warmup_iters=5, # Number of iters or epochs for warmup
warmup_ratio=0.1) # The initial learning rate is warmup_ratio * lr
total_epochs = 20 # Total epochs to train the model
checkpoint_config = dict( # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation
interval=1) # Interval to save checkpoint
workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once
evaluation = dict( # Config of evaluation during training
interval=1, save_best='mAP@0.5IOU') # Interval to perform evaluation and the key for saving best checkpoint
log_config = dict( # Config to register logger hook
interval=20, # Interval to print the log
hooks=[ # Hooks to be implemented during training
dict(type='TextLoggerHook'), # The logger used to record the training process
])
# runtime settings
dist_params = dict(backend='nccl') # Parameters to setup distributed training, the port can also be set
log_level = 'INFO' # The level of logging
work_dir = ('./work_dirs/ava/' # Directory to save the model checkpoints and logs for the current experiments
'slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb')
load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' # load models as a pre-trained model from a given path. This will not resume training
'slowonly_r50_4x16x1_256e_kinetics400_rgb/'
'slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth')
resume_from = None # Resume checkpoints from a given path, the training will be resumed from the epoch when the checkpoint was saved
```
## FAQ
### Use intermediate variables in configs
Some intermediate variables are used in the config files, like `train_pipeline`/`val_pipeline`/`test_pipeline`,
`ann_file_train`/`ann_file_val`/`ann_file_test`, `img_norm_cfg` etc.
For example, we would like to first define `train_pipeline`/`val_pipeline`/`test_pipeline` and pass them into `data`.
Thus, `train_pipeline`/`val_pipeline`/`test_pipeline` are intermediate variables.
We also define `ann_file_train`/`ann_file_val`/`ann_file_test` and `data_root`/`data_root_val` to provide the data pipeline with some
basic information.
In addition, we use `img_norm_cfg` as an intermediate variable to construct data augmentation components.
```python
...
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
```
# Tutorial 2: Finetuning Models
This tutorial provides instructions on how to use pre-trained models and finetune them
on other datasets, so that better performance can be achieved.
<!-- TOC -->
- [Outline](#outline)
- [Modify Head](#modify-head)
- [Modify Dataset](#modify-dataset)
- [Modify Training Schedule](#modify-training-schedule)
- [Use Pre-Trained Model](#use-pre-trained-model)
<!-- TOC -->
## Outline
There are two steps to finetune a model on a new dataset.
1. Add support for the new dataset. See [Tutorial 3: Adding New Dataset](3_new_dataset.md).
2. Modify the configs. This will be discussed in this tutorial.
For example, if users want to finetune a model pre-trained on the Kinetics-400 dataset on another dataset, say UCF101,
then four parts of the config (see [here](1_config.md)) need attention.
## Modify Head
The `num_classes` in the `cls_head` needs to be changed to the class number of the new dataset.
The weights of the pre-trained model are reused except for the final prediction layer,
so it is safe to change the class number.
In our case, UCF101 has 101 classes,
so we change it from 400 (the class number of Kinetics-400) to 101.
```python
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=101, # change from 400 to 101
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.4,
init_std=0.01),
train_cfg=None,
test_cfg=dict(average_clips=None))
```
Note that the `pretrained='torchvision://resnet50'` setting is used for initializing the backbone.
If you are training a new model from ImageNet-pretrained weights, this is for you.
However, this setting is not related to our task at hand.
What we need is `load_from`, which will be discussed later.
## Modify Dataset
MMAction2 supports UCF101, Kinetics-400, Moments in Time, Multi-Moments in Time, THUMOS14,
Something-Something V1&V2, and the ActivityNet dataset.
Users may need to adapt one of the above datasets to fit their own dataset.
In our case, UCF101 is already supported by various dataset types, like `RawframeDataset`,
so we change the config as follows.
```python
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/ucf101/rawframes_train/'
data_root_val = 'data/ucf101/rawframes_val/'
ann_file_train = 'data/ucf101/ucf101_train_list.txt'
ann_file_val = 'data/ucf101/ucf101_val_list.txt'
ann_file_test = 'data/ucf101/ucf101_val_list.txt'
```
## Modify Training Schedule
Finetuning usually requires a smaller learning rate and fewer training epochs.
```python
# optimizer
optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001) # change from 0.01 to 0.005
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50 # change from 100 to 50
checkpoint_config = dict(interval=5)
```
## Use Pre-Trained Model
To use the pre-trained model for the whole network, the new config adds the link to the pre-trained model in `load_from`.
We set `load_from=None` as the default in `configs/_base_/default_runtime.py` and, owing to the [inheritance design](/docs/tutorials/1_config.md), users can directly change it by setting `load_from` in their configs.
```python
# use the pre-trained model for the whole TSN network
load_from = 'https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/mmaction-v1/recognition/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth' # model path can be found in model zoo
```
# Tutorial 3: Adding New Dataset
In this tutorial, we will introduce how to customize your own dataset for the project by reorganizing data and mixing datasets.
<!-- TOC -->
- [Customize Datasets by Reorganizing Data](#customize-datasets-by-reorganizing-data)
- [Reorganize datasets to existing format](#reorganize-datasets-to-existing-format)
- [An example of a custom dataset](#an-example-of-a-custom-dataset)
- [Customize Dataset by Mixing Dataset](#customize-dataset-by-mixing-dataset)
- [Repeat dataset](#repeat-dataset)
<!-- TOC -->
## Customize Datasets by Reorganizing Data
### Reorganize datasets to existing format
The simplest way is to convert your dataset to an existing dataset format (`RawframeDataset` or `VideoDataset`).
There are three kinds of annotation files.
- rawframe annotation
The annotation of a rawframe dataset is a text file with multiple lines,
where each line indicates the `frame_directory` (relative path) of a video,
the `total_frames` of the video and the `label` of the video, separated by whitespace.
Here is an example.
```
some/directory-1 163 1
some/directory-2 122 1
some/directory-3 258 2
some/directory-4 234 2
some/directory-5 295 3
some/directory-6 121 3
```
- video annotation
The annotation of a video dataset is a text file with multiple lines,
where each line indicates a sample video with its `filepath` (relative path) and `label`,
separated by whitespace.
Here is an example.
```
some/path/000.mp4 1
some/path/001.mp4 1
some/path/002.mp4 2
some/path/003.mp4 2
some/path/004.mp4 3
some/path/005.mp4 3
```
- ActivityNet annotation
The annotation of ActivityNet dataset is a json file. Each key is a video name
and the corresponding value is the meta data and annotation for the video.
Here is an example.
```
{
"video1": {
"duration_second": 211.53,
"duration_frame": 6337,
"annotations": [
{
"segment": [
30.025882995319815,
205.2318595943838
],
"label": "Rock climbing"
}
],
"feature_frame": 6336,
"fps": 30.0,
"rfps": 29.9579255898
},
"video2": {
"duration_second": 26.75,
"duration_frame": 647,
"annotations": [
{
"segment": [
2.578755070202808,
24.914101404056165
],
"label": "Drinking beer"
}
],
"feature_frame": 624,
"fps": 24.0,
"rfps": 24.1869158879
}
}
```
There are two ways to work with custom datasets.
- online conversion
You can write a new Dataset class inherited from [BaseDataset](/mmaction/datasets/base.py), and overwrite three methods
`load_annotations(self)`, `evaluate(self, results, metrics, logger)` and `dump_results(self, results, out)`,
like [RawframeDataset](/mmaction/datasets/rawframe_dataset.py), [VideoDataset](/mmaction/datasets/video_dataset.py) or [ActivityNetDataset](/mmaction/datasets/activitynet_dataset.py).
- offline conversion
You can convert the annotation format to the expected format above and save it to
a pickle or json file, then simply use `RawframeDataset`, `VideoDataset` or `ActivityNetDataset` (a minimal conversion sketch is shown below).
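For the offline route, here is a minimal conversion sketch; the source annotation format (`my_annotations.json` with `video`/`label` records) is only hypothetical, and the output follows the video-dataset list format described above.

```python
import json

# Hypothetical source: a JSON list of {"video": "some/path/000.mp4", "label": 1} records.
with open('my_annotations.json', 'r') as f:
    records = json.load(f)

# Write the whitespace-separated "filepath label" list expected by VideoDataset.
with open('my_video_list.txt', 'w') as f:
    for rec in records:
        f.write(f"{rec['video']} {rec['label']}\n")
```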
After the data pre-processing, the users need to further modify the config files to use the dataset.
Here is an example of using a custom dataset in rawframe format.
In `configs/task/method/my_custom_config.py`:
```python
...
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'path/to/your/root'
data_root_val = 'path/to/your/root_val'
ann_file_train = 'data/custom/custom_train_list.txt'
ann_file_val = 'data/custom/custom_val_list.txt'
ann_file_test = 'data/custom/custom_val_list.txt'
...
data = dict(
videos_per_gpu=32,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
...),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
...),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
...))
...
```
We use this approach to support the rawframe dataset.
### An example of a custom dataset
Assume the annotations are in a new format in text files, and the image file names follow a template like `img_00005.jpg`.
The video annotations are stored in the text file `annotation.txt` as follows.
```
directory,total frames,class
D32_1gwq35E,299,66
-G-5CJ0JkKY,249,254
T4h1bvOd9DA,299,33
4uZ27ivBl00,299,341
0LfESFkfBSw,249,186
-YIsNpBEx6c,299,169
```
We can create a new dataset in `mmaction/datasets/my_dataset.py` to load the data.
```python
import copy
import os.path as osp
import mmcv
from .base import BaseDataset
from .builder import DATASETS
@DATASETS.register_module()
class MyDataset(BaseDataset):
def __init__(self,
ann_file,
pipeline,
data_prefix=None,
test_mode=False,
filename_tmpl='img_{:05}.jpg'):
super().__init__(ann_file, pipeline, data_prefix=data_prefix, test_mode=test_mode)
self.filename_tmpl = filename_tmpl
def load_annotations(self):
video_infos = []
with open(self.ann_file, 'r') as fin:
for line in fin:
if line.startswith("directory"):
continue
frame_dir, total_frames, label = line.split(',')
if self.data_prefix is not None:
frame_dir = osp.join(self.data_prefix, frame_dir)
video_infos.append(
dict(
frame_dir=frame_dir,
total_frames=int(total_frames),
label=int(label)))
return video_infos
def prepare_train_frames(self, idx):
results = copy.deepcopy(self.video_infos[idx])
results['filename_tmpl'] = self.filename_tmpl
return self.pipeline(results)
def prepare_test_frames(self, idx):
results = copy.deepcopy(self.video_infos[idx])
results['filename_tmpl'] = self.filename_tmpl
return self.pipeline(results)
def evaluate(self,
results,
metrics='top_k_accuracy',
topk=(1, 5),
logger=None):
pass
```
Then in the config, to use `MyDataset`, you can modify the config as follows.
```python
dataset_A_train = dict(
type='MyDataset',
ann_file=ann_file_train,
pipeline=train_pipeline
)
```
## Customize Dataset by Mixing Dataset
MMAction2 also supports mixing datasets for training. Currently it supports repeating a dataset.
### Repeat dataset
We use `RepeatDataset` as a wrapper to repeat the dataset. For example, suppose the original dataset is `Dataset_A`;
to repeat it, the config looks like the following.
```python
dataset_A_train = dict(
type='RepeatDataset',
times=N,
dataset=dict( # This is the original config of Dataset_A
type='Dataset_A',
...
pipeline=train_pipeline
)
)
```
# Tutorial 4: Customize Data Pipelines
In this tutorial, we will introduce the design of data pipelines, and how to customize and extend your own data pipelines for the project.
<!-- TOC -->
- [Tutorial 4: Customize Data Pipelines](#tutorial-4-customize-data-pipelines)
- [Design of Data Pipelines](#design-of-data-pipelines)
- [Data loading](#data-loading)
- [Pre-processing](#pre-processing)
- [Formatting](#formatting)
- [Extend and Use Custom Pipelines](#extend-and-use-custom-pipelines)
<!-- TOC -->
## Design of Data Pipelines
Following typical conventions, we use `Dataset` and `DataLoader` for data loading
with multiple workers. `Dataset` returns a dict of data items corresponding to
the arguments of the model's forward method.
Since the data in action recognition & localization may not be the same size (image size, gt bbox size, etc.),
the `DataContainer` in MMCV is used to help collect and distribute data of different sizes.
See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
The data preparation pipeline and the dataset are decoupled. Usually a dataset
defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and outputs a dict for the next operation.
We present a typical pipeline in the following figure. The blue blocks are pipeline operations.
As the pipeline proceeds, each operator can add new keys (marked in green) to the result dict or update the existing keys (marked in orange).
![pipeline figure](https://github.com/open-mmlab/mmaction2/raw/master/resources/data_pipeline.png)
The operations are categorized into data loading, pre-processing and formatting.
Here is a pipeline example for TSN.
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
dict(type='RawFrameDecode', io_backend='disk'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=3,
test_mode=True),
dict(type='RawFrameDecode', io_backend='disk'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='RawFrameDecode', io_backend='disk'),
dict(type='Resize', scale=(-1, 256)),
dict(type='TenCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
```
We also support some lazy operators and encourage users to apply them.
Lazy ops record how the data should be processed, but postpone the actual processing of the raw data until the data reaches the `Fuse` stage.
Specifically, lazy ops avoid frequent read and modification operations on the raw data and instead process the raw data once in the final `Fuse` stage, thus accelerating data preprocessing.
Here is a pipeline example applying lazy ops.
```python
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode', decoding_backend='turbojpeg'),
# The following three lazy ops only process the bbox of frames without
# modifying the raw data.
dict(type='Resize', scale=(-1, 256), lazy=True),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0,
lazy=True),
dict(type='Resize', scale=(224, 224), keep_ratio=False, lazy=True),
# The lazy operator `Flip` only records whether a frame should be flipped and
# the flip direction.
dict(type='Flip', flip_ratio=0.5, lazy=True),
# Process the raw data once in the `Fuse` stage.
dict(type='Fuse'),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
```
For each operation, we list the related dict fields that are added/updated/removed below, where `*` means the key may not be affected.
### Data loading
`SampleFrames`
- add: frame_inds, clip_len, frame_interval, num_clips, \*total_frames
`DenseSampleFrames`
- add: frame_inds, clip_len, frame_interval, num_clips, \*total_frames
`PyAVDecode`
- add: imgs, original_shape
- update: \*frame_inds
`DecordDecode`
- add: imgs, original_shape
- update: \*frame_inds
`OpenCVDecode`
- add: imgs, original_shape
- update: \*frame_inds
`RawFrameDecode`
- add: imgs, original_shape
- update: \*frame_inds
### Pre-processing
`RandomCrop`
- add: crop_bbox, img_shape
- update: imgs
`RandomResizedCrop`
- add: crop_bbox, img_shape
- update: imgs
`MultiScaleCrop`
- add: crop_bbox, img_shape, scales
- update: imgs
`Resize`
- add: img_shape, keep_ratio, scale_factor
- update: imgs
`Flip`
- add: flip, flip_direction
- update: imgs, label
`Normalize`
- add: img_norm_cfg
- update: imgs
`CenterCrop`
- add: crop_bbox, img_shape
- update: imgs
`ThreeCrop`
- add: crop_bbox, img_shape
- update: imgs
`TenCrop`
- add: crop_bbox, img_shape
- update: imgs
### Formatting
`ToTensor`
- update: specified by `keys`.
`ImageToTensor`
- update: specified by `keys`.
`Transpose`
- update: specified by `keys`.
`Collect`
- add: img_metas (the keys of img_metas is specified by `meta_keys`)
- remove: all other keys except for those specified by `keys`
It is **noteworthy** that the first key, commonly `imgs`, will be used as the main key to calculate the batch size.
`FormatShape`
- add: input_shape
- update: imgs
## Extend and Use Custom Pipelines
1. Write a new pipeline in any file, e.g., `my_pipeline.py`. It takes a dict as input and returns a dict.
```python
from mmaction.datasets import PIPELINES


@PIPELINES.register_module()
class MyTransform:

    def __call__(self, results):
        # add or update fields of the result dict here
        results['my_key'] = 'my_value'
        return results
```
2. Import the new class.
```python
from .my_pipeline import MyTransform
```
3. Use it in config files.
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='DenseSampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='RawFrameDecode', io_backend='disk'),
dict(type='MyTransform'), # use a custom pipeline
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
```
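Since each pipeline operation is just a callable on a dict, a custom transform can also be checked in isolation before wiring it into a config. A minimal sketch, assuming the `MyTransform` class above lives in a module named `my_pipeline.py` (a hypothetical path):
```python
from my_pipeline import MyTransform  # hypothetical module from step 1

transform = MyTransform()
results = dict(imgs=[], label=0)  # a dummy result dict
results = transform(results)
print(results['my_key'])  # -> 'my_value'
```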
# Tutorial 5: Adding New Modules
In this tutorial, we will introduce how to customize the optimizer, develop new components, and add a new learning rate scheduler for this project.
<!-- TOC -->
- [Customize Optimizer](#customize-optimizer)
- [Customize Optimizer Constructor](#customize-optimizer-constructor)
- [Develop New Components](#develop-new-components)
- [Add new backbones](#add-new-backbones)
- [Add new heads](#add-new-heads)
- [Add new loss](#add-new-loss)
- [Add new learning rate scheduler (updater)](#add-new-learning-rate-scheduler--updater-)
<!-- TOC -->
## Customize Optimizer
An example of a customized optimizer is [CopyOfSGD](/mmaction/core/optimizer/copy_of_sgd.py), defined in `mmaction/core/optimizer/copy_of_sgd.py`.
More generally, a customized optimizer could be defined as follows.
Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b` and `c`.
You need to first implement the new optimizer in a file, e.g., in `mmaction/core/optimizer/my_optimizer.py`:
```python
from mmcv.runner import OPTIMIZERS
from torch.optim import Optimizer


@OPTIMIZERS.register_module()
class MyOptimizer(Optimizer):

    def __init__(self, a, b, c):
        # implement the optimizer here
        pass
```
Then add this module in `mmaction/core/optimizer/__init__.py` so that the registry can find the new module and register it:
```python
from .my_optimizer import MyOptimizer
```
Then you can use `MyOptimizer` in the `optimizer` field of config files.
In the configs, the optimizers are defined by the field `optimizer` like the following:
```python
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
```
To use your own optimizer, the field can be changed to
```python
optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)
```
We already support all the optimizers implemented by PyTorch; the only modification needed is to change the `optimizer` field of config files.
For example, if you want to use `Adam` (though the performance may drop a lot), the modification could be as follows.
```python
optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001)
```
The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
## Customize Optimizer Constructor
Some models may have parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
Users can do such fine-grained parameter tuning by customizing the optimizer constructor.
You can write a new optimizer constructor that inherits from [DefaultOptimizerConstructor](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py)
and overwrites the `add_params(self, params, module)` method.
An example of a customized optimizer constructor is [TSMOptimizerConstructor](/mmaction/core/optimizer/tsm_optimizer_constructor.py).
More generally, a customized optimizer constructor could be defined as follows.
In `mmaction/core/optimizer/my_optimizer_constructor.py`:
```python
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor


@OPTIMIZER_BUILDERS.register_module()
class MyOptimizerConstructor(DefaultOptimizerConstructor):

    def add_params(self, params, module, **kwargs):
        pass  # assign parameter-specific options here
```
In `mmaction/core/optimizer/__init__.py`:
```python
from .my_optimizer_constructor import MyOptimizerConstructor
```
Then you can use `MyOptimizerConstructor` in the `optimizer` field of config files.
```python
# optimizer
optimizer = dict(
type='SGD',
constructor='MyOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.02,
momentum=0.9,
weight_decay=0.0001)
```
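For illustration only, a hedged sketch of what such an `add_params` override might look like is shown below. This is not the actual `TSMOptimizerConstructor`; the class name and the 5x learning-rate rule for fc-layer parameters are assumed purely as an example.
```python
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor


@OPTIMIZER_BUILDERS.register_module()
class FcLr5OptimizerConstructor(DefaultOptimizerConstructor):
    """Hypothetical constructor: give fc-layer parameters a 5x learning rate."""

    def add_params(self, params, module, **kwargs):
        # Note: `add_params` is only invoked when a non-empty `paramwise_cfg`
        # is given in the optimizer config, e.g. paramwise_cfg=dict(fc_lr5=True).
        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue
            group = {'params': [param]}
            # `self.base_lr` and `self.paramwise_cfg` are populated by
            # DefaultOptimizerConstructor.__init__
            if self.paramwise_cfg.get('fc_lr5', False) and 'fc' in name:
                group['lr'] = self.base_lr * 5
            params.append(group)
```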
## Develop New Components
We basically categorize model components into 4 types.
- recognizer: the whole recognizer model pipeline, usually contains a backbone and cls_head.
- backbone: usually an FCN network to extract feature maps, e.g., ResNet, BNInception.
- cls_head: the component for classification task, usually contains an FC layer with some pooling layers.
- localizer: the model for temporal localization task, currently available: BSN, BMN, SSN.
### Add new backbones
Here we show how to develop new components, using TSN as an example.
1. Create a new file `mmaction/models/backbones/resnet.py`.
```python
import torch.nn as nn

from ..builder import BACKBONES


@BACKBONES.register_module()
class ResNet(nn.Module):

    def __init__(self, arg1, arg2):
        pass

    def forward(self, x):  # should return a tuple
        pass

    def init_weights(self, pretrained=None):
        pass
```
2. Import the module in `mmaction/models/backbones/__init__.py`.
```python
from .resnet import ResNet
```
3. Use it in your config file.
```python
model = dict(
...
backbone=dict(
type='ResNet',
arg1=xxx,
arg2=xxx),
)
```
### Add new heads
Here we show how to develop a new head, using TSNHead as an example.
1. Create a new file `mmaction/models/heads/tsn_head.py`.
You can write a new classification head inheriting from [BaseHead](/mmaction/models/heads/base.py),
and overwrite the `init_weights(self)` and `forward(self, x)` methods.
```python
from ..builder import HEADS
from .base import BaseHead


@HEADS.register_module()
class TSNHead(BaseHead):

    def __init__(self, arg1, arg2):
        pass

    def forward(self, x):
        pass

    def init_weights(self):
        pass
```
2. Import the module in `mmaction/models/heads/__init__.py`
```python
from .tsn_head import TSNHead
```
3. Use it in your config file
```python
model = dict(
...
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
arg1=xxx,
        arg2=xxx))
```
### Add new loss
Assume you want to add a new loss as `MyLoss`. To add a new loss function, users need to implement it in `mmaction/models/losses/my_loss.py`.
```python
import torch
import torch.nn as nn

from ..builder import LOSSES


def my_loss(pred, target):
    assert pred.size() == target.size() and target.numel() > 0
    loss = torch.abs(pred - target)
    return loss


@LOSSES.register_module()
class MyLoss(nn.Module):

    def forward(self, pred, target):
        loss = my_loss(pred, target)
        return loss
```
Then the users need to add it in the `mmaction/models/losses/__init__.py`
```python
from .my_loss import MyLoss, my_loss
```
To use it, modify the `loss_xxx` field. Since MyLoss is for regression, we can use it for the bbox loss `loss_bbox`.
```python
loss_bbox=dict(type='MyLoss'))
```
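To sanity-check the new loss in isolation (assuming the two files above have been added as described), a quick sketch is:
```python
import torch
from mmaction.models.losses import my_loss  # exported by the __init__.py above

pred = torch.zeros(2, 3)
target = torch.ones(2, 3)
print(my_loss(pred, target))  # element-wise L1 distance, a tensor of ones
```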
## Add new learning rate scheduler (updater)
The default way of constructing an lr updater (called a 'scheduler' in PyTorch convention) is to modify the config, for example:
```python
...
lr_config = dict(policy='step', step=[20, 40])
...
```
In the API of [`train.py`](/mmaction/apis/train.py), the learning rate updater hook is registered based on the config at:
```python
...
runner.register_training_hooks(
cfg.lr_config,
optimizer_config,
cfg.checkpoint_config,
cfg.log_config,
cfg.get('momentum_config', None))
...
```
So far, the supported updaters can be found in [mmcv](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py), but if you want to customize a new learning rate updater, you may follow the steps below:
1. First, write your own LrUpdaterHook in `$MMAction2/mmaction/core/scheduler`. The snippet below is an example of a customized lr updater that uses a list of learning rates `lrs`, switching to the next value at each milestone in `steps`:
```python
from mmcv.runner import HOOKS
from mmcv.runner.hooks import LrUpdaterHook


# Register the new updater here
@HOOKS.register_module()
class RelativeStepLrUpdaterHook(LrUpdaterHook):
    # The updater should inherit from mmcv's LrUpdaterHook

    def __init__(self, steps, lrs, **kwargs):
        super().__init__(**kwargs)
        assert len(steps) == len(lrs)
        self.steps = steps
        self.lrs = lrs

    def get_lr(self, runner, base_lr):
        # Only this method needs to be overridden. It is called before each
        # training epoch (or iteration) and returns the learning rate to use.
        progress = runner.epoch if self.by_epoch else runner.iter
        for i in range(len(self.steps)):
            if progress < self.steps[i]:
                return self.lrs[i]
        # keep the last learning rate once all milestones have passed
        return self.lrs[-1]
```
2. Modify your config:
In your config file, replace the original `lr_config` with:
```python
lr_config = dict(policy='RelativeStep', steps=[20, 40, 60], lrs=[0.1, 0.01, 0.001])
```
More examples can be found in [mmcv](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py).
# Tutorial 6: Exporting a model to ONNX
Open Neural Network Exchange [(ONNX)](https://onnx.ai/) is an open ecosystem that empowers AI developers to choose the right tools as their project evolves.
<!-- TOC -->
- [Supported Models](#supported-models)
- [Usage](#usage)
- [Prerequisite](#prerequisite)
- [Recognizers](#recognizers)
- [Localizers](#localizers)
<!-- TOC -->
## Supported Models
So far, our codebase supports ONNX export for PyTorch models trained with MMAction2. The supported models are:
- I3D
- TSN
- TIN
- TSM
- R(2+1)D
- SLOWFAST
- SLOWONLY
- BMN
- BSN(tem, pem)
## Usage
For simple exporting, you can use the [script](/tools/deployment/pytorch2onnx.py) here. Note that the packages `onnx` and `onnxruntime` are required for verification after exporting.
### Prerequisite
First, install onnx and onnxruntime.
```shell
pip install onnx onnxruntime
```
We provide a Python script to export the PyTorch model trained by MMAction2 to ONNX.
```shell
python tools/deployment/pytorch2onnx.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--shape ${SHAPE}] \
[--verify] [--show] [--output-file ${OUTPUT_FILE}] [--is-localizer] [--opset-version ${VERSION}]
```
Optional arguments:
- `--shape`: The shape of the input tensor to the model. For a 2D recognizer (e.g. TSN), the input should be `$batch $clip $channel $height $width` (e.g. `1 1 3 224 224`); for a 3D recognizer (e.g. I3D), the input should be `$batch $clip $channel $time $height $width` (e.g. `1 1 3 32 224 224`); for a localizer such as BSN, the input of each module is different, please check its `forward` function. If not specified, it will be set to `1 1 3 224 224`.
- `--verify`: Determines whether to verify the exported model, i.e. check that it runs and that its outputs match the PyTorch model numerically. If not specified, it will be set to `False`.
- `--show`: Determines whether to print the architecture of the exported model. If not specified, it will be set to `False`.
- `--output-file`: The output onnx model name. If not specified, it will be set to `tmp.onnx`.
- `--is-localizer`: Determines whether the model to be exported is a localizer. If not specified, it will be set to `False`.
- `--opset-version`: Determines the opset version of ONNX; we recommend a recent version such as 11 for compatibility. If not specified, it will be set to `11`.
- `--softmax`: Determines whether to add a softmax layer at the end of recognizers. If not specified, it will be set to `False`. For now, localizers are not supported.
### Recognizers
For recognizers, please run:
```shell
python tools/deployment/pytorch2onnx.py $CONFIG_PATH $CHECKPOINT_PATH --shape $SHAPE --verify
```
### Localizers
For localizers, please run:
```shell
python tools/deployment/pytorch2onnx.py $CONFIG_PATH $CHECKPOINT_PATH --is-localizer --shape $SHAPE --verify
```
Please file an issue if you discover any checkpoints that are not exported correctly or suffer some loss in accuracy.
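Beyond the built-in `--verify` flag, you can also load the exported file yourself with `onnxruntime`. A minimal sketch, assuming the default output name `tmp.onnx` and a 2D-recognizer input shape of `1 1 3 224 224`:
```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('tmp.onnx')
input_name = sess.get_inputs()[0].name
dummy = np.random.rand(1, 1, 3, 224, 224).astype(np.float32)
scores = sess.run(None, {input_name: dummy})[0]
print(scores.shape)  # e.g. (1, num_classes)
```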
# Tutorial 7: Customize Runtime Settings
In this tutorial, we will introduce how to customize optimization methods, training schedules, workflow and hooks when running your own settings for the project.
<!-- TOC -->
- [Customize Optimization Methods](#customize-optimization-methods)
- [Customize optimizer supported by PyTorch](#customize-optimizer-supported-by-pytorch)
- [Customize self-implemented optimizer](#customize-self-implemented-optimizer)
- [1. Define a new optimizer](#1-define-a-new-optimizer)
- [2. Add the optimizer to registry](#2-add-the-optimizer-to-registry)
- [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file)
- [Customize optimizer constructor](#customize-optimizer-constructor)
- [Additional settings](#additional-settings)
- [Customize Training Schedules](#customize-training-schedules)
- [Customize Workflow](#customize-workflow)
- [Customize Hooks](#customize-hooks)
- [Customize self-implemented hooks](#customize-self-implemented-hooks)
- [1. Implement a new hook](#1-implement-a-new-hook)
- [2. Register the new hook](#2-register-the-new-hook)
- [3. Modify the config](#3-modify-the-config)
- [Use hooks implemented in MMCV](#use-hooks-implemented-in-mmcv)
- [Modify default runtime hooks](#modify-default-runtime-hooks)
- [Checkpoint config](#checkpoint-config)
- [Log config](#log-config)
- [Evaluation config](#evaluation-config)
<!-- TOC -->
## Customize Optimization Methods
### Customize optimizer supported by PyTorch
We already support all the optimizers implemented by PyTorch; the only modification needed is to change the `optimizer` field of config files.
For example, if you want to use `Adam`, the modification could be as follows.
```python
optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001)
```
To modify the learning rate of the model, users only need to modify the `lr` field of the optimizer config.
The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
For example, if you want to use `Adam` with settings like `torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)` in PyTorch,
the modification could be set as follows.
```python
optimizer = dict(type='Adam', lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
```
### Customize self-implemented optimizer
#### 1. Define a new optimizer
A customized optimizer could be defined as follows.
Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
You need to create a new directory named `mmaction/core/optimizer`,
and then implement the new optimizer in a file, e.g., in `mmaction/core/optimizer/my_optimizer.py`:
```python
from mmcv.runner import OPTIMIZERS
from torch.optim import Optimizer


@OPTIMIZERS.register_module()
class MyOptimizer(Optimizer):

    def __init__(self, a, b, c):
        # implement the optimizer here
        pass
```
#### 2. Add the optimizer to registry
To make the registry find the module defined above, it should first be imported into the main namespace. There are two ways to achieve this.
- Modify `mmaction/core/optimizer/__init__.py` to import it.
The newly defined module should be imported in `mmaction/core/optimizer/__init__.py` so that the registry will
find the new module and add it:
```python
from .my_optimizer import MyOptimizer
```
- Use `custom_imports` in the config to manually import it
```python
custom_imports = dict(imports=['mmaction.core.optimizer.my_optimizer'], allow_failed_imports=False)
```
The module `mmaction.core.optimizer.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered.
Note that only the package containing the class `MyOptimizer` should be imported. `mmaction.core.optimizer.my_optimizer.MyOptimizer` **cannot** be imported directly.
#### 3. Specify the optimizer in the config file
Then you can use `MyOptimizer` in `optimizer` field of config files.
In the configs, the optimizers are defined by the field `optimizer` like the following:
```python
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
```
To use your own optimizer, the field can be changed to
```python
optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)
```
### Customize optimizer constructor
Some models may have parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
Users can do such fine-grained parameter tuning by customizing the optimizer constructor.
```python
import torch
from mmcv.runner.optimizer import OPTIMIZER_BUILDERS


@OPTIMIZER_BUILDERS.register_module()
class MyOptimizerConstructor:

    def __init__(self, optimizer_cfg, paramwise_cfg=None):
        self.optimizer_cfg = optimizer_cfg
        self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg

    def __call__(self, model):
        # minimal example: look up the optimizer class in torch.optim
        cfg = self.optimizer_cfg.copy()
        optimizer_cls = getattr(torch.optim, cfg.pop('type'))
        return optimizer_cls(model.parameters(), **cfg)
```
The default optimizer constructor is implemented [here](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/optimizer/default_constructor.py#L11),
which could also serve as a template for a new optimizer constructor.
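In the config, a custom constructor is then selected through the `constructor` key of the `optimizer` field, for example (a sketch; the `paramwise_cfg` contents depend on what your constructor expects):
```python
optimizer = dict(
    type='SGD',
    constructor='MyOptimizerConstructor',
    paramwise_cfg=dict(),  # placeholder: options consumed by the constructor
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001)
```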
### Additional settings
Tricks not implemented by the optimizer should be implemented through the optimizer constructor (e.g., setting parameter-wise learning rates) or through hooks.
We list some common settings that could stabilize or accelerate training. Feel free to create a PR or an issue for more settings.
- __Use gradient clip to stabilize training__:
Some models need gradient clipping to stabilize the training process. An example is shown below:
```python
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
```
- __Use momentum schedule to accelerate model convergence__:
We support a momentum scheduler that modifies the model's momentum according to the learning rate, which can make the model converge faster.
The momentum scheduler is usually used together with the LR scheduler; for example, the following config is used in 3D detection to accelerate convergence.
For more details, please refer to the implementation of [CyclicLrUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327)
and [CyclicMomentumUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130).
```python
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4,
)
```
## Customize Training Schedules
We use a step learning rate schedule with default values in config files; this calls [`StepLrUpdaterHook`](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L153) in MMCV.
We support many other learning rate schedules [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py), such as the `CosineAnnealing` and `Poly` schedules. Here are some examples:
- Poly schedule:
```python
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
```
- CosineAnnealing schedule:
```python
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 10,
min_lr_ratio=1e-5)
```
## Customize Workflow
By default, we recommend using `EvalHook` to do evaluation after each training epoch, but you can still use the `val` workflow as an alternative.
Workflow is a list of (phase, epochs) to specify the running order and epochs. By default it is set to be
```python
workflow = [('train', 1)]
```
which means running 1 epoch for training.
Sometimes users may want to check some metrics (e.g. loss, accuracy) of the model on the validation set.
In such a case, we can set the workflow as
```python
[('train', 1), ('val', 1)]
```
so that 1 epoch for training and 1 epoch for validation will be run iteratively.
:::{note}
1. The parameters of the model will not be updated during the val epoch.
2. The keyword `total_epochs` in the config only controls the number of training epochs and will not affect the validation workflow.
3. Workflows `[('train', 1), ('val', 1)]` and `[('train', 1)]` will not change the behavior of `EvalHook`, because `EvalHook` is called by `after_train_epoch` and the validation workflow only affects hooks that are called through `after_val_epoch`.
Therefore, the only difference between `[('train', 1), ('val', 1)]` and `[('train', 1)]` is that the runner will calculate losses on validation set after each training epoch.
:::
## Customize Hooks
### Customize self-implemented hooks
#### 1. Implement a new hook
Here we give an example of creating a new hook in MMAction2 and using it in training.
```python
from mmcv.runner import HOOKS, Hook


@HOOKS.register_module()
class MyHook(Hook):

    def __init__(self, a, b):
        pass

    def before_run(self, runner):
        pass

    def after_run(self, runner):
        pass

    def before_epoch(self, runner):
        pass

    def after_epoch(self, runner):
        pass

    def before_iter(self, runner):
        pass

    def after_iter(self, runner):
        pass
```
Depending on the functionality of the hook, the users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_epoch`, `after_epoch`, `before_iter`, and `after_iter`.
#### 2. Register the new hook
Then we need to make sure `MyHook` is imported. Assuming the file is `mmaction/core/utils/my_hook.py`, there are two ways to do that:
- Modify `mmaction/core/utils/__init__.py` to import it.
The newly defined module should be imported in `mmaction/core/utils/__init__.py` so that the registry will
find the new module and add it:
```python
from .my_hook import MyHook
```
- Use `custom_imports` in the config to manually import it
```python
custom_imports = dict(imports=['mmaction.core.utils.my_hook'], allow_failed_imports=False)
```
#### 3. Modify the config
```python
custom_hooks = [
dict(type='MyHook', a=a_value, b=b_value)
]
```
You can also set the priority of the hook by adding the key `priority` with value `'NORMAL'` or `'HIGHEST'`, as below
```python
custom_hooks = [
dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
]
```
By default the hook's priority is set as `NORMAL` during registration.
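As a concrete, purely illustrative example, the sketch below registers a hypothetical hook that logs the current learning rate after every training epoch:
```python
from mmcv.runner import HOOKS, Hook


@HOOKS.register_module()
class LrLoggerHook(Hook):
    """Hypothetical hook: log the learning rate after each training epoch."""

    def after_train_epoch(self, runner):
        # `runner.current_lr()` returns one value per parameter group
        runner.logger.info(
            f'epoch {runner.epoch + 1}: lr = {runner.current_lr()}')
```
Once its module is imported as described above, it is enabled like any other custom hook, e.g. `custom_hooks = [dict(type='LrLoggerHook')]`.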
### Use hooks implemented in MMCV
If the hook is already implemented in MMCV, you can directly modify the config to use the hook as below
```python
custom_hooks = [
dict(type='MMCVHook', a=a_value, b=b_value, priority='NORMAL')
]
```
### Modify default runtime hooks
There are some common hooks that are not registered through `custom_hooks` but are registered by default when importing MMCV. They are:
- log_config
- checkpoint_config
- evaluation
- lr_config
- optimizer_config
- momentum_config
Among these hooks, only the logger hook has `VERY_LOW` priority; the others have `NORMAL` priority.
The above-mentioned tutorials already cover how to modify `optimizer_config`, `momentum_config`, and `lr_config`.
Here we show what we can do with `log_config`, `checkpoint_config`, and `evaluation`.
#### Checkpoint config
The MMCV runner will use `checkpoint_config` to initialize [`CheckpointHook`](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/hooks/checkpoint.py#L9).
```python
checkpoint_config = dict(interval=1)
```
Users can set `max_keep_ckpts` to save only a small number of checkpoints, or decide whether to store the optimizer's state dict via `save_optimizer`.
More details of the arguments are [here](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.CheckpointHook)
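For instance, a checkpoint config that keeps only the three latest checkpoints and skips the optimizer state could look like the following sketch (the values are examples):
```python
checkpoint_config = dict(interval=1, max_keep_ckpts=3, save_optimizer=False)
```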
#### Log config
The `log_config` wraps multiple logger hooks and enables setting intervals. Currently MMCV supports `WandbLoggerHook`, `MlflowLoggerHook`, and `TensorboardLoggerHook`.
The detail usages can be found in the [doc](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook).
```python
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
```
#### Evaluation config
The config of `evaluation` will be used to initialize the [`EvalHook`](https://github.com/open-mmlab/mmaction2/blob/master/mmaction/core/evaluation/eval_hooks.py#L12).
Except for the key `interval`, other arguments such as `metrics` will be passed to `dataset.evaluate()`.
```python
evaluation = dict(interval=1, metrics=['top_k_accuracy', 'mean_class_accuracy'])
```
Apart from training/testing scripts, we provide lots of useful tools under the `tools/` directory.
## Useful Tools Link
<!-- TOC -->
- [Useful Tools Link](#useful-tools-link)
- [Log Analysis](#log-analysis)
- [Model Complexity](#model-complexity)
- [Model Conversion](#model-conversion)
- [MMAction2 model to ONNX (experimental)](#mmaction2-model-to-onnx-experimental)
- [Prepare a model for publishing](#prepare-a-model-for-publishing)
- [Model Serving](#model-serving)
- [1. Convert model from MMAction2 to TorchServe](#1-convert-model-from-mmaction2-to-torchserve)
- [2. Build `mmaction-serve` docker image](#2-build-mmaction-serve-docker-image)
- [3. Launch `mmaction-serve`](#3-launch-mmaction-serve)
- [4. Test deployment](#4-test-deployment)
- [Miscellaneous](#miscellaneous)
- [Evaluating a metric](#evaluating-a-metric)
- [Print the entire config](#print-the-entire-config)
- [Check videos](#check-videos)
<!-- TOC -->
## Log Analysis
`tools/analysis/analyze_logs.py` plots loss/top-k acc curves given a training log file. Run `pip install seaborn` first to install the dependency.
![acc_curve_image](https://github.com/open-mmlab/mmaction2/raw/master/resources/acc_curve.png)
```shell
python tools/analysis/analyze_logs.py plot_curve ${JSON_LOGS} [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
```
Examples:
- Plot the classification loss of some run.
```shell
python tools/analysis/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
```
- Plot the top-1 acc and top-5 acc of some run, and save the figure to a pdf.
```shell
python tools/analysis/analyze_logs.py plot_curve log.json --keys top1_acc top5_acc --out results.pdf
```
- Compare the top-1 acc of two runs in the same figure.
```shell
python tools/analysis/analyze_logs.py plot_curve log1.json log2.json --keys top1_acc --legend run1 run2
```
You can also compute the average training speed.
```shell
python tools/analysis/analyze_logs.py cal_train_time ${JSON_LOGS} [--include-outliers]
```
- Compute the average training speed for a config file.
```shell
python tools/analysis/analyze_logs.py cal_train_time work_dirs/some_exp/20200422_153324.log.json
```
The output is expected to be like the following.
```text
-----Analyze train time of work_dirs/some_exp/20200422_153324.log.json-----
slowest epoch 60, average time is 0.9736
fastest epoch 18, average time is 0.9001
time std over epochs is 0.0177
average iter time: 0.9330 s/iter
```
## Model Complexity
`/tools/analysis/get_flops.py` is a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model.
```shell
python tools/analysis/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}]
```
You will get a result like this
```text
==============================
Input shape: (1, 3, 32, 340, 256)
Flops: 37.1 GMac
Params: 28.04 M
==============================
```
:::{note}
This tool is still experimental and we do not guarantee that the number is absolutely correct.
You may use the result for simple comparisons, but double check it before you adopt it in technical reports or papers.
(1) FLOPs are related to the input shape while parameters are not. The default input shape is (1, 3, 340, 256) for 2D recognizer, (1, 3, 32, 340, 256) for 3D recognizer.
(2) Some operators are not counted into FLOPs like GN and custom operators. Refer to [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/flops_counter.py) for details.
:::
## Model Conversion
### MMAction2 model to ONNX (experimental)
`/tools/deployment/pytorch2onnx.py` is a script to convert model to [ONNX](https://github.com/onnx/onnx) format.
It also supports comparing the outputs of the PyTorch and ONNX models for verification.
Run `pip install onnx onnxruntime` first to install the dependency.
Please note that a softmax layer could be added for recognizers by `--softmax` option, in order to get predictions in range `[0, 1]`.
- For recognizers, please run:
```shell
python tools/deployment/pytorch2onnx.py $CONFIG_PATH $CHECKPOINT_PATH --shape $SHAPE --verify
```
- For localizers, please run:
```shell
python tools/deployment/pytorch2onnx.py $CONFIG_PATH $CHECKPOINT_PATH --is-localizer --shape $SHAPE --verify
```
### Prepare a model for publishing
`tools/deployment/publish_model.py` helps users prepare their model for publishing.
Before you upload a model to AWS, you may want to:
(1) convert the model weights to CPU tensors,
(2) delete the optimizer states, and
(3) compute the hash of the checkpoint file and append the hash id to the filename.
```shell
python tools/deployment/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
```
E.g.,
```shell
python tools/deployment/publish_model.py work_dirs/tsn_r50_1x1x3_100e_kinetics400_rgb/latest.pth tsn_r50_1x1x3_100e_kinetics400_rgb.pth
```
The final output filename will be `tsn_r50_1x1x3_100e_kinetics400_rgb-{hash id}.pth`.
## Model Serving
To serve an `MMAction2` model with [`TorchServe`](https://pytorch.org/serve/), you can follow these steps:
### 1. Convert model from MMAction2 to TorchServe
```shell
python tools/deployment/mmaction2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
--output_folder ${MODEL_STORE} \
--model-name ${MODEL_NAME} \
--label-file ${LABEL_FILE}
```
### 2. Build `mmaction-serve` docker image
```shell
DOCKER_BUILDKIT=1 docker build -t mmaction-serve:latest docker/serve/
```
### 3. Launch `mmaction-serve`
Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment).
Example:
```shell
docker run --rm \
--cpus 8 \
--gpus device=0 \
-p8080:8080 -p8081:8081 -p8082:8082 \
--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
mmaction-serve:latest
```
**Note**: ${MODEL_STORE} needs to be an absolute path.
[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md) about the Inference (8080), Management (8081) and Metrics (8082) APIs.
### 4. Test deployment
```shell
# Assume you are under the directory `mmaction2`
curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T demo/demo.mp4
```
You should obtain a response similar to:
```json
{
"arm wrestling": 1.0,
"rock scissors paper": 4.962051880497143e-10,
"shaking hands": 3.9761663406245873e-10,
"massaging feet": 1.1924419784925533e-10,
"stretching leg": 1.0601879096849842e-10
}
```
## Miscellaneous
### Evaluating a metric
`tools/analysis/eval_metric.py` evaluates certain metrics of the results saved in a file according to a config file.
The result file is produced by `tools/test.py` with the argument `--out ${RESULT_FILE}`, and stores the final output of the whole model.
```shell
python tools/analysis/eval_metric.py ${CONFIG_FILE} ${RESULT_FILE} [--eval ${EVAL_METRICS}] [--cfg-options ${CFG_OPTIONS}] [--eval-options ${EVAL_OPTIONS}]
```
### Print the entire config
`tools/analysis/print_config.py` prints the whole config verbatim, expanding all its imports.
```shell
python tools/analysis/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
```
### Check videos
`tools/analysis/check_videos.py` uses the specified video decoder to iterate over all samples specified by the input configuration file, looks for invalid videos (corrupted or missing), and saves the corresponding file paths to an output file. Please note that after deleting invalid videos, users need to regenerate the video file list.
```shell
python tools/analysis/check_videos.py ${CONFIG} [-h] [--options OPTIONS [OPTIONS ...]] [--cfg-options CFG_OPTIONS [CFG_OPTIONS ...]] [--output-file OUTPUT_FILE] [--split SPLIT] [--decoder DECODER] [--num-processes NUM_PROCESSES] [--remove-corrupted-videos]
```
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
mmaction.apis
-------------
.. automodule:: mmaction.apis
:members:
mmaction.core
-------------
optimizer
^^^^^^^^^
.. automodule:: mmaction.core.optimizer
:members:
evaluation
^^^^^^^^^^
.. automodule:: mmaction.core.evaluation
:members:
scheduler
^^^^^^^^^
.. automodule:: mmaction.core.scheduler
:members:
mmaction.localization
---------------------
localization
^^^^^^^^^^^^
.. automodule:: mmaction.localization
:members:
mmaction.models
---------------
models
^^^^^^
.. automodule:: mmaction.models
:members:
recognizers
^^^^^^^^^^^
.. automodule:: mmaction.models.recognizers
:members:
localizers
^^^^^^^^^^
.. automodule:: mmaction.models.localizers
:members:
common
^^^^^^
.. automodule:: mmaction.models.common
:members:
backbones
^^^^^^^^^
.. automodule:: mmaction.models.backbones
:members:
heads
^^^^^
.. automodule:: mmaction.models.heads
:members:
necks
^^^^^
.. automodule:: mmaction.models.necks
:members:
losses
^^^^^^
.. automodule:: mmaction.models.losses
:members:
mmaction.datasets
-----------------
datasets
^^^^^^^^
.. automodule:: mmaction.datasets
:members:
pipelines
^^^^^^^^^
.. automodule:: mmaction.datasets.pipelines
:members:
samplers
^^^^^^^^
.. automodule:: mmaction.datasets.samplers
:members:
mmaction.utils
--------------
.. automodule:: mmaction.utils
:members:
# Benchmark
Here we compare the training speed of MMAction2 with other popular frameworks and official open-source implementations.
## Settings
### Hardware
- 8 NVIDIA Tesla V100 (32G) GPUs
- Intel(R) Xeon(R) Gold 6146 CPU @ 3.20GHz
### Software environment
- Python 3.7
- PyTorch 1.4
- CUDA 10.1
- CUDNN 7.6.03
- NCCL 2.4.08
### Metrics
The time we measure is the average training time for one iteration, including data processing and model training.
The training speed is measured in s/iter; the lower, the better. Note that we skip the first 50 iterations, since they may include device warm-up time.
### Comparison rules
We compare the average training time per iteration of MMAction2 with other video understanding toolboxes, using the same data and model settings. The other codebases included are:
- MMAction: commit id [7f3490d](https://github.com/open-mmlab/mmaction/tree/7f3490d3db6a67fe7b87bfef238b757403b670e3)(1/5/2020)
- Temporal-Shift-Module: commit id [8d53d6f](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd)(5/5/2020)
- PySlowFast: commit id [8299c98](https://github.com/facebookresearch/SlowFast/tree/8299c9862f83a067fa7114ce98120ae1568a83ec)(7/7/2020)
- BSN(boundary sensitive network): commit id [f13707f](https://github.com/wzmsltw/BSN-boundary-sensitive-network/tree/f13707fbc362486e93178c39f9c4d398afe2cb2f)(12/12/2018)
- BMN(boundary matching network): commit id [45d0514](https://github.com/JJBOY/BMN-Boundary-Matching-Network/tree/45d05146822b85ca672b65f3d030509583d0135a)(17/10/2019)
To ensure a fair comparison, all experiments are conducted with the same hardware environment and data.
The raw-frame dataset used is generated by the [data preparation tool](/tools/data/kinetics/README.md);
the video dataset used is generated by [this script](/tools/data/resize_videos.py), which produces videos that are fast to decode ("short side 256, dense-keyframe encoded").
As shown in the tables below, a significant speed-up can be observed compared with normal short-side-256 videos, especially when frames are sampled very sparsely, as in [TSN](/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py).
## Main Results
### Recognizers
| Model | Input | IO backend | Batch size x GPUs | MMAction2 (s/iter) | GPU memory (GB) | MMAction (s/iter) | GPU memory (GB) | Temporal-Shift-Module (s/iter) | GPU memory (GB) | PySlowFast (s/iter) | GPU memory (GB) |
| :------------------------------------------------------------------------------------------ | :----------------------: | :-------: | :---------------: | :-------------------------------------------------------------------------------------------------------------------------: | :---------------: | :------------------------------------------------------------------------------------------------------------------: | :---------------: | :-------------------------------------------------------------------------------------------------------------------------------: | :---------------: | :--------------------------------------------------------------------------------------------------------------------: | :---------------: |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p rawframes | Memcached | 32x8 | **[0.32](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_rawframes_memcahed_32x8.zip)** | 8.1 | [0.38](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction/tsn_256p_rawframes_memcached_32x8.zip) | 8.1 | [0.42](https://download.openmmlab.com/mmaction/benchmark/recognition/temporal_shift_module/tsn_256p_rawframes_memcached_32x8.zip) | 10.5 | x | x |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p videos | Disk | 32x8 | **[1.42](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_videos_disk_32x8.zip)** | 8.1 | x | x | x | x | TODO | TODO |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 32x8 | **[0.61](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_fast_videos_disk_32x8.zip)** | 8.1 | x | x | x | x | TODO | TODO |
| [I3D heavy](/configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.34](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_heavy_256p_videos_disk_8x8.zip)** | 4.6 | x | x | x | x | [0.44](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_i3d_r50_8x8_video.log) | 4.6 |
| [I3D heavy](/configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.35](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_heavy_256p_fast_videos_disk_8x8.zip)** | 4.6 | x | x | x | x | [0.36](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_i3d_r50_8x8_fast_video.log) | 4.6 |
| [I3D](/configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py) | 256p rawframes | Memcached | 8x8 | **[0.43](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_256p_rawframes_memcahed_8x8.zip)** | 5.0 | [0.56](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction/i3d_256p_rawframes_memcached_8x8.zip) | 5.0 | x | x | x | x |
| [TSM](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 256p rawframes | Memcached | 8x8 | **[0.31](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsm_256p_rawframes_memcahed_8x8.zip)** | 6.9 | x | x | [0.41](https://download.openmmlab.com/mmaction/benchmark/recognition/temporal_shift_module/tsm_256p_rawframes_memcached_8x8.zip) | 9.1 | x | x |
| [Slowonly](/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.32](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowonly_256p_videos_disk_8x8.zip)** | 3.1 | TODO | TODO | x | x | [0.34](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowonly_r50_4x16_video.log) | 3.4 |
| [Slowonly](/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.25](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowonly_256p_fast_videos_disk_8x8.zip)** | 3.1 | TODO | TODO | x | x | [0.28](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowonly_r50_4x16_fast_video.log) | 3.4 |
| [Slowfast](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.69](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowfast_256p_videos_disk_8x8.zip)** | 6.1 | x | x | x | x | [1.04](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowfast_r50_4x16_video.log) | 7.0 |
| [Slowfast](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.68](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowfast_256p_fast_videos_disk_8x8.zip)** | 6.1 | x | x | x | x | [0.96](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowfast_r50_4x16_fast_video.log) | 7.0 |
| [R(2+1)D](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.45](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/r2plus1d_256p_videos_disk_8x8.zip)** | 5.1 | x | x | x | x | x | x |
| [R(2+1)D](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.44](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/r2plus1d_256p_fast_videos_disk_8x8.zip)** | 5.1 | x | x | x | x | x | x |
### Temporal Action Localizers
| Model | MMAction2 (s/iter) | BSN(boundary sensitive network) (s/iter) | BMN(boundary matching network) (s/iter) |
| :------------------------------------------------------------------------------------------------------------------ | :-----------------------: | :--------------------------------------: | :-------------------------------------: |
| BSN ([TEM + PEM + PGM](/configs/localization/bsn)) | **0.074(TEM)+0.040(PEM)** | 0.101(TEM)+0.040(PEM) | x |
| BMN ([bmn_400x100_2x8_9e_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py)) | **3.27** | x | 3.30 |
## Details of Comparison
### TSN
- **MMAction2**
```shell
# train with raw frames as input
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_tsn configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_tsn_rawframes
# train with videos as input
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_tsn configs/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_tsn_video
```
- **MMAction**
```shell
python -u tools/train_recognizer.py configs/TSN/tsn_kinetics400_2d_rgb_r50_seg3_f1s1.py
```
- **Temporal-Shift-Module**
```shell
python main.py kinetics RGB --arch resnet50 --num_segments 3 --gd 20 --lr 0.02 --wd 1e-4 --lr_steps 20 40 --epochs 1 --batch-size 256 -j 32 --dropout 0.5 --consensus_type=avg --eval-freq=10 --npb --print-freq 1
```
### I3D
- **MMAction2**
```shell
# train with raw frames as input
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_i3d configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_i3d_rawframes
# train with videos as input
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_i3d configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_i3d_video
```
- **MMAction**
```shell
python -u tools/train_recognizer.py configs/I3D_RGB/i3d_kinetics400_3d_rgb_r50_c3d_inflate3x1x1_seg1_f32s2.py
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/I3D_8x8_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_i3d_r50_8x8_video.log
```
The corresponding result can be reproduced by writing a simple script to parse the 'time_diff' field of the log file.
### SlowFast
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_slowfast configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py --work-dir work_dirs/benchmark_slowfast_video
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/SLOWFAST_4x16_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_slowfast_r50_4x16_video.log
```
The corresponding result can be reproduced by writing a simple script to parse the 'time_diff' field of the log file.
### SlowOnly
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_slowonly configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py --work-dir work_dirs/benchmark_slowonly_video
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/SLOW_4x16_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_slowonly_r50_4x16_video.log
```
The corresponding result can be reproduced by writing a simple script to parse the 'time_diff' field of the log file.
### R2plus1D
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTATION_NAME} benchmark_r2plus1d configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py --work-dir work_dirs/benchmark_r2plus1d_video
```
# Copyright (c) OpenMMLab. All rights reserved.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import subprocess
import sys
import pytorch_sphinx_theme
sys.path.insert(0, os.path.abspath('..'))
# -- Project information -----------------------------------------------------
project = 'MMAction2'
copyright = '2020, OpenMMLab'
author = 'MMAction2 Authors'
version_file = '../mmaction/version.py'
def get_version():
    with open(version_file, 'r') as f:
        exec(compile(f.read(), version_file, 'exec'))
    return locals()['__version__']
# The full version, including alpha/beta/rc tags
release = get_version()
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
]
# numpy and torch are required
autodoc_mock_imports = ['mmaction.version', 'PIL']
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_is_regexp = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'pytorch_sphinx_theme'
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
html_theme_options = {
# 'logo_url': 'https://mmocr.readthedocs.io/en/latest/',
'menu': [
{
'name':
'教程',
'url':
'https://colab.research.google.com/github/'
'open-mmlab/mmaction2/blob/master/demo/'
'mmaction2_tutorial_zh-CN.ipynb'
},
{
'name': 'GitHub',
'url': 'https://github.com/open-mmlab/mmaction2'
},
{
'name':
'上游代码库',
'children': [
{
'name': 'MMCV',
'url': 'https://github.com/open-mmlab/mmcv',
'description': '计算机视觉基础库'
},
{
'name': 'MMClassification',
'url': 'https://github.com/open-mmlab/mmclassification',
'description': '图像分类代码库'
},
{
'name': 'MMDetection',
'url': 'https://github.com/open-mmlab/mmdetection',
'description': '物体检测代码库'
},
]
},
],
# Specify the language of shared menu
'menu_lang':
'cn'
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/readthedocs.css']
myst_enable_extensions = ['colon_fence']
myst_heading_anchors = 3
language = 'zh_CN'
master_doc = 'index'
def builder_inited_handler(app):
    subprocess.run(['./merge_docs.sh'])
    subprocess.run(['./stat.py'])


def setup(app):
    app.connect('builder-inited', builder_inited_handler)
# Data Preparation
This document provides some guidelines on data preparation for MMAction2.
<!-- TOC -->
- [Notes on Video Data Format](#notes-on-video-data-format)
- [Getting Data](#getting-data)
- [Prepare videos](#prepare-videos)
- [Extract frames](#extract-frames)
- [Alternative to denseflow](#alternative-to-denseflow)
- [Generate file list](#generate-file-list)
- [Prepare audio](#prepare-audio)
<!-- TOC -->
## Notes on Video Data Format
MMAction2 supports two types of data formats: raw frames and videos. The former was widely used in earlier projects such as TSN.
Processing raw frames is very fast when they can be stored on SSDs, but for large-scale datasets the raw frames take up a huge amount of disk space.
(For example, the newest edition of [Kinetics](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) has 650K videos, whose raw frames take up several TBs of disk space.)
Videos save a lot of space, but they must be decoded at run time, which is computationally expensive.
To accelerate video decoding, MMAction2 supports several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc.
## Getting Data
This section introduces how to build a custom dataset.
Similar to the datasets above, it is recommended to put the data under `$MMACTION2/data/$DATASET`.
### Prepare videos
Please refer to the official website and/or the official scripts to prepare the videos.
Note that the videos should be arranged in one of the following two folder structures:
(1) A two-level directory of the form `${CLASS_NAME}/${VIDEO_ID}`, which is recommended for action recognition datasets (such as UCF101 and Kinetics).
(2) A single-level directory, which is recommended for action detection datasets and multi-label datasets (such as THUMOS14).
### Extract frames
To extract both frames and optical flow, you can use the [denseflow](https://github.com/open-mmlab/denseflow) tool provided by OpenMMLab.
Since different frame extraction tools may produce different numbers of frames, it is recommended to use the same tool to extract both RGB frames and optical flow, to avoid a mismatch between them.
```shell
python build_rawframes.py ${SRC_FOLDER} ${OUT_FOLDER} [--task ${TASK}] [--level ${LEVEL}] \
[--num-worker ${NUM_WORKER}] [--flow-type ${FLOW_TYPE}] [--out-format ${OUT_FORMAT}] \
[--ext ${EXT}] [--new-width ${NEW_WIDTH}] [--new-height ${NEW_HEIGHT}] [--new-short ${NEW_SHORT}] \
[--resume] [--use-opencv] [--mixed-ext]
```
- `SRC_FOLDER`: the folder of the source videos
- `OUT_FOLDER`: the root folder where the extracted frames and optical flow are stored
- `TASK`: the extraction task, indicating whether to extract frames, optical flow, or both; allowed choices are `rgb`, `flow`, `both`
- `LEVEL`: the directory level; 1 for a single-level directory, 2 for a two-level directory
- `NUM_WORKER`: the number of workers used to extract raw frames
- `FLOW_TYPE`: the type of optical flow to extract, e.g. `None`, `tvl1`, `warp_tvl1`, `farn`, `brox`
- `OUT_FORMAT`: the output format of the extracted frames, e.g. `jpg`, `h5`, `png`
- `EXT`: the video file extension, e.g. `avi`, `mp4`
- `NEW_WIDTH`: the width of the resized output images
- `NEW_HEIGHT`: the height of the resized output images
- `NEW_SHORT`: the short side of the resized output images (keeping the aspect ratio)
- `--resume`: whether to resume a previous optical flow extraction task or to overwrite previous outputs and extract again
- `--use-opencv`: whether to use OpenCV to extract RGB frames
- `--mixed-ext`: whether to handle video files with mixed extensions
Based on practical experience, the recommended settings are:
1. Set `$OUT_FOLDER` to a folder located on an SSD.
2. Symlink `$OUT_FOLDER` to `$MMACTION2/data/$DATASET/rawframes`.
3. Resize with `new-short` instead of `new-width` and `new-height`.
```shell
ln -s ${YOUR_FOLDER} $MMACTION2/data/$DATASET/rawframes
```
#### Alternative to denseflow
If you cannot install [denseflow](https://github.com/open-mmlab/denseflow) because of its requirements (e.g. the NVIDIA driver version),
or if you just need a quick demo of optical flow extraction, the Python script `tools/misc/flow_extraction.py` can be used as an alternative.
It extracts RGB frames and optical flow from one or more videos. Note that since the optical flow algorithm runs on the CPU, it is much slower than denseflow.
```shell
python tools/misc/flow_extraction.py --input ${INPUT} [--prefix ${PREFIX}] [--dest ${DEST}] [--rgb-tmpl ${RGB_TMPL}] \
[--flow-tmpl ${FLOW_TMPL}] [--start-idx ${START_IDX}] [--method ${METHOD}] [--bound ${BOUND}] [--save-rgb]
```
- `INPUT`: the video(s) to extract frames from; either a single video or a video list. The video list should be a txt file containing only video filenames, without directories
- `PREFIX`: the prefix of the input videos, used when the input is a video list
- `DEST`: the location where the extracted frames are saved
- `RGB_TMPL`: the filename template of RGB frames
- `FLOW_TMPL`: the filename template of optical flow
- `START_IDX`: the start index of the extracted frames
- `METHOD`: the method used to generate optical flow
- `BOUND`: the maximum value of the optical flow
- `SAVE_RGB`: also save the extracted RGB frames
### Generate file list
MMAction2 provides convenient scripts to generate file lists. After downloading the videos (and optionally extracting the frames), you can generate the file lists with the following script.
```shell
cd $MMACTION2
python tools/data/build_file_list.py ${DATASET} ${SRC_FOLDER} [--rgb-prefix ${RGB_PREFIX}] \
[--flow-x-prefix ${FLOW_X_PREFIX}] [--flow-y-prefix ${FLOW_Y_PREFIX}] [--num-split ${NUM_SPLIT}] \
[--subset ${SUBSET}] [--level ${LEVEL}] [--format ${FORMAT}] [--out-root-path ${OUT_ROOT_PATH}] \
[--seed ${SEED}] [--shuffle]
```
- `DATASET`: the dataset to be prepared, e.g. `ucf101`, `kinetics400`, `thumos14`, `sthv1`, `sthv2`, etc.
- `SRC_FOLDER`: the directory containing the data in the corresponding format:
  - "$MMACTION2/data/$DATASET/rawframes" if `--format rawframes` is set
  - "$MMACTION2/data/$DATASET/videos" if `--format videos` is set
- `RGB_PREFIX`: the filename prefix of RGB frames.
- `FLOW_X_PREFIX`: the filename prefix of flow-x frames.
- `FLOW_Y_PREFIX`: the filename prefix of flow-y frames.
- `NUM_SPLIT`: the number of splits of the dataset.
- `SUBSET`: the subset for which the file list is generated; allowed choices are `train`, `val`, `test`.
- `LEVEL`: the number of directory levels; 1 means all videos or frame folders are located in a single directory, 2 means they are organized in per-class subdirectories.
- `FORMAT`: the source data format for which the file list is generated; allowed choices are `rawframes`, `videos`.
- `OUT_ROOT_PATH`: the root directory of the generated file lists.
- `SEED`: the random seed.
- `--shuffle`: whether to shuffle the generated file list.
Now you can refer to the [getting started guide](getting_started.md) to train and test models.
### Prepare audio
MMAction2 also provides the following script to extract audio waveforms and generate mel spectrograms.
```shell
cd $MMACTION2
python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \
[--level ${LEVEL}]
```
- `ROOT`: the root directory of the videos.
- `DST_ROOT`: the root directory where the extracted audio files are stored.
- `EXT`: the video extension, e.g. `mp4`.
- `N_WORKERS`: the number of processes used.
After the audio is successfully extracted, you can refer to the [config file](/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py) to decode audio on the fly and generate mel spectrograms. If the directory structure of the audio files is consistent with that of the frame folders, you can directly reuse the annotation files of the frame data for the audio data. The downside of on-the-fly decoding is its slow speed, so MMAction2 also provides the following script to generate mel spectrograms offline.
```shell
cd $MMACTION2
python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \
[--ext $EXT] [--num-workers $N_WORKERS] [--part $PART]
```
- `AUDIO_HOME_PATH`: the root directory of the audio files.
- `SPECTROGRAM_SAVE_PATH`: the root directory where the generated audio features are stored.
- `EXT`: the audio extension, e.g. `m4a`.
- `N_WORKERS`: the number of processes used.
- `PART`: split the whole decoding task into several parts and execute one of them, e.g. `2/5` means splitting all the data into 5 parts and decoding the 2nd one. This is useful when you have multiple machines.
The annotation files for the mel spectrogram features are the same as those for the frame folders: you can simply copy `dataset_[train/val]_list_rawframes.txt` and rename it to `dataset_[train/val]_list_audio_feature.txt`.
# Demo
## Table of Contents
- [Demo](#demo)
- [Table of Contents](#table-of-contents)
- [Predicting the Action Label of a Video](#predicting-the-action-label-of-a-video)
- [Spatio-temporal Action Detection for Videos](#spatio-temporal-action-detection-for-videos)
- [Visualizing GradCAM for an Input Video](#visualizing-gradcam-for-an-input-video)
- [Real-time Action Recognition with a Webcam](#real-time-action-recognition-with-a-webcam)
- [Sliding-window Action Recognition for Long Videos](#sliding-window-action-recognition-for-long-videos)
- [Real-time Spatio-temporal Action Detection with a Webcam](#real-time-spatio-temporal-action-detection-with-a-webcam)
- [Predicting Action Labels from Human Pose](#predicting-action-labels-from-human-pose)
- [Video Structuralization](#video-structuralization)
- [Audio-based Action Recognition](#audio-based-action-recognition)
## Predicting the Action Label of a Video
MMAction2 provides the following script to predict the action label of a video. To get action scores in the \[0, 1\] range, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file.
```shell
python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} {LABEL_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--fps {FPS}] [--font-scale {FONT_SCALE}] [--font-color {FONT_COLOR}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
Optional arguments:
- `--use-frames`: If specified, a frame directory is used as input; otherwise a video is used as input.
- `DEVICE_TYPE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `FPS`: Frame rate of the input when a frame directory is used as input. Defaults to 30.
- `FONT_SCALE`: Font scale of the text drawn on the output video. Defaults to 0.5.
- `FONT_COLOR`: Font color of the text drawn on the output video. Defaults to `white`.
- `TARGET_RESOLUTION`: Resolution of the output video; if not specified, the resolution of the input video is used.
- `RESIZE_ALGORITHM`: Interpolation method used when resizing the video. Defaults to `bicubic`.
- `OUT_FILE`: Path of the output video; if not specified, no output video is generated.
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Recognize a video with a TSN model on a cuda device:
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
2. Recognize a video with a TSN model on a cuda device, loading the checkpoint from a URL:
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt
```
3. Recognize a video with a TSN model on CPU, using pre-extracted frames as input:
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --device cpu
```
4. Recognize a video with a TSN model and save the result as an MP4 file:
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --out-filename demo/demo_out.mp4
```
5. Recognize a video with a TSN model, using pre-extracted frames as input, and save the result as a GIF file:
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --out-filename demo/demo_out.gif
```
6. Recognize a video with a TSN model, save the result as an MP4 file, and specify the output resolution and the interpolation method used for resizing:
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 340 256 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
# If either dimension of TARGET_RESOLUTION is set to -1, the aspect ratio is kept when resizing frames
# e.g. with --target-resolution 170 -1, frames of original size (340, 256) are resized to (170, 128)
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --target-resolution 170 -1 --resize-algorithm bilinear \
--out-filename demo/demo_out.mp4
```
7. Recognize a video with a TSN model, save the result as an MP4 file, and draw the text in red with a font size of 10 pixels:
```shell
# demo.mp4 and label_map_k400.txt are both from the Kinetics-400 dataset
python demo/demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
demo/demo.mp4 tools/data/kinetics/label_map_k400.txt --font-size 10 --font-color red \
--out-filename demo/demo_out.mp4
```
8. Recognize a video with a TSN model, using pre-extracted frames as input, and save the result as a GIF file with the frame rate set to 24 fps:
```shell
python demo/demo.py configs/recognition/tsn/tsn_r50_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_FRAMES/ LABEL_FILE --use-frames --fps 24 --out-filename demo/demo_out.gif
```
## Spatio-temporal Action Detection for Videos
MMAction2 provides the following script to predict the spatio-temporal action detection result of a video.
```shell
python demo/demo_spatiotemporal_det.py --video ${VIDEO_FILE} \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}]
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: Path to the spatio-temporal action detection config file.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: Path to the spatio-temporal action detection checkpoint.
- `HUMAN_DETECTION_CONFIG_FILE`: Path to the human detection config file.
- `HUMAN_DETECTION_CHECKPOINT`: Path to the human detection checkpoint.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: Human detection score threshold. Defaults to 0.9.
- `ACTION_DETECTION_SCORE_THRESHOLD`: Action detection score threshold. Defaults to 0.5.
- `LABEL_MAP`: Label map file used. Defaults to `tools/data/ava/label_map.txt`.
- `DEVICE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `OUTPUT_FILENAME`: Path of the output video. Defaults to `demo/stdet_demo.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction every N frames (to save computation). Defaults to 8.
- `OUTPUT_STEPSIZE`: Output 1 frame to the output video for every N frames of the input video. Defaults to 4. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0` must hold.
- `OUTPUT_FPS`: Frame rate of the output video. Defaults to 6.
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Use Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make a prediction every 8 frames, output 1 frame for every 4 frames of the original video, and set the frame rate of the output video to 6.
```shell
python demo/demo_spatiotemporal_det.py --video demo/demo.mp4 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 8 \
--output-stepsize 4 \
--output-fps 6
```
## Visualizing GradCAM for an Input Video
MMAction2 provides the following script to visualize GradCAM for an input video.
```shell
python demo/demo_gradcam.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} [--use-frames] \
[--device ${DEVICE_TYPE}] [--target-layer-name ${TARGET_LAYER_NAME}] [--fps {FPS}] \
[--target-resolution ${TARGET_RESOLUTION}] [--resize-algorithm {RESIZE_ALGORITHM}] [--out-filename {OUT_FILE}]
```
Optional arguments:
- `--use-frames`: If specified, a frame directory is used as input; otherwise a video is used as input.
- `DEVICE_TYPE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `TARGET_LAYER_NAME`: Name of the network layer to generate the GradCAM visualization for.
- `FPS`: Frame rate of the input when a frame directory is used as input. Defaults to 30.
- `TARGET_RESOLUTION`: Resolution of the output video; if not specified, the resolution of the input video is used.
- `RESIZE_ALGORITHM`: Interpolation method used when resizing the video. Defaults to `bilinear`.
- `OUT_FILE`: Path of the output video; if not specified, no output video is generated.
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Visualize GradCAM for an I3D model, using a video as input, and output a GIF file with a frame rate of 10:
```shell
python demo/demo_gradcam.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth demo/demo.mp4 \
--target-layer-name backbone/layer4/1/relu --fps 10 \
--out-filename demo/demo_gradcam.gif
```
2. Visualize GradCAM for a TSM model, using a video as input, and output a GIF file; this example loads the checkpoint from a URL:
```shell
python demo/demo_gradcam.py configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth \
demo/demo.mp4 --target-layer-name backbone/layer4/1/relu --out-filename demo/demo_gradcam_tsm.gif
```
## Real-time Action Recognition with a Webcam
MMAction2 provides the following script for real-time action recognition with a webcam. To get action scores in the \[0, 1\] range, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file.
```shell
python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \
[--device ${DEVICE_TYPE}] [--camera-id ${CAMERA_ID}] [--threshold ${THRESHOLD}] \
[--average-size ${AVERAGE_SIZE}] [--drawing-fps ${DRAWING_FPS}] [--inference-fps ${INFERENCE_FPS}]
```
Optional arguments:
- `DEVICE_TYPE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `CAMERA_ID`: ID of the camera device. Defaults to 0.
- `THRESHOLD`: Score threshold for action recognition; only actions with a score above the threshold are displayed. Defaults to 0.
- `AVERAGE_SIZE`: Use the average result of the latest N clips as the prediction. Defaults to 1.
- `DRAWING_FPS`: Upper bound of the frame rate for rendering results. Defaults to 20.
- `INFERENCE_FPS`: Upper bound of the frame rate for inference. Defaults to 4.
**Note**: If your hardware is good enough, increase the drawing and inference frame rates for a better experience.
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Real-time action recognition from a webcam with a TSN model on CPU, averaging the latest 5 clips as the prediction and showing action classes with a score above 0.2:
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth tools/data/kinetics/label_map_k400.txt --average-size 5 \
--threshold 0.2 --device cpu
```
2. Real-time action recognition from a webcam with a TSN model on CPU, averaging the latest 5 clips as the prediction, showing action classes with a score above 0.2, and loading the checkpoint from a URL:
```shell
python demo/webcam_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
tools/data/kinetics/label_map_k400.txt --average-size 5 --threshold 0.2 --device cpu
```
3. Real-time action recognition from a webcam with an I3D model on GPU, averaging the latest 5 clips as the prediction and showing action classes with a score above 0.2:
```shell
python demo/webcam_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth tools/data/kinetics/label_map_k400.txt \
--average-size 5 --threshold 0.2
```
**Note:** Since inference devices vary in performance, the following changes may give a better experience on your device (a lighter pipeline sketch follows the list):
1) Change the `SampleFrames` step in `test_pipeline` of the config file (especially `clip_len` and `num_clips`).
2) Change the crop type in `test_pipeline` of the config file (options include `TenCrop`, `ThreeCrop`, `CenterCrop`).
3) Lower `AVERAGE_SIZE` to speed up inference.
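Below is a hedged sketch of what a lighter `test_pipeline` for a TSN-style 2D recognizer might look like; the transform names follow MMAction2, while the concrete `clip_len`, `num_clips` and crop values are assumptions to tune for your own hardware.
```python
# A hedged sketch, not the exact pipeline of any shipped config.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
test_pipeline = [
    # Fewer clips than the default test setting to cut inference cost.
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3, test_mode=True),
    dict(type='Resize', scale=(-1, 256)),
    # CenterCrop is cheaper than TenCrop / ThreeCrop.
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
```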
## Sliding-window Action Recognition for Long Videos
MMAction2 provides the following script to predict different action classes in a long video. To get action scores in the \[0, 1\] range, make sure to set `model['test_cfg'] = dict(average_clips='prob')` in the config file.
```shell
python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \
${OUT_FILE} [--input-step ${INPUT_STEP}] [--device ${DEVICE_TYPE}] [--threshold ${THRESHOLD}]
```
Optional arguments:
- `OUT_FILE`: Path of the output video.
- `INPUT_STEP`: Select one frame out of every N frames of the video as input. Defaults to 1.
- `DEVICE_TYPE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `THRESHOLD`: Score threshold for action recognition; only actions with a score above the threshold are displayed. Defaults to 0.01.
- `STRIDE`: By default, the script makes a separate prediction for every frame, which is slow. Setting `STRIDE` speeds this up: the script then makes one prediction every `STRIDE x sample_length` frames, where `sample_length` is the temporal window the model samples from, equal to `clip_len x frame_interval`. For example, if `sample_length` is 64 frames and `STRIDE` is set to 0.5, a prediction is made every 32 frames. If `STRIDE` is 0, a prediction is made for every frame. The ideal range of `STRIDE` is (0, 1\], but values larger than 1 also work. Defaults to 0 (see the short worked example below).
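The arithmetic behind `STRIDE`, using the assumed sampling values from the bullet above:
```python
# Worked example with assumed values: clip_len=8, frame_interval=8 gives
# sample_length = 64 frames; STRIDE = 0.5 then predicts every 32 frames.
clip_len, frame_interval, stride = 8, 8, 0.5
sample_length = clip_len * frame_interval          # 64
prediction_interval = int(stride * sample_length)  # 32
print(sample_length, prediction_interval)
```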
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Predict different action classes in a long video with a TSN model on CPU, setting `INPUT_STEP` to 3 (i.e. one frame is sampled out of every 3 frames as input) and showing action classes with a score above 0.2:
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO \
--input-step 3 --device cpu --threshold 0.2
```
2. Predict different action classes in a long video with a TSN model on CPU, setting `INPUT_STEP` to 3, showing action classes with a score above 0.2, and loading the checkpoint from a URL:
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
3. Predict different action classes in a long web video (read from a URL) with a TSN model on CPU, setting `INPUT_STEP` to 3, showing action classes with a score above 0.2, and loading the checkpoint from a URL:
```shell
python demo/long_video_demo.py configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-mp4-file.mp4 \
tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO --input-step 3 --device cpu --threshold 0.2
```
4. Predict different action classes in a long video with an I3D model on GPU, using the default `INPUT_STEP` (1) and the default score threshold (0.01):
```shell
python demo/long_video_demo.py configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py \
checkpoints/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth PATH_TO_LONG_VIDEO tools/data/kinetics/label_map_k400.txt PATH_TO_SAVED_VIDEO
```
## Real-time Spatio-temporal Action Detection with a Webcam
MMAction2 provides the following script for real-time spatio-temporal action detection with a webcam.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
[--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--checkpoint ${SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--input-video] ${INPUT_VIDEO} \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--output-fps ${OUTPUT_FPS}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--show] \
[--display-height] ${DISPLAY_HEIGHT} \
[--display-width] ${DISPLAY_WIDTH} \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--clip-vis-length] ${CLIP_VIS_LENGTH}
```
Optional arguments:
- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: Path to the spatio-temporal action detection config file.
- `SPATIOTEMPORAL_ACTION_DETECTION_CHECKPOINT`: Path to the spatio-temporal action detection checkpoint.
- `ACTION_DETECTION_SCORE_THRESHOLD`: Action detection score threshold. Defaults to 0.4.
- `HUMAN_DETECTION_CONFIG_FILE`: Path to the human detection config file.
- `HUMAN_DETECTION_CHECKPOINT`: Path to the human detection checkpoint.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: Human detection score threshold. Defaults to 0.9.
- `INPUT_VIDEO`: Webcam ID or path to a local video file. Defaults to `0`.
- `LABEL_MAP`: Label map file used. Defaults to `tools/data/ava/label_map.txt`.
- `DEVICE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `OUTPUT_FPS`: Frame rate of the output video. Defaults to 15.
- `OUTPUT_FILENAME`: Path of the output video. Defaults to `None`.
- `--show`: Whether to show predictions with `cv2.imshow`.
- `DISPLAY_HEIGHT`: Height of the displayed output images. Defaults to 0.
- `DISPLAY_WIDTH`: Width of the displayed output images. Defaults to 0. If `DISPLAY_HEIGHT <= 0 and DISPLAY_WIDTH <= 0`, the output images keep the shape of the input video.
- `PREDICT_STEPSIZE`: Make a prediction every N frames (to control the computational load). Defaults to 8.
- `CLIP_VIS_LENGTH`: Number of frames over which each prediction is visualized, i.e. each prediction is drawn onto `CLIP_VIS_LENGTH` frames. Defaults to 8.
Tips:
- How to choose the value of `--output-fps`?
  - `--output-fps` should be set to the frame rate of the video reading thread.
  - The frame rate of the video reading thread is logged in the format `DEBUG:__main__:Read Thread: {duration} ms, {fps} fps`.
- How to choose the value of `--predict-stepsize`?
  - It depends on the chosen model.
  - The time to build one model input (the video reading thread) should be greater than or equal to the model inference time (the main thread).
  - Both the input building time and the inference time are logged.
  - A larger `--predict-stepsize` means a longer input building time.
  - Lower `--predict-stepsize` to increase the inference frequency and make full use of the compute resources.
Examples:
The following examples assume that the current working directory is `$MMACTION2` and the required checkpoints have been downloaded to `checkpoints/`. Users can also load the checkpoints directly from the provided URLs; the files will be downloaded to `$HOME/.cache/torch/checkpoints` by default.
1. Use Faster RCNN as the human detector and SlowOnly-8x8-R101 as the action detector. Make a prediction every 8 frames, set the frame rate of the output video to 20, and show predictions with `cv2.imshow`.
```shell
python demo/webcam_demo_spatiotemporal_det.py \
--input-video 0 \
--config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--action-score-thr 0.5 \
--label-map tools/data/ava/label_map.txt \
--predict-stepsize 40 \
--output-fps 20 \
--show
```
## Predicting Action Labels from Human Pose
MMAction2 provides the following script to predict action labels based on human pose.
```shell
python demo/demo_skeleton.py ${VIDEO_FILE} ${OUT_FILENAME} \
[--config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--label-map ${LABEL_MAP}] \
[--device ${DEVICE}] \
[--short-side] ${SHORT_SIDE}
```
Optional arguments:
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: Path to the skeleton-based action recognition config file.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: Path to the skeleton-based action recognition checkpoint.
- `HUMAN_DETECTION_CONFIG_FILE`: Path to the human detection config file.
- `HUMAN_DETECTION_CHECKPOINT`: Path to the human detection checkpoint.
- `HUMAN_DETECTION_SCORE_THRESHOLD`: Human detection score threshold. Defaults to 0.9.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: Path to the human pose estimation config file (trained on COCO-keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: Path to the human pose estimation checkpoint (trained on COCO-keypoint).
- `LABEL_MAP`: Label map file used. Defaults to `tools/data/skeleton/label_map_ntu120.txt`.
- `DEVICE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `SHORT_SIDE`: Short-side length used when extracting frames from the video. Defaults to 480.
Examples:
The following examples assume that the current working directory is `$MMACTION2`.
1. Use Faster RCNN as the human detector, HRNetw32 as the pose estimator, and PoseC3D-NTURGB+D-120-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint/slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
2. Use Faster RCNN as the human detector, HRNetw32 as the pose estimator, and STGCN-NTURGB+D-60-Xsub-keypoint as the skeleton-based action recognizer.
```shell
python demo/demo_skeleton.py demo/ntu_sample.avi demo/skeleton_demo.mp4 \
--config configs/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint.py \
--checkpoint https://download.openmmlab.com/mmaction/skeleton/stgcn/stgcn_80e_ntu60_xsub_keypoint/stgcn_80e_ntu60_xsub_keypoint-e7bb9653.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--det-score-thr 0.9 \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--label-map tools/data/skeleton/label_map_ntu120.txt
```
## Video Structuralization
MMAction2 provides the following script for video structuralization based on human pose and RGB.
```shell
python demo/demo_video_structuralize.py \
[--rgb-stdet-config ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
[--rgb-stdet-checkpoint ${RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--skeleton-stdet-checkpoint ${SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT}] \
[--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
[--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
[--pose-config ${HUMAN_POSE_ESTIMATION_CONFIG_FILE}] \
[--pose-checkpoint ${HUMAN_POSE_ESTIMATION_CHECKPOINT}] \
[--skeleton-config ${SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--skeleton-checkpoint ${SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--rgb-config ${RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE}] \
[--rgb-checkpoint ${RGB_BASED_ACTION_RECOGNITION_CHECKPOINT}] \
[--use-skeleton-stdet ${USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD}] \
[--use-skeleton-recog ${USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD}] \
[--det-score-thr ${HUMAN_DETECTION_SCORE_THRE}] \
[--action-score-thr ${ACTION_DETECTION_SCORE_THRE}] \
[--video ${VIDEO_FILE}] \
[--label-map-stdet ${LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION}] \
[--device ${DEVICE}] \
[--out-filename ${OUTPUT_FILENAME}] \
[--predict-stepsize ${PREDICT_STEPSIZE}] \
[--output-stepsize ${OUTPUT_STEPSIZE}] \
[--output-fps ${OUTPUT_FPS}] \
[--cfg-options]
```
Optional arguments:
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CONFIG_FILE`: Path to the RGB-based spatio-temporal action detection config file.
- `RGB_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: Path to the RGB-based spatio-temporal action detection checkpoint.
- `SKELETON_BASED_SPATIO_TEMPORAL_ACTION_DETECTION_CHECKPOINT`: Path to the skeleton-based spatio-temporal action detection checkpoint.
- `HUMAN_DETECTION_CONFIG_FILE`: Path to the human detection config file.
- `HUMAN_DETECTION_CHECKPOINT`: Path to the human detection checkpoint.
- `HUMAN_POSE_ESTIMATION_CONFIG_FILE`: Path to the human pose estimation config file (trained on COCO-keypoint).
- `HUMAN_POSE_ESTIMATION_CHECKPOINT`: Path to the human pose estimation checkpoint (trained on COCO-keypoint).
- `SKELETON_BASED_ACTION_RECOGNITION_CONFIG_FILE`: Path to the skeleton-based action recognition config file.
- `SKELETON_BASED_ACTION_RECOGNITION_CHECKPOINT`: Path to the skeleton-based action recognition checkpoint.
- `RGB_BASED_ACTION_RECOGNITION_CONFIG_FILE`: Path to the RGB-based action recognition config file.
- `RGB_BASED_ACTION_RECOGNITION_CHECKPOINT`: Path to the RGB-based action recognition checkpoint.
- `USE_SKELETON_BASED_SPATIO_TEMPORAL_DETECTION_METHOD`: Use the skeleton-based spatio-temporal action detection method.
- `USE_SKELETON_BASED_ACTION_RECOGNITION_METHOD`: Use the skeleton-based action recognition method.
- `HUMAN_DETECTION_SCORE_THRE`: Human detection score threshold. Defaults to 0.9.
- `ACTION_DETECTION_SCORE_THRE`: Action detection score threshold. Defaults to 0.5.
- `LABEL_MAP_FOR_SPATIO_TEMPORAL_ACTION_DETECTION`: Label map file used for spatio-temporal action detection. Defaults to `tools/data/ava/label_map.txt`.
- `LABEL_MAP`: Label map file used for action recognition. Defaults to `tools/data/kinetics/label_map_k400.txt`.
- `DEVICE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
- `OUTPUT_FILENAME`: Path of the output video. Defaults to `demo/test_stdet_recognition_output.mp4`.
- `PREDICT_STEPSIZE`: Make a prediction every N frames (to save computation). Defaults to 8.
- `OUTPUT_STEPSIZE`: Output 1 frame to the output video for every N frames of the input video. Defaults to 1. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0` must hold.
- `OUTPUT_FPS`: Frame rate of the output video. Defaults to 24.
Examples:
The following examples assume that the current working directory is `$MMACTION2`.
1. Use Faster RCNN as the human detector, HRNetw32 as the pose estimator, and PoseC3D as both the skeleton-based action recognizer and the skeleton-based spatio-temporal action detector. Make a prediction every 8 frames, output every frame of the original video to the output video, and set the frame rate of the output video to 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-stdet \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
2. Use Faster RCNN as the human detector, TSN-R50-1x1x3 as the action recognizer, and SlowOnly-8x8-R101 as the spatio-temporal action detector. Make a prediction every 8 frames, output every frame of the original video to the output video, and set the frame rate of the output video to 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
3. Use Faster RCNN as the human detector, HRNetw32 as the pose estimator, PoseC3D as the skeleton-based action recognizer, and SlowOnly-8x8-R101 as the spatio-temporal action detector. Make a prediction every 8 frames, output every frame of the original video to the output video, and set the frame rate of the output video to 24.
```shell
python demo/demo_video_structuralize.py \
--rgb-stdet-config configs/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py \
--rgb-stdet-checkpoint https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--skeleton-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_k400.pth \
--use-skeleton-recog \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
4. Use Faster RCNN as the human detector, HRNetw32 as the pose estimator, TSN-R50-1x1x3 as the action recognizer, and PoseC3D as the skeleton-based spatio-temporal action detector. Make a prediction every 8 frames, output every frame of the original video to the output video, and set the frame rate of the output video to 24.
```shell
python demo/demo_video_structuralize.py \
--skeleton-stdet-checkpoint https://download.openmmlab.com/mmaction/skeleton/posec3d/posec3d_ava.pth \
--det-config demo/faster_rcnn_r50_fpn_2x_coco.py \
--det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
--pose-config demo/hrnet_w32_coco_256x192.py \
--pose-checkpoint https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth \
--skeleton-config configs/skeleton/posec3d/slowonly_r50_u48_240e_ntu120_xsub_keypoint.py \
--rgb-config configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py \
--rgb-checkpoint https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth \
--use-skeleton-stdet \
--label-map-stdet tools/data/ava/label_map.txt \
--label-map tools/data/kinetics/label_map_k400.txt
```
## Audio-based Action Recognition
The following script can be used for action recognition based on audio features.
Use `extract_audio.py` to extract audio from videos, and `build_audio_features.py` to extract audio features from the audio files.
```shell
python demo/demo_audio.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${AUDIO_FILE} {LABEL_FILE} [--device ${DEVICE}]
```
Optional arguments:
- `DEVICE`: Device to run the script on; supports cuda devices (e.g. `cuda:0`) or cpu (`cpu`). Defaults to `cuda:0`.
Examples:
The following examples assume that the current working directory is `$MMACTION2`.
1. Recognize an action from audio features with a TSN model on GPU.
```shell
python demo/demo_audio.py \
configs/recognition_audio/resnet/tsn_r18_64x1x1_100e_kinetics400_audio_feature.py \
https://download.openmmlab.com/mmaction/recognition/audio_recognition/tsn_r18_64x1x1_100e_kinetics400_audio_feature/tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth \
audio_feature.npy label_map_k400.txt
```
# FAQ
This page lists some common issues reported by users, together with their solutions.
Feel free to enrich this document if you find a frequent issue in the community and have a solution for it.
If your issue is not covered here, please open an issue using the provided [template](/.github/ISSUE_TEMPLATE/error-report.md) and make sure to fill in all the required information in the template.
## Installation
- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"**
1. Uninstall the existing `mmcv` in your environment with `pip uninstall mmcv`.
2. Install `mmcv-full` following the [MMCV installation documentation](https://mmcv.readthedocs.io/en/latest/#installation).
- **"OSError: MoviePy Error: creation of None failed because of the following error"**
Refer to the [MMAction2 installation documentation](https://github.com/open-mmlab/mmaction2/blob/master/docs_zh_CN/install.md#%E5%AE%89%E8%A3%85%E4%BE%9D%E8%B5%96%E5%8C%85).
1. For Windows users, [ImageMagick](https://www.imagemagick.org/script/index.php) is no longer detected automatically by MoviePy. You need to point `IMAGEMAGICK_BINARY` in `moviepy/config_defaults.py` to the ImageMagick binary named `magick`, e.g. `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"`.
2. For Linux users, if ImageMagick is not detected by MoviePy, comment out `<policy domain="path" rights="none" pattern="@*" />` in `/etc/ImageMagick-6/policy.xml`, i.e. change it to `<!-- <policy domain="path" rights="none" pattern="@*" /> -->`.
- **"Please install XXCODEBASE to use XXX"**
The error message "Please install XXCODEBASE to use XXX" means MMAction2 fails to import XXX from XXCODEBASE. You can run the corresponding import statement to locate the cause.
One possible reason is that, for some codebases in OpenMMLab, `mmcv-full` needs to be installed before they are installed.
## Data
- **FileNotFound errors such as `No such file or directory: xxx/xxx/img_00300.jpg`**
In MMAction2, the default value of `start_index` is 1 for rawframe datasets and 0 for video datasets.
If a FileNotFound error occurs on the first or last frame of a video, adjust the `start_index` used by the data pipeline in the config according to the offset of the first frame file, i.e. whether it is `xxx_00000.jpg` or `xxx_00001.jpg` (a hedged config sketch is shown below).
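A hedged sketch of how this could look in a config; the dataset type and exact keys depend on your own setup:
```python
# A hedged example: frame files start at img_00000.jpg instead of img_00001.jpg,
# so set start_index=0 for the rawframe datasets (keys depend on your own config).
data = dict(
    train=dict(type='RawframeDataset', start_index=0),
    val=dict(type='RawframeDataset', start_index=0),
    test=dict(type='RawframeDataset', start_index=0))
```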
- **How should input videos be handled in a dataset? Resize all videos to a fixed size such as "340x256", or resize them so that all short sides have the same length (256 or 320 pixels)?**
From our benchmarks, the latter (resizing the short side of all videos to the same length) generally works better, so "resize so that the short side is 256 pixels" is the default preprocessing. See the [TSN data benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn) and the [SlowOnly data benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn) for the related results.
- **The input data format (videos or frames) does not match the data pipeline, causing errors such as `KeyError: 'total_frames'`**
There are dedicated pipelines for both videos and frames.
**For videos**, they have to be decoded on the fly. The available decoding steps include `DecordInit & DecordDecode`, `OpenCVInit & OpenCVDecode`, `PyAVInit & PyAVDecode`, etc. See [this example](https://github.com/open-mmlab/mmaction2/blob/023777cfd26bb175f85d78c455f6869673e0aa09/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py#L47-L49).
**For frames**, they have already been decoded offline, so `RawFrameDecode` is enough. See [this example](https://github.com/open-mmlab/mmaction2/blob/023777cfd26bb175f85d78c455f6869673e0aa09/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py#L49).
`KeyError: 'total_frames'` is caused by wrongly using `RawFrameDecode` on videos: with video input, `total_frames` cannot be known in advance. A sketch of the two pipeline heads is shown below.
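A hedged sketch of the two pipeline heads; the transform names follow MMAction2, while the sampling parameters are illustrative:
```python
# For video files: initialize a decoder, sample frame indices, then decode.
video_pipeline_head = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1, test_mode=True),
    dict(type='DecordDecode'),
]
# For pre-extracted frames: the images are already on disk, so no video decoder is needed.
rawframe_pipeline_head = [
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1, test_mode=True),
    dict(type='RawFrameDecode'),
]
```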
## Training
- **How to use a trained recognizer as the pretrained backbone of another model?**
See [Use Pre-Trained Model](https://github.com/open-mmlab/mmaction2/blob/master/docs_zh_CN/tutorials/2_finetune.md#%E4%BD%BF%E7%94%A8%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B).
To use the pretrained model for the whole network, set `load_from` in the config to the link of the pretrained model.
To use the pretrained model for the backbone only, set `pretrained` under `backbone` in the config to the path or link of the pretrained model (a hedged sketch follows below).
During training, the parameters of the pretrained model that do not match the backbone are ignored.
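A hedged sketch of the two options; the URL below is a placeholder, not a real checkpoint:
```python
# Option 1: initialize the whole network from a checkpoint (placeholder URL).
load_from = 'https://download.openmmlab.com/mmaction/some_recognizer.pth'  # placeholder

# Option 2: initialize only the backbone (placeholder URL); non-matching keys are ignored.
model = dict(
    backbone=dict(
        pretrained='https://download.openmmlab.com/mmaction/some_recognizer.pth'))  # placeholder
```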
- **How to visualize the accuracy/loss curves of the training and validation sets in real time?**
Use `TensorboardLoggerHook` in `log_config`, e.g.:
```python
log_config=dict(
interval=20,
hooks=[
dict(type='TensorboardLoggerHook')
]
)
```
See [Tutorial 1: How to Write Config Files](tutorials/1_config.md), [Tutorial 7: How to Customize Runtime Settings](tutorials/7_customize_runtime.md#log-config) and [this example](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py#L118) for more details.
- **Error raised in batchnorm.py: Expected more than 1 value per channel when training**
BatchNorm layers require a batch size larger than 1. When building the dataset, if `drop_last` is set to `False`, the last batch of an epoch may sometimes have a batch size of 1, which raises this error during training. Set `drop_last=True` to avoid it, e.g.:
```python
train_dataloader=dict(drop_last=True)
```
- **How to freeze some parameters of the backbone when fine-tuning a model?**
See [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/0149a0e8c1e0380955db61680c0006626fd008e9/mmaction/models/backbones/x3d.py#L458) and [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/0149a0e8c1e0380955db61680c0006626fd008e9/mmaction/models/backbones/x3d.py#L183-L184). Remember to set `find_unused_parameters = True` for distributed training and testing.
In fact, apart from a few models such as C3D, users can freeze backbone parameters by setting `frozen_stages`, since most backbones inherit from `ResNet` or `ResNet3D`, both of which support the `_freeze_stages()` method (a hedged config sketch is shown below).
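A hedged sketch of what this could look like in a config; the number of frozen stages is illustrative:
```python
# Freeze the stem and the first residual stage of a ResNet-style backbone.
model = dict(backbone=dict(frozen_stages=1))
# Needed for distributed training/testing when some parameters receive no gradients.
find_unused_parameters = True
```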
- **How to set `load_from` in the config for fine-tuning?**
MMAction2 sets `load_from=None` by default in `configs/_base_/default_runtime.py`. Thanks to config inheritance, users can simply override `load_from` in a downstream config, for example as sketched below.
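A hedged sketch; the `_base_` relative path and the checkpoint path are illustrative:
```python
# Inherit the default runtime, then override load_from with your own checkpoint.
_base_ = ['../../_base_/default_runtime.py']  # illustrative relative path
load_from = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'  # illustrative
```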
## Testing
- **How to normalize the predicted scores to the \[0, 1\] range with softmax?**
Set `model['test_cfg'] = dict(average_clips='prob')`, for instance as below.
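In config syntax this is simply:
```python
# Apply softmax to clip scores before averaging them, so the output lies in [0, 1].
model = dict(test_cfg=dict(average_clips='prob'))
```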
- **What if the model is too large to fit even a single testing sample into GPU memory?**
By default, 3D models are tested with the `10 clips x 3 crops` setting, i.e. 10 clips are sampled and 3 crops are taken per frame, 30 views in total.
For very large models, GPU memory may not hold even a single video. In that case, set `max_testing_views=n` in `model['test_cfg']` of the config.
With this setting, at most n views are processed per batch during inference, saving GPU memory (see the sketch below).
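A hedged sketch; the value 4 is illustrative:
```python
# Process at most 4 of the 30 test views per forward pass to reduce peak GPU memory.
model = dict(test_cfg=dict(max_testing_views=4))
```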
- **How to save test results?**
Pass the optional argument `--out xxx.json/pkl/yaml` to the test command to save the results to a file for later inspection. The output results keep the same order as the test set.
In addition, MMAction2 provides an analysis tool in [`tools/analysis/eval_metric.py`](/tools/analysis/eval_metric.py) for evaluating a model from such a result file.
## Deployment
- **Why do ONNX models converted by MMAction2 throw errors when being converted to other frameworks such as TensorRT?**
For now, we can only make sure that models in MMAction2 are ONNX-compatible. However, some operators in ONNX may not be supported by other frameworks, e.g. TensorRT in [this issue](https://github.com/open-mmlab/mmaction2/issues/414). When this happens, and if `pytorch2onnx.py` ran without problems and the converted ONNX model passed the numerical check, feel free to open an issue and ask the community for help.
# Feature Extraction
MMAction2 provides easy-to-use scripts for feature extraction.
## Clip-level Feature Extraction
Clip-level feature extraction extracts deep features from clips, which usually last a few seconds to tens of seconds. The feature extracted from each clip is an n-dimensional vector. When multi-view feature extraction is used, e.g. n clips x m crops, the extracted feature is the average over the n\*m views.
Before applying clip-level feature extraction, prepare a video list containing all the videos to extract features from. For example, a video list of the videos in UCF101 looks like:
```
ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi
ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c02.avi
ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c03.avi
ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c04.avi
ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c05.avi
...
YoYo/v_YoYo_g25_c01.avi
YoYo/v_YoYo_g25_c02.avi
YoYo/v_YoYo_g25_c03.avi
YoYo/v_YoYo_g25_c04.avi
YoYo/v_YoYo_g25_c05.avi
```
Assuming the UCF101 videos are located at `data/ucf101/videos` and the video list file is named `ucf101.txt`, clip-level features can be extracted from UCF101 with TSN (pretrained on Kinetics-400) using the following script:
```shell
python tools/misc/clip_feature_extraction.py \
configs/recognition/tsn/tsn_r50_clip_feature_extraction_1x1x3_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth \
--video-list ucf101.txt \
--video-root data/ucf101/videos \
--out ucf101_feature.pkl
```
The extracted features are stored in `ucf101_feature.pkl`.
Distributed clip-level feature extraction is also supported. Below is an example for a compute node with 8 GPUs.
```shell
bash tools/misc/dist_clip_feature_extraction.sh \
configs/recognition/tsn/tsn_r50_clip_feature_extraction_1x1x3_rgb.py \
https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth \
8 \
--video-list ucf101.txt \
--video-root data/ucf101/videos \
--out ucf101_feature.pkl
```
To extract clip-level features from UCF101 with SlowOnly (pretrained on Kinetics-400), use the following script:
```shell
python tools/misc/clip_feature_extraction.py \
configs/recognition/slowonly/slowonly_r50_clip_feature_extraction_4x16x1_rgb.py \
https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_320p_4x16x1_256e_kinetics400_rgb/slowonly_r50_video_320p_4x16x1_256e_kinetics400_rgb_20201014-c9cdc656.pth \
--video-list ucf101.txt \
--video-root data/ucf101/videos \
--out ucf101_feature.pkl
```
These two config files show the minimal configuration needed for feature extraction. Other existing config files can also be used, as long as they train and test on video data rather than raw frames.
```shell
python tools/misc/clip_feature_extraction.py \
configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py \
https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_320p_4x16x1_256e_kinetics400_rgb/slowonly_r50_video_320p_4x16x1_256e_kinetics400_rgb_20201014-c9cdc656.pth \
--video-list ucf101.txt \
--video-root data/ucf101/videos \
--out ucf101_feature.pkl
```