Commit 5b3e36dc authored by Sugon_ldc

add model TSM
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "speaking-algebra",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import cv2\n",
"import os.path as osp\n",
"import decord\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import urllib\n",
"import moviepy.editor as mpy\n",
"import random as rd\n",
"from mmpose.apis import vis_pose_result\n",
"from mmpose.models import TopDown\n",
"from mmcv import load, dump\n",
"\n",
"# We assume the annotation is already prepared\n",
"gym_train_ann_file = '../data/skeleton/gym_train.pkl'\n",
"gym_val_ann_file = '../data/skeleton/gym_val.pkl'\n",
"ntu60_xsub_train_ann_file = '../data/skeleton/ntu60_xsub_train.pkl'\n",
"ntu60_xsub_val_ann_file = '../data/skeleton/ntu60_xsub_val.pkl'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "alive-consolidation",
"metadata": {},
"outputs": [],
"source": [
"FONTFACE = cv2.FONT_HERSHEY_DUPLEX\n",
"FONTSCALE = 0.6\n",
"FONTCOLOR = (255, 255, 255)\n",
"BGBLUE = (0, 119, 182)\n",
"THICKNESS = 1\n",
"LINETYPE = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ranging-conjunction",
"metadata": {},
"outputs": [],
"source": [
"def add_label(frame, label, BGCOLOR=BGBLUE):\n",
" threshold = 30\n",
" def split_label(label):\n",
" label = label.split()\n",
" lines, cline = [], ''\n",
" for word in label:\n",
" if len(cline) + len(word) < threshold:\n",
" cline = cline + ' ' + word\n",
" else:\n",
" lines.append(cline)\n",
" cline = word\n",
" if cline != '':\n",
" lines += [cline]\n",
" return lines\n",
" \n",
" if len(label) > 30:\n",
" label = split_label(label)\n",
" else:\n",
" label = [label]\n",
" label = ['Action: '] + label\n",
" \n",
" sizes = []\n",
" for line in label:\n",
" sizes.append(cv2.getTextSize(line, FONTFACE, FONTSCALE, THICKNESS)[0])\n",
" box_width = max([x[0] for x in sizes]) + 10\n",
" text_height = sizes[0][1]\n",
" box_height = len(sizes) * (text_height + 6)\n",
" \n",
" cv2.rectangle(frame, (0, 0), (box_width, box_height), BGCOLOR, -1)\n",
" for i, line in enumerate(label):\n",
" location = (5, (text_height + 6) * i + text_height + 3)\n",
" cv2.putText(frame, line, location, FONTFACE, FONTSCALE, FONTCOLOR, THICKNESS, LINETYPE)\n",
" return frame\n",
" \n",
"\n",
"def vis_skeleton(vid_path, anno, category_name=None, ratio=0.5):\n",
" vid = decord.VideoReader(vid_path)\n",
" frames = [x.asnumpy() for x in vid]\n",
" \n",
" h, w, _ = frames[0].shape\n",
" new_shape = (int(w * ratio), int(h * ratio))\n",
" frames = [cv2.resize(f, new_shape) for f in frames]\n",
" \n",
" assert len(frames) == anno['total_frames']\n",
" # The shape is N x T x K x 3\n",
" kps = np.concatenate([anno['keypoint'], anno['keypoint_score'][..., None]], axis=-1)\n",
" kps[..., :2] *= ratio\n",
" # Convert to T x N x K x 3\n",
" kps = kps.transpose([1, 0, 2, 3])\n",
" vis_frames = []\n",
"\n",
" # we need an instance of TopDown model, so build a minimal one\n",
" model = TopDown(backbone=dict(type='ShuffleNetV1'))\n",
"\n",
" for f, kp in zip(frames, kps):\n",
" result = [dict(keypoints=k) for k in kp]\n",
" vis_frame = vis_pose_result(model, f, result)\n",
" \n",
" if category_name is not None:\n",
" vis_frame = add_label(vis_frame, category_name)\n",
" \n",
" vis_frames.append(vis_frame)\n",
" return vis_frames"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "applied-humanity",
"metadata": {},
"outputs": [],
"source": [
"keypoint_pipeline = [\n",
" dict(type='PoseDecode'),\n",
" dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),\n",
" dict(type='Resize', scale=(-1, 64)),\n",
" dict(type='CenterCrop', crop_size=64),\n",
" dict(type='GeneratePoseTarget', sigma=0.6, use_score=True, with_kp=True, with_limb=False)\n",
"]\n",
"\n",
"limb_pipeline = [\n",
" dict(type='PoseDecode'),\n",
" dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),\n",
" dict(type='Resize', scale=(-1, 64)),\n",
" dict(type='CenterCrop', crop_size=64),\n",
" dict(type='GeneratePoseTarget', sigma=0.6, use_score=True, with_kp=False, with_limb=True)\n",
"]\n",
"\n",
"from mmaction.datasets.pipelines import Compose\n",
"def get_pseudo_heatmap(anno, flag='keypoint'):\n",
" assert flag in ['keypoint', 'limb']\n",
" pipeline = Compose(keypoint_pipeline if flag == 'keypoint' else limb_pipeline)\n",
" return pipeline(anno)['imgs']\n",
"\n",
"def vis_heatmaps(heatmaps, channel=-1, ratio=8):\n",
" # if channel is -1, draw all keypoints / limbs on the same map\n",
" import matplotlib.cm as cm\n",
" h, w, _ = heatmaps[0].shape\n",
" newh, neww = int(h * ratio), int(w * ratio)\n",
" \n",
" if channel == -1:\n",
" heatmaps = [np.max(x, axis=-1) for x in heatmaps]\n",
" cmap = cm.viridis\n",
" heatmaps = [(cmap(x)[..., :3] * 255).astype(np.uint8) for x in heatmaps]\n",
" heatmaps = [cv2.resize(x, (neww, newh)) for x in heatmaps]\n",
" return heatmaps"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "automatic-commons",
"metadata": {},
"outputs": [],
"source": [
"# Load GYM annotations\n",
"lines = list(urllib.request.urlopen('https://sdolivia.github.io/FineGym/resources/dataset/gym99_categories.txt'))\n",
"gym_categories = [x.decode().strip().split('; ')[-1] for x in lines]\n",
"gym_annos = load(gym_train_ann_file) + load(gym_val_ann_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "numerous-bristol",
"metadata": {},
"outputs": [],
"source": [
"# download sample videos of GYM\n",
"!wget https://download.openmmlab.com/mmaction/posec3d/gym_samples.tar\n",
"!tar -xf gym_samples.tar\n",
"!rm gym_samples.tar"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ranging-harrison",
"metadata": {},
"outputs": [],
"source": [
"gym_root = 'gym_samples/'\n",
"gym_vids = os.listdir(gym_root)\n",
"# visualize pose of which video? index in 0 - 50.\n",
"idx = 1\n",
"vid = gym_vids[idx]\n",
"\n",
"frame_dir = vid.split('.')[0]\n",
"vid_path = osp.join(gym_root, vid)\n",
"anno = [x for x in gym_annos if x['frame_dir'] == frame_dir][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fitting-courage",
"metadata": {},
"outputs": [],
"source": [
"# Visualize Skeleton\n",
"vis_frames = vis_skeleton(vid_path, anno, gym_categories[anno['label']])\n",
"vid = mpy.ImageSequenceClip(vis_frames, fps=24)\n",
"vid.ipython_display()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "orange-logging",
"metadata": {},
"outputs": [],
"source": [
"keypoint_heatmap = get_pseudo_heatmap(anno)\n",
"keypoint_mapvis = vis_heatmaps(keypoint_heatmap)\n",
"keypoint_mapvis = [add_label(f, gym_categories[anno['label']]) for f in keypoint_mapvis]\n",
"vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24)\n",
"vid.ipython_display()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "residential-conjunction",
"metadata": {},
"outputs": [],
"source": [
"limb_heatmap = get_pseudo_heatmap(anno, 'limb')\n",
"limb_mapvis = vis_heatmaps(limb_heatmap)\n",
"limb_mapvis = [add_label(f, gym_categories[anno['label']]) for f in limb_mapvis]\n",
"vid = mpy.ImageSequenceClip(limb_mapvis, fps=24)\n",
"vid.ipython_display()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "coupled-stranger",
"metadata": {},
"outputs": [],
"source": [
"# The name list of \n",
"ntu_categories = ['drink water', 'eat meal/snack', 'brushing teeth', 'brushing hair', 'drop', 'pickup', \n",
" 'throw', 'sitting down', 'standing up (from sitting position)', 'clapping', 'reading', \n",
" 'writing', 'tear up paper', 'wear jacket', 'take off jacket', 'wear a shoe', \n",
" 'take off a shoe', 'wear on glasses', 'take off glasses', 'put on a hat/cap', \n",
" 'take off a hat/cap', 'cheer up', 'hand waving', 'kicking something', \n",
" 'reach into pocket', 'hopping (one foot jumping)', 'jump up', \n",
" 'make a phone call/answer phone', 'playing with phone/tablet', 'typing on a keyboard', \n",
" 'pointing to something with finger', 'taking a selfie', 'check time (from watch)', \n",
" 'rub two hands together', 'nod head/bow', 'shake head', 'wipe face', 'salute', \n",
" 'put the palms together', 'cross hands in front (say stop)', 'sneeze/cough', \n",
" 'staggering', 'falling', 'touch head (headache)', 'touch chest (stomachache/heart pain)', \n",
" 'touch back (backache)', 'touch neck (neckache)', 'nausea or vomiting condition', \n",
" 'use a fan (with hand or paper)/feeling warm', 'punching/slapping other person', \n",
" 'kicking other person', 'pushing other person', 'pat on back of other person', \n",
" 'point finger at the other person', 'hugging other person', \n",
" 'giving something to other person', \"touch other person's pocket\", 'handshaking', \n",
" 'walking towards each other', 'walking apart from each other']\n",
"ntu_annos = load(ntu60_xsub_train_ann_file) + load(ntu60_xsub_val_ann_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "seasonal-palmer",
"metadata": {},
"outputs": [],
"source": [
"# download sample videos of NTU-60\n",
"!wget https://download.openmmlab.com/mmaction/posec3d/ntu_samples.tar\n",
"!tar -xf ntu_samples.tar\n",
"!rm ntu_samples.tar"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "critical-review",
"metadata": {},
"outputs": [],
"source": [
"ntu_root = 'ntu_samples/'\n",
"ntu_vids = os.listdir(ntu_root)\n",
"# visualize pose of which video? index in 0 - 50.\n",
"idx = 20\n",
"vid = ntu_vids[idx]\n",
"\n",
"frame_dir = vid.split('.')[0]\n",
"vid_path = osp.join(ntu_root, vid)\n",
"anno = [x for x in ntu_annos if x['frame_dir'] == frame_dir.split('_')[0]][0]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "accompanied-invitation",
"metadata": {},
"outputs": [],
"source": [
"vis_frames = vis_skeleton(vid_path, anno, ntu_categories[anno['label']])\n",
"vid = mpy.ImageSequenceClip(vis_frames, fps=24)\n",
"vid.ipython_display()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "respiratory-conclusion",
"metadata": {},
"outputs": [],
"source": [
"keypoint_heatmap = get_pseudo_heatmap(anno)\n",
"keypoint_mapvis = vis_heatmaps(keypoint_heatmap)\n",
"keypoint_mapvis = [add_label(f, gym_categories[anno['label']]) for f in keypoint_mapvis]\n",
"vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24)\n",
"vid.ipython_display()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "thirty-vancouver",
"metadata": {},
"outputs": [],
"source": [
"limb_heatmap = get_pseudo_heatmap(anno, 'limb')\n",
"limb_mapvis = vis_heatmaps(limb_heatmap)\n",
"limb_mapvis = [add_label(f, gym_categories[anno['label']]) for f in limb_mapvis]\n",
"vid = mpy.ImageSequenceClip(limb_mapvis, fps=24)\n",
"vid.ipython_display()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
from collections import deque
from operator import itemgetter
from threading import Thread
import cv2
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.parallel import collate, scatter
from mmaction.apis import init_recognizer
from mmaction.datasets.pipelines import Compose
FONTFACE = cv2.FONT_HERSHEY_COMPLEX_SMALL
FONTSCALE = 1
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
EXCLUDED_STEPS = [
'OpenCVInit', 'OpenCVDecode', 'DecordInit', 'DecordDecode', 'PyAVInit',
'PyAVDecode', 'RawFrameDecode'
]
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 webcam demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('label', help='label file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--camera-id', type=int, default=0, help='camera device id')
parser.add_argument(
'--threshold',
type=float,
default=0.01,
help='recognition score threshold')
parser.add_argument(
'--average-size',
type=int,
default=1,
help='number of latest clips to be averaged for prediction')
parser.add_argument(
'--drawing-fps',
type=int,
default=20,
help='Set upper bound FPS value of the output drawing')
parser.add_argument(
'--inference-fps',
type=int,
default=4,
help='Set upper bound FPS value of model inference')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
assert args.drawing_fps >= 0 and args.inference_fps >= 0, \
'upper bound FPS value of drawing and inference should be set as ' \
'positive number, or zero for no limit'
return args
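# The demo runs two daemon threads that communicate through `frame_queue` and
# `result_queue` (both created in `main`): `show_results` keeps reading camera
# frames and drawing the latest predictions, while `inference` samples
# `sample_length` frames from the queue and feeds them to the recognizer.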
def show_results():
print('Press "Esc", "q" or "Q" to exit')
text_info = {}
cur_time = time.time()
while True:
msg = 'Waiting for action ...'
_, frame = camera.read()
frame_queue.append(np.array(frame[:, :, ::-1]))
if len(result_queue) != 0:
text_info = {}
results = result_queue.popleft()
for i, result in enumerate(results):
selected_label, score = result
if score < threshold:
break
location = (0, 40 + i * 20)
text = selected_label + ': ' + str(round(score, 2))
text_info[location] = text
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
elif len(text_info) != 0:
for location, text in text_info.items():
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
else:
cv2.putText(frame, msg, (0, 40), FONTFACE, FONTSCALE, MSGCOLOR,
THICKNESS, LINETYPE)
cv2.imshow('camera', frame)
ch = cv2.waitKey(1)
if ch == 27 or ch == ord('q') or ch == ord('Q'):
break
if drawing_fps > 0:
# add a limiter for actual drawing fps <= drawing_fps
sleep_time = 1 / drawing_fps - (time.time() - cur_time)
if sleep_time > 0:
time.sleep(sleep_time)
cur_time = time.time()
def inference():
score_cache = deque()
scores_sum = 0
cur_time = time.time()
while True:
cur_windows = []
while len(cur_windows) == 0:
if len(frame_queue) == sample_length:
cur_windows = list(np.array(frame_queue))
if data['img_shape'] is None:
data['img_shape'] = frame_queue.popleft().shape[:2]
cur_data = data.copy()
cur_data['imgs'] = cur_windows
cur_data = test_pipeline(cur_data)
cur_data = collate([cur_data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
cur_data = scatter(cur_data, [device])[0]
with torch.no_grad():
scores = model(return_loss=False, **cur_data)[0]
score_cache.append(scores)
scores_sum += scores
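        # Once `average_size` clip scores are cached, average them to smooth
        # the prediction, then pop the oldest score from the running sum.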
if len(score_cache) == average_size:
scores_avg = scores_sum / average_size
num_selected_labels = min(len(label), 5)
scores_tuples = tuple(zip(label, scores_avg))
scores_sorted = sorted(
scores_tuples, key=itemgetter(1), reverse=True)
results = scores_sorted[:num_selected_labels]
result_queue.append(results)
scores_sum -= score_cache.popleft()
if inference_fps > 0:
# add a limiter for actual inference fps <= inference_fps
sleep_time = 1 / inference_fps - (time.time() - cur_time)
if sleep_time > 0:
time.sleep(sleep_time)
cur_time = time.time()
camera.release()
cv2.destroyAllWindows()
def main():
global frame_queue, camera, frame, results, threshold, sample_length, \
data, test_pipeline, model, device, average_size, label, \
result_queue, drawing_fps, inference_fps
args = parse_args()
average_size = args.average_size
threshold = args.threshold
drawing_fps = args.drawing_fps
inference_fps = args.inference_fps
device = torch.device(args.device)
cfg = Config.fromfile(args.config)
cfg.merge_from_dict(args.cfg_options)
model = init_recognizer(cfg, args.checkpoint, device=device)
camera = cv2.VideoCapture(args.camera_id)
data = dict(img_shape=None, modality='RGB', label=-1)
with open(args.label, 'r') as f:
label = [line.strip() for line in f]
# prepare test pipeline from non-camera pipeline
cfg = model.cfg
sample_length = 0
pipeline = cfg.data.test.pipeline
pipeline_ = pipeline.copy()
for step in pipeline:
if 'SampleFrames' in step['type']:
sample_length = step['clip_len'] * step['num_clips']
data['num_clips'] = step['num_clips']
data['clip_len'] = step['clip_len']
pipeline_.remove(step)
        if step['type'] in EXCLUDED_STEPS:
# remove step to decode frames
pipeline_.remove(step)
test_pipeline = Compose(pipeline_)
assert sample_length > 0
try:
frame_queue = deque(maxlen=sample_length)
result_queue = deque(maxlen=1)
pw = Thread(target=show_results, args=(), daemon=True)
pr = Thread(target=inference, args=(), daemon=True)
pw.start()
pr.start()
pw.join()
except KeyboardInterrupt:
pass
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
"""Webcam Spatio-Temporal Action Detection Demo.
Some codes are based on https://github.com/facebookresearch/SlowFast
"""
import argparse
import atexit
import copy
import logging
import queue
import threading
import time
from abc import ABCMeta, abstractmethod
import cv2
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.runner import load_checkpoint
from mmaction.models import build_detector
try:
from mmdet.apis import inference_detector, init_detector
except (ImportError, ModuleNotFoundError):
    raise ImportError('Failed to import `inference_detector` and '
                      '`init_detector` from `mmdet.apis`. These APIs are '
                      'required in this demo!')
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser(
description='MMAction2 webcam spatio-temporal detection demo')
parser.add_argument(
'--config',
default=('configs/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb.py'),
help='spatio temporal detection config file path')
parser.add_argument(
'--checkpoint',
default=('https://download.openmmlab.com/mmaction/detection/ava/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/'
'slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb'
'_20201217-16378594.pth'),
help='spatio temporal detection checkpoint file/url')
parser.add_argument(
'--action-score-thr',
type=float,
default=0.4,
help='the threshold of human action score')
parser.add_argument(
'--det-config',
default='demo/faster_rcnn_r50_fpn_2x_coco.py',
help='human detection config file path (from mmdet)')
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
parser.add_argument(
'--input-video',
default='0',
type=str,
help='webcam id or input video file/url')
parser.add_argument(
'--label-map',
default='tools/data/ava/label_map.txt',
help='label map file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--output-fps',
default=15,
type=int,
help='the fps of demo video output')
parser.add_argument(
'--out-filename',
default=None,
type=str,
help='the filename of output video')
parser.add_argument(
'--show',
action='store_true',
help='Whether to show results with cv2.imshow')
parser.add_argument(
'--display-height',
type=int,
default=0,
help='Image height for human detector and draw frames.')
parser.add_argument(
'--display-width',
type=int,
default=0,
help='Image width for human detector and draw frames.')
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a prediction per n frames')
parser.add_argument(
'--clip-vis-length',
default=8,
type=int,
help='Number of draw frames per clip.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
class TaskInfo:
"""Wapper for a clip.
Transmit data around three threads.
1) Read Thread: Create task and put task into read queue. Init `frames`,
`processed_frames`, `img_shape`, `ratio`, `clip_vis_length`.
2) Main Thread: Get data from read queue, predict human bboxes and stdet
action labels, draw predictions and put task into display queue. Init
`display_bboxes`, `stdet_bboxes` and `action_preds`, update `frames`.
3) Display Thread: Get data from display queue, show/write frames and
delete task.
"""
def __init__(self):
self.id = -1
# raw frames, used as human detector input, draw predictions input
# and output, display input
self.frames = None
# stdet params
self.processed_frames = None # model inputs
self.frames_inds = None # select frames from processed frames
self.img_shape = None # model inputs, processed frame shape
# `action_preds` is `list[list[tuple]]`. The outer brackets indicate
        # different bboxes and the inner brackets indicate different action
# results for the same bbox. tuple contains `class_name` and `score`.
self.action_preds = None # stdet results
# human bboxes with the format (xmin, ymin, xmax, ymax)
self.display_bboxes = None # bboxes coords for self.frames
self.stdet_bboxes = None # bboxes coords for self.processed_frames
self.ratio = None # processed_frames.shape[1::-1]/frames.shape[1::-1]
# for each clip, draw predictions on clip_vis_length frames
self.clip_vis_length = -1
def add_frames(self, idx, frames, processed_frames):
"""Add the clip and corresponding id.
Args:
idx (int): the current index of the clip.
frames (list[ndarray]): list of images in "BGR" format.
            processed_frames (list[ndarray]): list of resized and normalized
                images in "BGR" format.
"""
self.frames = frames
self.processed_frames = processed_frames
self.id = idx
self.img_shape = processed_frames[0].shape[:2]
def add_bboxes(self, display_bboxes):
"""Add correspondding bounding boxes."""
self.display_bboxes = display_bboxes
self.stdet_bboxes = display_bboxes.clone()
self.stdet_bboxes[:, ::2] = self.stdet_bboxes[:, ::2] * self.ratio[0]
self.stdet_bboxes[:, 1::2] = self.stdet_bboxes[:, 1::2] * self.ratio[1]
def add_action_preds(self, preds):
"""Add the corresponding action predictions."""
self.action_preds = preds
def get_model_inputs(self, device):
"""Convert preprocessed images to MMAction2 STDet model inputs."""
cur_frames = [self.processed_frames[idx] for idx in self.frames_inds]
input_array = np.stack(cur_frames).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(device)
return dict(
return_loss=False,
img=[input_tensor],
proposals=[[self.stdet_bboxes]],
img_metas=[[dict(img_shape=self.img_shape)]])
class BaseHumanDetector(metaclass=ABCMeta):
"""Base class for Human Dector.
Args:
device (str): CPU/CUDA device option.
"""
def __init__(self, device):
self.device = torch.device(device)
@abstractmethod
def _do_detect(self, image):
"""Get human bboxes with shape [n, 4].
The format of bboxes is (xmin, ymin, xmax, ymax) in pixels.
"""
def predict(self, task):
"""Add keyframe bboxes to task."""
# keyframe idx == (clip_len * frame_interval) // 2
keyframe = task.frames[len(task.frames) // 2]
# call detector
bboxes = self._do_detect(keyframe)
# convert bboxes to torch.Tensor and move to target device
if isinstance(bboxes, np.ndarray):
bboxes = torch.from_numpy(bboxes).to(self.device)
elif isinstance(bboxes, torch.Tensor) and bboxes.device != self.device:
bboxes = bboxes.to(self.device)
# update task
task.add_bboxes(bboxes)
return task
class MmdetHumanDetector(BaseHumanDetector):
"""Wrapper for mmdetection human detector.
Args:
config (str): Path to mmdetection config.
ckpt (str): Path to mmdetection checkpoint.
device (str): CPU/CUDA device option.
score_thr (float): The threshold of human detection score.
person_classid (int): Choose class from detection results.
Default: 0. Suitable for COCO pretrained models.
"""
def __init__(self, config, ckpt, device, score_thr, person_classid=0):
super().__init__(device)
self.model = init_detector(config, ckpt, device)
self.person_classid = person_classid
self.score_thr = score_thr
def _do_detect(self, image):
"""Get bboxes in shape [n, 4] and values in pixels."""
result = inference_detector(self.model, image)[self.person_classid]
result = result[result[:, 4] >= self.score_thr][:, :4]
return result
class StdetPredictor:
"""Wrapper for MMAction2 spatio-temporal action models.
Args:
config (str): Path to stdet config.
ckpt (str): Path to stdet checkpoint.
device (str): CPU/CUDA device option.
score_thr (float): The threshold of human action score.
label_map_path (str): Path to label map file. The format for each line
is `{class_id}: {class_name}`.
"""
def __init__(self, config, checkpoint, device, score_thr, label_map_path):
self.score_thr = score_thr
# load model
config.model.backbone.pretrained = None
model = build_detector(config.model, test_cfg=config.get('test_cfg'))
load_checkpoint(model, checkpoint, map_location='cpu')
model.to(device)
model.eval()
self.model = model
self.device = device
# init label map, aka class_id to class_name dict
with open(label_map_path) as f:
lines = f.readlines()
lines = [x.strip().split(': ') for x in lines]
self.label_map = {int(x[0]): x[1] for x in lines}
try:
if config['data']['train']['custom_classes'] is not None:
self.label_map = {
id + 1: self.label_map[cls]
for id, cls in enumerate(config['data']['train']
['custom_classes'])
}
except KeyError:
pass
def predict(self, task):
"""Spatio-temporval Action Detection model inference."""
# No need to do inference if no one in keyframe
if len(task.stdet_bboxes) == 0:
return task
with torch.no_grad():
result = self.model(**task.get_model_inputs(self.device))[0]
# pack results of human detector and stdet
preds = []
for _ in range(task.stdet_bboxes.shape[0]):
preds.append([])
for class_id in range(len(result)):
if class_id + 1 not in self.label_map:
continue
for bbox_id in range(task.stdet_bboxes.shape[0]):
if result[class_id][bbox_id, 4] > self.score_thr:
preds[bbox_id].append((self.label_map[class_id + 1],
result[class_id][bbox_id, 4]))
# update task
# `preds` is `list[list[tuple]]`. The outer brackets indicate
        # different bboxes and the inner brackets indicate different action
# results for the same bbox. tuple contains `class_name` and `score`.
task.add_action_preds(preds)
return task
class ClipHelper:
"""Multithrading utils to manage the lifecycle of task."""
def __init__(self,
config,
display_height=0,
display_width=0,
input_video=0,
predict_stepsize=40,
output_fps=25,
clip_vis_length=8,
out_filename=None,
show=True,
stdet_input_shortside=256):
# stdet sampling strategy
val_pipeline = config.data.val.pipeline
sampler = [x for x in val_pipeline
if x['type'] == 'SampleAVAFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler[
'frame_interval']
self.window_size = clip_len * frame_interval
# asserts
assert (out_filename or show), \
'out_filename and show cannot both be None'
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
assert clip_vis_length <= predict_stepsize
assert 0 < predict_stepsize <= self.window_size
# source params
try:
self.cap = cv2.VideoCapture(int(input_video))
self.webcam = True
except ValueError:
self.cap = cv2.VideoCapture(input_video)
self.webcam = False
assert self.cap.isOpened()
# stdet input preprocessing params
h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
self.stdet_input_size = mmcv.rescale_size(
(w, h), (stdet_input_shortside, np.Inf))
img_norm_cfg = config['img_norm_cfg']
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
to_bgr = img_norm_cfg.pop('to_bgr')
img_norm_cfg['to_rgb'] = to_bgr
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
self.img_norm_cfg = img_norm_cfg
# task init params
self.clip_vis_length = clip_vis_length
self.predict_stepsize = predict_stepsize
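        # Consecutive clips overlap: the last (window_size - predict_stepsize)
        # frames of each clip are buffered and reused by the next one.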
self.buffer_size = self.window_size - self.predict_stepsize
frame_start = self.window_size // 2 - (clip_len // 2) * frame_interval
self.frames_inds = [
frame_start + frame_interval * i for i in range(clip_len)
]
self.buffer = []
self.processed_buffer = []
# output/display params
if display_height > 0 and display_width > 0:
self.display_size = (display_width, display_height)
elif display_height > 0 or display_width > 0:
self.display_size = mmcv.rescale_size(
(w, h), (np.Inf, max(display_height, display_width)))
else:
self.display_size = (w, h)
self.ratio = tuple(
n / o for n, o in zip(self.stdet_input_size, self.display_size))
if output_fps <= 0:
self.output_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
else:
self.output_fps = output_fps
self.show = show
self.video_writer = None
if out_filename is not None:
self.video_writer = self.get_output_video_writer(out_filename)
display_start_idx = self.window_size // 2 - self.predict_stepsize // 2
self.display_inds = [
display_start_idx + i for i in range(self.predict_stepsize)
]
        # display multi-threading params
self.display_id = -1 # task.id for display queue
self.display_queue = {}
self.display_lock = threading.Lock()
self.output_lock = threading.Lock()
        # read multi-threading params
self.read_id = -1 # task.id for read queue
self.read_id_lock = threading.Lock()
self.read_queue = queue.Queue()
self.read_lock = threading.Lock()
self.not_end = True # cap.read() flag
# program state
self.stopped = False
atexit.register(self.clean)
def read_fn(self):
"""Main function for read thread.
Contains three steps:
1) Read and preprocess (resize + norm) frames from source.
2) Create task by frames from previous step and buffer.
3) Put task into read queue.
"""
was_read = True
start_time = time.time()
while was_read and not self.stopped:
# init task
task = TaskInfo()
task.clip_vis_length = self.clip_vis_length
task.frames_inds = self.frames_inds
task.ratio = self.ratio
# read buffer
frames = []
processed_frames = []
if len(self.buffer) != 0:
frames = self.buffer
if len(self.processed_buffer) != 0:
processed_frames = self.processed_buffer
# read and preprocess frames from source and update task
with self.read_lock:
before_read = time.time()
read_frame_cnt = self.window_size - len(frames)
while was_read and len(frames) < self.window_size:
was_read, frame = self.cap.read()
if not self.webcam:
# Reading frames too fast may lead to unexpected
# performance degradation. If you have enough
# resource, this line could be commented.
time.sleep(1 / self.output_fps)
if was_read:
frames.append(mmcv.imresize(frame, self.display_size))
processed_frame = mmcv.imresize(
frame, self.stdet_input_size).astype(np.float32)
_ = mmcv.imnormalize_(processed_frame,
**self.img_norm_cfg)
processed_frames.append(processed_frame)
task.add_frames(self.read_id + 1, frames, processed_frames)
# update buffer
if was_read:
self.buffer = frames[-self.buffer_size:]
self.processed_buffer = processed_frames[-self.buffer_size:]
# update read state
with self.read_id_lock:
self.read_id += 1
self.not_end = was_read
self.read_queue.put((was_read, copy.deepcopy(task)))
cur_time = time.time()
logger.debug(
f'Read thread: {1000*(cur_time - start_time):.0f} ms, '
f'{read_frame_cnt / (cur_time - before_read):.0f} fps')
start_time = cur_time
def display_fn(self):
"""Main function for display thread.
Read data from display queue and display predictions.
"""
start_time = time.time()
while not self.stopped:
# get the state of the read thread
with self.read_id_lock:
read_id = self.read_id
not_end = self.not_end
with self.display_lock:
                # If the video ended and we have displayed all frames.
if not not_end and self.display_id == read_id:
break
                # If the next task is not available, wait.
if (len(self.display_queue) == 0 or
self.display_queue.get(self.display_id + 1) is None):
time.sleep(0.02)
continue
# get display data and update state
self.display_id += 1
was_read, task = self.display_queue[self.display_id]
del self.display_queue[self.display_id]
display_id = self.display_id
# do display predictions
with self.output_lock:
if was_read and task.id == 0:
# the first task
cur_display_inds = range(self.display_inds[-1] + 1)
elif not was_read:
# the last task
cur_display_inds = range(self.display_inds[0],
len(task.frames))
else:
cur_display_inds = self.display_inds
for frame_id in cur_display_inds:
frame = task.frames[frame_id]
if self.show:
cv2.imshow('Demo', frame)
cv2.waitKey(int(1000 / self.output_fps))
if self.video_writer:
self.video_writer.write(frame)
cur_time = time.time()
logger.debug(
f'Display thread: {1000*(cur_time - start_time):.0f} ms, '
f'read id {read_id}, display id {display_id}')
start_time = cur_time
def __iter__(self):
return self
def __next__(self):
"""Get data from read queue.
This function is part of the main thread.
"""
if self.read_queue.qsize() == 0:
time.sleep(0.02)
return not self.stopped, None
was_read, task = self.read_queue.get()
if not was_read:
# If we reach the end of the video, there aren't enough frames
# in the task.processed_frames, so no need to model inference
# and draw predictions. Put task into display queue.
with self.read_id_lock:
read_id = self.read_id
with self.display_lock:
self.display_queue[read_id] = was_read, copy.deepcopy(task)
# main thread doesn't need to handle this task again
task = None
return was_read, task
def start(self):
"""Start read thread and display thread."""
self.read_thread = threading.Thread(
target=self.read_fn, args=(), name='VidRead-Thread', daemon=True)
self.read_thread.start()
self.display_thread = threading.Thread(
target=self.display_fn,
args=(),
name='VidDisplay-Thread',
daemon=True)
self.display_thread.start()
return self
def clean(self):
"""Close all threads and release all resources."""
self.stopped = True
self.read_lock.acquire()
self.cap.release()
self.read_lock.release()
self.output_lock.acquire()
cv2.destroyAllWindows()
if self.video_writer:
self.video_writer.release()
self.output_lock.release()
def join(self):
"""Waiting for the finalization of read and display thread."""
self.read_thread.join()
self.display_thread.join()
def display(self, task):
"""Add the visualized task to the display queue.
Args:
task (TaskInfo object): task object that contain the necessary
information for prediction visualization.
"""
with self.display_lock:
self.display_queue[task.id] = (True, task)
def get_output_video_writer(self, path):
"""Return a video writer object.
Args:
path (str): path to the output video file.
"""
return cv2.VideoWriter(
filename=path,
fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
fps=float(self.output_fps),
frameSize=self.display_size,
isColor=True)
class BaseVisualizer(metaclass=ABCMeta):
"""Base class for visualization tools."""
def __init__(self, max_labels_per_bbox):
self.max_labels_per_bbox = max_labels_per_bbox
def draw_predictions(self, task):
"""Visualize stdet predictions on raw frames."""
# read bboxes from task
bboxes = task.display_bboxes.cpu().numpy()
# draw predictions and update task
keyframe_idx = len(task.frames) // 2
draw_range = [
keyframe_idx - task.clip_vis_length // 2,
keyframe_idx + (task.clip_vis_length - 1) // 2
]
assert draw_range[0] >= 0 and draw_range[1] < len(task.frames)
task.frames = self.draw_clip_range(task.frames, task.action_preds,
bboxes, draw_range)
return task
def draw_clip_range(self, frames, preds, bboxes, draw_range):
"""Draw a range of frames with the same bboxes and predictions."""
        # no predictions to be drawn
if bboxes is None or len(bboxes) == 0:
return frames
# draw frames in `draw_range`
left_frames = frames[:draw_range[0]]
right_frames = frames[draw_range[1] + 1:]
draw_frames = frames[draw_range[0]:draw_range[1] + 1]
# get labels(texts) and draw predictions
draw_frames = [
self.draw_one_image(frame, bboxes, preds) for frame in draw_frames
]
return list(left_frames) + draw_frames + list(right_frames)
@abstractmethod
def draw_one_image(self, frame, bboxes, preds):
"""Draw bboxes and corresponding texts on one frame."""
@staticmethod
def abbrev(name):
"""Get the abbreviation of label name:
'take (an object) from (a person)' -> 'take ... from ...'
"""
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
class DefaultVisualizer(BaseVisualizer):
"""Tools to visualize predictions.
Args:
max_labels_per_bbox (int): Max number of labels to visualize for a
person box. Default: 5.
plate (str): The color plate used for visualization. Two recommended
plates are blue plate `03045e-023e8a-0077b6-0096c7-00b4d8-48cae4`
and green plate `004b23-006400-007200-008000-38b000-70e000`. These
plates are generated by https://coolors.co/.
Default: '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'.
text_fontface (int): Fontface from OpenCV for texts.
Default: cv2.FONT_HERSHEY_DUPLEX.
text_fontscale (float): Fontscale from OpenCV for texts.
Default: 0.5.
        text_fontcolor (tuple): Font color from OpenCV for texts.
Default: (255, 255, 255).
text_thickness (int): Thickness from OpenCV for texts.
Default: 1.
        text_linetype (int): Linetype from OpenCV for texts.
Default: 1.
"""
def __init__(
self,
max_labels_per_bbox=5,
plate='03045e-023e8a-0077b6-0096c7-00b4d8-48cae4',
text_fontface=cv2.FONT_HERSHEY_DUPLEX,
text_fontscale=0.5,
text_fontcolor=(255, 255, 255), # white
text_thickness=1,
text_linetype=1):
super().__init__(max_labels_per_bbox=max_labels_per_bbox)
self.text_fontface = text_fontface
self.text_fontscale = text_fontscale
self.text_fontcolor = text_fontcolor
self.text_thickness = text_thickness
self.text_linetype = text_linetype
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
plate = plate.split('-')
self.plate = [hex2color(h) for h in plate]
def draw_one_image(self, frame, bboxes, preds):
"""Draw predictions on one image."""
for bbox, pred in zip(bboxes, preds):
# draw bbox
box = bbox.astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
cv2.rectangle(frame, st, ed, (0, 0, 255), 2)
# draw texts
for k, (label, score) in enumerate(pred):
if k >= self.max_labels_per_bbox:
break
text = f'{self.abbrev(label)}: {score:.4f}'
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, self.text_fontface,
self.text_fontscale,
self.text_thickness)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, self.plate[k + 1], -1)
cv2.putText(frame, text, location, self.text_fontface,
self.text_fontscale, self.text_fontcolor,
self.text_thickness, self.text_linetype)
return frame
def main(args):
# init human detector
human_detector = MmdetHumanDetector(args.det_config, args.det_checkpoint,
args.device, args.det_score_thr)
# init action detector
config = Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
try:
# In our spatiotemporal detection demo, different actions should have
# the same number of bboxes.
config['model']['test_cfg']['rcnn']['action_thr'] = .0
except KeyError:
pass
stdet_predictor = StdetPredictor(
config=config,
checkpoint=args.checkpoint,
device=args.device,
score_thr=args.action_score_thr,
label_map_path=args.label_map)
# init clip helper
clip_helper = ClipHelper(
config=config,
display_height=args.display_height,
display_width=args.display_width,
input_video=args.input_video,
predict_stepsize=args.predict_stepsize,
output_fps=args.output_fps,
clip_vis_length=args.clip_vis_length,
out_filename=args.out_filename,
show=args.show)
# init visualizer
vis = DefaultVisualizer()
# start read and display thread
clip_helper.start()
try:
# Main thread main function contains:
# 1) get data from read queue
# 2) get human bboxes and stdet predictions
# 3) draw stdet predictions and update task
# 4) put task into display queue
for able_to_read, task in clip_helper:
# get data from read queue
if not able_to_read:
# read thread is dead and all tasks are processed
break
if task is None:
# when no data in read queue, wait
time.sleep(0.01)
continue
inference_start = time.time()
# get human bboxes
human_detector.predict(task)
# get stdet predictions
stdet_predictor.predict(task)
# draw stdet predictions in raw frames
vis.draw_predictions(task)
logger.info(f'Stdet Results: {task.action_preds}')
# add draw frames to display queue
clip_helper.display(task)
logger.debug('Main thread inference time '
f'{1000*(time.time() - inference_start):.0f} ms')
# wait for display thread
clip_helper.join()
except KeyboardInterrupt:
pass
finally:
# close read & display thread, release all resources
clip_helper.clean()
if __name__ == '__main__':
main(parse_args())
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# To fix GPG key error when running apt-get update
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 ffmpeg \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install MMCV
RUN pip install --no-cache-dir --upgrade pip wheel setuptools
RUN pip install --no-cache-dir mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
# Install MMAction2
RUN conda clean --all
RUN git clone https://github.com/open-mmlab/mmaction2.git /mmaction2
WORKDIR /mmaction2
RUN mkdir -p /mmaction2/data
ENV FORCE_CUDA="1"
RUN pip install cython --no-cache-dir
RUN pip install --no-cache-dir -e .
ARG PYTORCH="1.9.0"
ARG CUDA="10.2"
ARG CUDNN="7"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ARG MMCV="1.3.8"
ARG MMACTION="0.24.0"
ENV PYTHONUNBUFFERED TRUE
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
g++ \
openjdk-11-jre-headless \
# MMDET Requirements
    ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \
libsndfile1 libturbojpeg \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/opt/conda/bin:$PATH"
ENV FORCE_CUDA="1"
# TORCHSERVE
RUN pip install torchserve torch-model-archiver
# MMLAB
ARG PYTORCH
ARG CUDA
RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"]
# RUN pip install mmaction2==${MMACTION}
RUN pip install git+https://github.com/open-mmlab/mmaction2.git
RUN useradd -m model-server \
&& mkdir -p /home/model-server/tmp
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh \
&& chown -R model-server /home/model-server
COPY config.properties /home/model-server/config.properties
RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store
EXPOSE 8080 8081 8082
USER model-server
WORKDIR /home/model-server
ENV TEMP=/home/model-server/tmp
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
CMD ["serve"]
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
model_store=/home/model-server/model-store
load_models=all
#!/bin/bash
set -e
if [[ "$1" = "serve" ]]; then
shift 1
torchserve --start --ts-config /home/model-server/config.properties
else
eval "$@"
fi
# prevent docker exit
tail -f /dev/null
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.header-logo {
background-image: url("../images/mmaction2.png");
background-size: 130px 40px;
height: 40px;
width: 130px;
}
mmaction.apis
-------------
.. automodule:: mmaction.apis
:members:
mmaction.core
-------------
optimizer
^^^^^^^^^
.. automodule:: mmaction.core.optimizer
:members:
evaluation
^^^^^^^^^^
.. automodule:: mmaction.core.evaluation
:members:
scheduler
^^^^^^^^^
.. automodule:: mmaction.core.scheduler
:members:
mmaction.localization
---------------------
localization
^^^^^^^^^^^^
.. automodule:: mmaction.localization
:members:
mmaction.models
---------------
models
^^^^^^
.. automodule:: mmaction.models
:members:
recognizers
^^^^^^^^^^^
.. automodule:: mmaction.models.recognizers
:members:
localizers
^^^^^^^^^^
.. automodule:: mmaction.models.localizers
:members:
common
^^^^^^
.. automodule:: mmaction.models.common
:members:
backbones
^^^^^^^^^
.. automodule:: mmaction.models.backbones
:members:
heads
^^^^^
.. automodule:: mmaction.models.heads
:members:
necks
^^^^^
.. automodule:: mmaction.models.necks
:members:
losses
^^^^^^
.. automodule:: mmaction.models.losses
:members:
mmaction.datasets
-----------------
datasets
^^^^^^^^
.. automodule:: mmaction.datasets
:members:
pipelines
^^^^^^^^^
.. automodule:: mmaction.datasets.pipelines
:members:
samplers
^^^^^^^^
.. automodule:: mmaction.datasets.samplers
:members:
mmaction.utils
--------------
.. automodule:: mmaction.utils
:members:
# Benchmark
We compare our results with some popular frameworks and official releases in terms of speed.
## Settings
### Hardware
- 8 NVIDIA Tesla V100 (32G) GPUs
- Intel(R) Xeon(R) Gold 6146 CPU @ 3.20GHz
### Software Environment
- Python 3.7
- PyTorch 1.4
- CUDA 10.1
- CUDNN 7.6.03
- NCCL 2.4.08
### Metrics
The time we measure is the average training time per iteration, including data processing and model training.
The training speed is measured in s/iter; the lower, the better. Note that we skip the first 50 iterations, as they may include device warm-up time.
### Comparison Rules
Here we compare our MMAction2 repo with other video understanding toolboxes under the same data and model settings,
measured by the training time per iteration. Specifically, we use
- commit id [7f3490d](https://github.com/open-mmlab/mmaction/tree/7f3490d3db6a67fe7b87bfef238b757403b670e3)(1/5/2020) of MMAction
- commit id [8d53d6f](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd)(5/5/2020) of Temporal-Shift-Module
- commit id [8299c98](https://github.com/facebookresearch/SlowFast/tree/8299c9862f83a067fa7114ce98120ae1568a83ec)(7/7/2020) of PySlowFast
- commit id [f13707f](https://github.com/wzmsltw/BSN-boundary-sensitive-network/tree/f13707fbc362486e93178c39f9c4d398afe2cb2f)(12/12/2018) of BSN(boundary sensitive network)
- commit id [45d0514](https://github.com/JJBOY/BMN-Boundary-Matching-Network/tree/45d05146822b85ca672b65f3d030509583d0135a)(17/10/2019) of BMN(boundary matching network)
To ensure a fair comparison, all experiments were conducted on the same hardware and with the same dataset. The rawframe dataset we used is generated by the [data preparation tools](/tools/data/kinetics/README.md); the video dataset we used is a special resized video cache called '256p dense-encoded video', which decodes faster and is generated by the script [here](/tools/data/resize_videos.py). As shown in the table below, a significant improvement can be observed compared with normal 256p videos, especially when the sampling is sparse (like [TSN](/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py)).
For each model setting, we kept the same data preprocessing methods to ensure the same feature input.
In addition, we also used Memcached, a distributed caching system, to load the data, so that the IO time is the same across toolboxes, except for the comparisons with PySlowFast, which reads raw videos directly from disk by default.
We provide the training logs from which we calculate the average iteration time, with the actual settings logged inside. Feel free to verify them and file an issue if something does not make sense.
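For reference, the snippet below is a minimal sketch of how such an average can be computed. It assumes the log follows the default mmcv text-logger format, in which each training iteration prints a `time: <seconds>` field; the log path in the usage comment is a placeholder.

```python
import re


def average_iter_time(log_path, skip_iters=50):
    """Average the per-iteration time recorded in a training log.

    Assumes each training line contains a ``time: <seconds>`` field, as
    printed by the default mmcv text logger; the first ``skip_iters`` values
    are dropped to exclude device warm-up.
    """
    times = []
    with open(log_path) as f:
        for line in f:
            match = re.search(r'\btime: ([0-9.]+)', line)
            if match:
                times.append(float(match.group(1)))
    times = times[skip_iters:]
    return sum(times) / len(times) if times else float('nan')


# Hypothetical usage:
# print(average_iter_time('benchmark_tsn_rawframes.log'))
```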
## Main Results
### Recognizers
| Model | input | io backend | batch size x gpus | MMAction2 (s/iter) | GPU mem(GB) | MMAction (s/iter) | GPU mem(GB) | Temporal-Shift-Module (s/iter) | GPU mem(GB) | PySlowFast (s/iter) | GPU mem(GB) |
| :------------------------------------------------------------------------------------------ | :----------------------: | :--------: | :---------------: | :-------------------------------------------------------------------------------------------------------------------------: | :---------: | :------------------------------------------------------------------------------------------------------------------: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------: | :---------: | :--------------------------------------------------------------------------------------------------------------------: | :---------: |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p rawframes | Memcached | 32x8 | **[0.32](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_rawframes_memcahed_32x8.zip)** | 8.1 | [0.38](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction/tsn_256p_rawframes_memcached_32x8.zip) | 8.1 | [0.42](https://download.openmmlab.com/mmaction/benchmark/recognition/temporal_shift_module/tsn_256p_rawframes_memcached_32x8.zip) | 10.5 | x | x |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p videos | Disk | 32x8 | **[1.42](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_videos_disk_32x8.zip)** | 8.1 | x | x | x | x | TODO | TODO |
| [TSN](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 32x8 | **[0.61](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsn_256p_fast_videos_disk_32x8.zip)** | 8.1 | x | x | x | x | TODO | TODO |
| [I3D heavy](/configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.34](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_heavy_256p_videos_disk_8x8.zip)** | 4.6 | x | x | x | x | [0.44](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_i3d_r50_8x8_video.log) | 4.6 |
| [I3D heavy](/configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.35](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_heavy_256p_fast_videos_disk_8x8.zip)** | 4.6 | x | x | x | x | [0.36](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_i3d_r50_8x8_fast_video.log) | 4.6 |
| [I3D](/configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py) | 256p rawframes | Memcached | 8x8 | **[0.43](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/i3d_256p_rawframes_memcahed_8x8.zip)** | 5.0 | [0.56](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction/i3d_256p_rawframes_memcached_8x8.zip) | 5.0 | x | x | x | x |
| [TSM](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) | 256p rawframes | Memcached | 8x8 | **[0.31](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/tsm_256p_rawframes_memcahed_8x8.zip)** | 6.9 | x | x | [0.41](https://download.openmmlab.com/mmaction/benchmark/recognition/temporal_shift_module/tsm_256p_rawframes_memcached_8x8.zip) | 9.1 | x | x |
| [Slowonly](/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.32](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowonly_256p_videos_disk_8x8.zip)** | 3.1 | TODO | TODO | x | x | [0.34](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowonly_r50_4x16_video.log) | 3.4 |
| [Slowonly](/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.25](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowonly_256p_fast_videos_disk_8x8.zip)** | 3.1 | TODO | TODO | x | x | [0.28](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowonly_r50_4x16_fast_video.log) | 3.4 |
| [Slowfast](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.69](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowfast_256p_videos_disk_8x8.zip)** | 6.1 | x | x | x | x | [1.04](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowfast_r50_4x16_video.log) | 7.0 |
| [Slowfast](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.68](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/slowfast_256p_fast_videos_disk_8x8.zip)** | 6.1 | x | x | x | x | [0.96](https://download.openmmlab.com/mmaction/benchmark/recognition/pyslowfast/pysf_slowfast_r50_4x16_fast_video.log) | 7.0 |
| [R(2+1)D](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | 256p videos | Disk | 8x8 | **[0.45](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/r2plus1d_256p_videos_disk_8x8.zip)** | 5.1 | x | x | x | x | x | x |
| [R(2+1)D](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) | 256p dense-encoded video | Disk | 8x8 | **[0.44](https://download.openmmlab.com/mmaction/benchmark/recognition/mmaction2/r2plus1d_256p_fast_videos_disk_8x8.zip)** | 5.1 | x | x | x | x | x | x |
### Localizers
| Model | MMAction2 (s/iter) | BSN(boundary sensitive network) (s/iter) | BMN(boundary matching network) (s/iter) |
| :------------------------------------------------------------------------------------------------------------------ | :-----------------------: | :--------------------------------------: | :-------------------------------------: |
| BSN ([TEM + PEM + PGM](/configs/localization/bsn)) | **0.074(TEM)+0.040(PEM)** | 0.101(TEM)+0.040(PEM) | x |
| BMN ([bmn_400x100_2x8_9e_activitynet_feature](/configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py)) | **3.27** | x | 3.30 |
## Details of Comparison
### TSN
- **MMAction2**
```shell
# rawframes
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_tsn configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_tsn_rawframes
# videos
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_tsn configs/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_tsn_video
```
- **MMAction**
```shell
python -u tools/train_recognizer.py configs/TSN/tsn_kinetics400_2d_rgb_r50_seg3_f1s1.py
```
- **Temporal-Shift-Module**
```shell
python main.py kinetics RGB --arch resnet50 --num_segments 3 --gd 20 --lr 0.02 --wd 1e-4 --lr_steps 20 40 --epochs 1 --batch-size 256 -j 32 --dropout 0.5 --consensus_type=avg --eval-freq=10 --npb --print-freq 1
```
### I3D
- **MMAction2**
```shell
# rawframes
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_i3d configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_i3d_rawframes
# videos
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_i3d configs/recognition/i3d/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb.py --work-dir work_dirs/benchmark_i3d_video
```
- **MMAction**
```shell
python -u tools/train_recognizer.py configs/I3D_RGB/i3d_kinetics400_3d_rgb_r50_c3d_inflate3x1x1_seg1_f32s2.py
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/I3D_8x8_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_i3d_r50_8x8_video.log
```
You may reproduce the result by writing a simple script to parse out the value of the field 'time_diff'.
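  A minimal sketch of such a script is shown below. It assumes the PySlowFast log prints JSON-style stats lines containing a `"time_diff"` field, and uses the log file name from the command above; treat both as assumptions to adapt to your actual log.

  ```python
  import re


  def average_time_diff(log_path, skip_iters=50):
      """Average the 'time_diff' values found in a PySlowFast training log,
      skipping the first iterations to exclude device warm-up."""
      values = []
      with open(log_path) as f:
          for line in f:
              match = re.search(r'"time_diff": ([0-9.]+)', line)
              if match:
                  values.append(float(match.group(1)))
      values = values[skip_iters:]
      return sum(values) / len(values) if values else float('nan')


  # Hypothetical usage:
  # print(average_time_diff('pysf_i3d_r50_8x8_video.log'))
  ```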
### SlowFast
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_slowfast configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py --work-dir work_dirs/benchmark_slowfast_video
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/SLOWFAST_4x16_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_slowfast_r50_4x16_video.log
```
You can reproduce the reported speed by parsing out the value of the `time_diff` field from the log, as in the sketch shown for I3D above (using the corresponding log file).
### SlowOnly
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_slowonly configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py --work-dir work_dirs/benchmark_slowonly_video
```
- **PySlowFast**
```shell
python tools/run_net.py --cfg configs/Kinetics/SLOW_4x16_R50.yaml DATA.PATH_TO_DATA_DIR ${DATA_ROOT} NUM_GPUS 8 TRAIN.BATCH_SIZE 64 TRAIN.AUTO_RESUME False LOG_PERIOD 1 SOLVER.MAX_EPOCH 1 > pysf_slowonly_r50_4x16_video.log
```
You can reproduce the reported speed by parsing out the value of the `time_diff` field from the log, as in the sketch shown for I3D above (using the corresponding log file).
### R2plus1D
- **MMAction2**
```shell
bash tools/slurm_train.sh ${PARTITION_NAME} benchmark_r2plus1d configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py --work-dir work_dirs/benchmark_r2plus1d_video
```
## Changelog
### 0.24.0 (05/05/2022)
**Highlights**
- Support different seeds
**New Features**
- Add lateral norm in multigrid config ([#1567](https://github.com/open-mmlab/mmaction2/pull/1567))
- Add openpose 25 joints in graph config ([#1578](https://github.com/open-mmlab/mmaction2/pull/1578))
- Support MLU Backend ([#1608](https://github.com/open-mmlab/mmaction2/pull/1608))
**Bug and Typo Fixes**
- Fix local_rank ([#1558](https://github.com/open-mmlab/mmaction2/pull/1558))
- Fix install typo ([#1571](https://github.com/open-mmlab/mmaction2/pull/1571))
- Fix the inference API doc ([#1580](https://github.com/open-mmlab/mmaction2/pull/1580))
- Fix zh-CN demo.md and getting_started.md ([#1587](https://github.com/open-mmlab/mmaction2/pull/1587))
- Remove Recommonmark ([#1595](https://github.com/open-mmlab/mmaction2/pull/1595))
- Fix inference with ndarray ([#1603](https://github.com/open-mmlab/mmaction2/pull/1603))
- Fix the log error when `IterBasedRunner` is used ([#1606](https://github.com/open-mmlab/mmaction2/pull/1606))
### 0.23.0 (04/01/2022)
**Highlights**
- Support different seeds
- Provide multi-node training & testing script
- Update error log
**New Features**
- Support different seeds([#1502](https://github.com/open-mmlab/mmaction2/pull/1502))
- Provide multi-node training & testing script([#1521](https://github.com/open-mmlab/mmaction2/pull/1521))
- Update error log([#1546](https://github.com/open-mmlab/mmaction2/pull/1546))
**Documentations**
- Update gpus in Slowfast readme([#1497](https://github.com/open-mmlab/mmaction2/pull/1497))
- Fix work_dir in multigrid config([#1498](https://github.com/open-mmlab/mmaction2/pull/1498))
- Add sub bn docs([#1503](https://github.com/open-mmlab/mmaction2/pull/1503))
- Add shortcycle sampler docs([#1513](https://github.com/open-mmlab/mmaction2/pull/1513))
- Update Windows Declaration([#1520](https://github.com/open-mmlab/mmaction2/pull/1520))
- Update the link for ST-GCN([#1544](https://github.com/open-mmlab/mmaction2/pull/1544))
- Update install commands([#1549](https://github.com/open-mmlab/mmaction2/pull/1549))
**Bug and Typo Fixes**
- Update colab tutorial install cmds([#1522](https://github.com/open-mmlab/mmaction2/pull/1522))
- Fix num_iters_per_epoch in analyze_logs.py([#1530](https://github.com/open-mmlab/mmaction2/pull/1530))
- Fix distributed_sampler([#1532](https://github.com/open-mmlab/mmaction2/pull/1532))
- Fix cd dir error([#1545](https://github.com/open-mmlab/mmaction2/pull/1545))
- Update arg names([#1548](https://github.com/open-mmlab/mmaction2/pull/1548))
**ModelZoo**
### 0.22.0 (03/05/2022)
**Highlights**
- Support Multigrid training strategy
- Support CPU training
- Support audio demo
- Support topk customizing in models/heads/base.py
**New Features**
- Support Multigrid training strategy([#1378](https://github.com/open-mmlab/mmaction2/pull/1378))
- Support STGCN in demo_skeleton.py([#1391](https://github.com/open-mmlab/mmaction2/pull/1391))
- Support CPU training([#1407](https://github.com/open-mmlab/mmaction2/pull/1407))
- Support audio demo([#1425](https://github.com/open-mmlab/mmaction2/pull/1425))
- Support topk customizing in models/heads/base.py([#1452](https://github.com/open-mmlab/mmaction2/pull/1452))
**Documentations**
- Add OpenMMLab platform([#1393](https://github.com/open-mmlab/mmaction2/pull/1393))
- Update links([#1394](https://github.com/open-mmlab/mmaction2/pull/1394))
- Update readme in configs([#1404](https://github.com/open-mmlab/mmaction2/pull/1404))
- Update instructions to install mmcv-full([#1426](https://github.com/open-mmlab/mmaction2/pull/1426))
- Add shortcut([#1433](https://github.com/open-mmlab/mmaction2/pull/1433))
- Update modelzoo([#1439](https://github.com/open-mmlab/mmaction2/pull/1439))
- Add video_structuralize in readme([#1455](https://github.com/open-mmlab/mmaction2/pull/1455))
- Update OpenMMLab repo information([#1482](https://github.com/open-mmlab/mmaction2/pull/1482))
**Bug and Typo Fixes**
- Update train.py([#1375](https://github.com/open-mmlab/mmaction2/pull/1375))
- Fix printout bug([#1382](https://github.com/open-mmlab/mmaction2/pull/1382))
- Update multi processing setting([#1395](https://github.com/open-mmlab/mmaction2/pull/1395))
- Setup multi processing both in train and test([#1405](https://github.com/open-mmlab/mmaction2/pull/1405))
- Fix bug in nondistributed multi-gpu training([#1406](https://github.com/open-mmlab/mmaction2/pull/1406))
- Add variable fps in ava_dataset.py([#1409](https://github.com/open-mmlab/mmaction2/pull/1409))
- Only support distributed training([#1414](https://github.com/open-mmlab/mmaction2/pull/1414))
- Set test_mode for AVA configs([#1432](https://github.com/open-mmlab/mmaction2/pull/1432))
- Support single label([#1434](https://github.com/open-mmlab/mmaction2/pull/1434))
- Add check copyright([#1447](https://github.com/open-mmlab/mmaction2/pull/1447))
- Support Windows CI([#1448](https://github.com/open-mmlab/mmaction2/pull/1448))
- Fix wrong device of class_weight in models/losses/cross_entropy_loss.py([#1457](https://github.com/open-mmlab/mmaction2/pull/1457))
- Fix bug caused by distributed([#1459](https://github.com/open-mmlab/mmaction2/pull/1459))
- Update readme([#1460](https://github.com/open-mmlab/mmaction2/pull/1460))
- Fix lint caused by colab automatic upload([#1461](https://github.com/open-mmlab/mmaction2/pull/1461))
- Refine CI([#1471](https://github.com/open-mmlab/mmaction2/pull/1471))
- Update pre-commit([#1474](https://github.com/open-mmlab/mmaction2/pull/1474))
- Add deprecation message for deploy tool([#1483](https://github.com/open-mmlab/mmaction2/pull/1483))
**ModelZoo**
- Support slowfast_steplr([#1421](https://github.com/open-mmlab/mmaction2/pull/1421))
### 0.21.0 (31/12/2021)
**Highlights**
- Support 2s-AGCN
- Support publish models in Windows
- Improve some sthv1 related models
- Support BABEL
**New Features**
- Support 2s-AGCN([#1248](https://github.com/open-mmlab/mmaction2/pull/1248))
- Support skip postproc in ntu_pose_extraction([#1295](https://github.com/open-mmlab/mmaction2/pull/1295))
- Support publish models in Windows([#1325](https://github.com/open-mmlab/mmaction2/pull/1325))
- Add copyright checkhook in pre-commit-config([#1344](https://github.com/open-mmlab/mmaction2/pull/1344))
**Documentations**
- Add MMFlow ([#1273](https://github.com/open-mmlab/mmaction2/pull/1273))
- Revise README.md and add projects.md ([#1286](https://github.com/open-mmlab/mmaction2/pull/1286))
- Add 2s-AGCN in Updates([#1289](https://github.com/open-mmlab/mmaction2/pull/1289))
- Add MMFewShot([#1300](https://github.com/open-mmlab/mmaction2/pull/1300))
- Add MMHuman3d([#1304](https://github.com/open-mmlab/mmaction2/pull/1304))
- Update pre-commit([#1313](https://github.com/open-mmlab/mmaction2/pull/1313))
- Use share menu from the theme instead([#1328](https://github.com/open-mmlab/mmaction2/pull/1328))
- Update installation command([#1340](https://github.com/open-mmlab/mmaction2/pull/1340))
**Bug and Typo Fixes**
- Update the inference part in notebooks([#1256](https://github.com/open-mmlab/mmaction2/pull/1256))
- Update the map_location([#1262](https://github.com/open-mmlab/mmaction2/pull/1262))
- Fix bug that start_index is not used in RawFrameDecode([#1278](https://github.com/open-mmlab/mmaction2/pull/1278))
- Fix bug in init_random_seed([#1282](https://github.com/open-mmlab/mmaction2/pull/1282))
- Fix bug in setup.py([#1303](https://github.com/open-mmlab/mmaction2/pull/1303))
- Fix interrogate error in workflows([#1305](https://github.com/open-mmlab/mmaction2/pull/1305))
- Fix typo in slowfast config([#1309](https://github.com/open-mmlab/mmaction2/pull/1309))
- Cancel previous runs that are not completed([#1327](https://github.com/open-mmlab/mmaction2/pull/1327))
- Fix missing skip_postproc parameter([#1347](https://github.com/open-mmlab/mmaction2/pull/1347))
- Update ssn.py([#1355](https://github.com/open-mmlab/mmaction2/pull/1355))
- Use latest youtube-dl([#1357](https://github.com/open-mmlab/mmaction2/pull/1357))
- Fix test-best([#1362](https://github.com/open-mmlab/mmaction2/pull/1362))
**ModelZoo**
- Improve some sthv1 related models([#1306](https://github.com/open-mmlab/mmaction2/pull/1306))
- Support BABEL([#1332](https://github.com/open-mmlab/mmaction2/pull/1332))
### 0.20.0 (07/10/2021)
**Highlights**
- Support TorchServe
- Add video structuralize demo
- Support using 3D skeletons for skeleton-based action recognition
- Benchmark PoseC3D on UCF and HMDB
**New Features**
- Support TorchServe ([#1212](https://github.com/open-mmlab/mmaction2/pull/1212))
- Support 3D skeletons pre-processing ([#1218](https://github.com/open-mmlab/mmaction2/pull/1218))
- Support video structuralize demo ([#1197](https://github.com/open-mmlab/mmaction2/pull/1197))
**Documentations**
- Revise README.md and add projects.md ([#1214](https://github.com/open-mmlab/mmaction2/pull/1214))
- Add CN docs for Skeleton dataset, PoseC3D and ST-GCN ([#1228](https://github.com/open-mmlab/mmaction2/pull/1228), [#1237](https://github.com/open-mmlab/mmaction2/pull/1237), [#1236](https://github.com/open-mmlab/mmaction2/pull/1236))
- Add tutorial for custom dataset training for skeleton-based action recognition ([#1234](https://github.com/open-mmlab/mmaction2/pull/1234))
**Bug and Typo Fixes**
- Fix tutorial link ([#1219](https://github.com/open-mmlab/mmaction2/pull/1219))
- Fix GYM links ([#1224](https://github.com/open-mmlab/mmaction2/pull/1224))
**ModelZoo**
- Benchmark PoseC3D on UCF and HMDB ([#1223](https://github.com/open-mmlab/mmaction2/pull/1223))
- Add ST-GCN + 3D skeleton model for NTU60-XSub ([#1236](https://github.com/open-mmlab/mmaction2/pull/1236))
### 0.19.0 (07/10/2021)
**Highlights**
- Support ST-GCN
- Refactor the inference API
- Add code spell check hook
**New Features**
- Support ST-GCN ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123))
**Improvement**
- Add label maps for every dataset ([#1127](https://github.com/open-mmlab/mmaction2/pull/1127))
- Remove useless code MultiGroupCrop ([#1180](https://github.com/open-mmlab/mmaction2/pull/1180))
- Refactor Inference API ([#1191](https://github.com/open-mmlab/mmaction2/pull/1191))
- Add code spell check hook ([#1208](https://github.com/open-mmlab/mmaction2/pull/1208))
- Use docker in CI ([#1159](https://github.com/open-mmlab/mmaction2/pull/1159))
**Documentations**
- Update metafiles to new OpenMMLAB protocols ([#1134](https://github.com/open-mmlab/mmaction2/pull/1134))
- Switch to new doc style ([#1160](https://github.com/open-mmlab/mmaction2/pull/1160))
- Improve the ERROR message ([#1203](https://github.com/open-mmlab/mmaction2/pull/1203))
- Fix invalid URL in getting_started ([#1169](https://github.com/open-mmlab/mmaction2/pull/1169))
**Bug and Typo Fixes**
- Compatible with new MMClassification ([#1139](https://github.com/open-mmlab/mmaction2/pull/1139))
- Add missing runtime dependencies ([#1144](https://github.com/open-mmlab/mmaction2/pull/1144))
- Fix THUMOS tag proposals path ([#1156](https://github.com/open-mmlab/mmaction2/pull/1156))
- Fix LoadHVULabel ([#1194](https://github.com/open-mmlab/mmaction2/pull/1194))
- Switch the default value of `persistent_workers` to False ([#1202](https://github.com/open-mmlab/mmaction2/pull/1202))
- Fix `_freeze_stages` for MobileNetV2 ([#1193](https://github.com/open-mmlab/mmaction2/pull/1193))
- Fix resume when building rawframes ([#1150](https://github.com/open-mmlab/mmaction2/pull/1150))
- Fix device bug for class weight ([#1188](https://github.com/open-mmlab/mmaction2/pull/1188))
- Correct Arg names in extract_audio.py ([#1148](https://github.com/open-mmlab/mmaction2/pull/1148))
**ModelZoo**
- Add TSM-MobileNetV2 ported from TSM ([#1163](https://github.com/open-mmlab/mmaction2/pull/1163))
- Add ST-GCN for NTURGB+D-XSub-60 ([#1123](https://github.com/open-mmlab/mmaction2/pull/1123))
### 0.18.0 (02/09/2021)
**Improvement**
- Add CopyRight ([#1099](https://github.com/open-mmlab/mmaction2/pull/1099))
- Support NTU Pose Extraction ([#1076](https://github.com/open-mmlab/mmaction2/pull/1076))
- Support Caching in RawFrameDecode ([#1078](https://github.com/open-mmlab/mmaction2/pull/1078))
- Add citations & Support python3.9 CI & Use fixed-version sphinx ([#1125](https://github.com/open-mmlab/mmaction2/pull/1125))
**Documentations**
- Add Descriptions of PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053))
**Bug and Typo Fixes**
- Fix SSV2 checkpoints ([#1101](https://github.com/open-mmlab/mmaction2/pull/1101))
- Fix CSN normalization ([#1116](https://github.com/open-mmlab/mmaction2/pull/1116))
- Fix typo ([#1121](https://github.com/open-mmlab/mmaction2/pull/1121))
- Fix new_crop_quadruple bug ([#1108](https://github.com/open-mmlab/mmaction2/pull/1108))
### 0.17.0 (03/08/2021)
**Highlights**
- Support PyTorch 1.9
- Support Pytorchvideo Transforms
- Support PreciseBN
**New Features**
- Support Pytorchvideo Transforms ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008))
- Support PreciseBN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038))
**Improvements**
- Remove redundant augmentations in config files ([#996](https://github.com/open-mmlab/mmaction2/pull/996))
- Make resource directory to hold common resource pictures ([#1011](https://github.com/open-mmlab/mmaction2/pull/1011))
- Remove deprecated FrameSelector ([#1010](https://github.com/open-mmlab/mmaction2/pull/1010))
- Support Concat Dataset ([#1000](https://github.com/open-mmlab/mmaction2/pull/1000))
- Add `to-mp4` option to resize_videos.py ([#1021](https://github.com/open-mmlab/mmaction2/pull/1021))
- Add option to keep tail frames ([#1050](https://github.com/open-mmlab/mmaction2/pull/1050))
- Update MIM support ([#1061](https://github.com/open-mmlab/mmaction2/pull/1061))
- Calculate Top-K accurate and inaccurate classes ([#1047](https://github.com/open-mmlab/mmaction2/pull/1047))
**Bug and Typo Fixes**
- Fix bug in PoseC3D demo ([#1009](https://github.com/open-mmlab/mmaction2/pull/1009))
- Fix some problems in resize_videos.py ([#1012](https://github.com/open-mmlab/mmaction2/pull/1012))
- Support torch1.9 ([#1015](https://github.com/open-mmlab/mmaction2/pull/1015))
- Remove redundant code in CI ([#1046](https://github.com/open-mmlab/mmaction2/pull/1046))
- Fix bug about persistent_workers ([#1044](https://github.com/open-mmlab/mmaction2/pull/1044))
- Support TimeSformer feature extraction ([#1035](https://github.com/open-mmlab/mmaction2/pull/1035))
- Fix ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025))
**ModelZoo**
- Add TSM-R50 sthv1 models trained by PytorchVideo RandAugment and AugMix ([#1008](https://github.com/open-mmlab/mmaction2/pull/1008))
- Update SlowOnly SthV1 checkpoints ([#1034](https://github.com/open-mmlab/mmaction2/pull/1034))
- Add SlowOnly Kinetics400 checkpoints trained with Precise-BN ([#1038](https://github.com/open-mmlab/mmaction2/pull/1038))
- Add CSN-R50 from scratch checkpoints ([#1045](https://github.com/open-mmlab/mmaction2/pull/1045))
- TPN Kinetics-400 Checkpoints trained with the new ColorJitter ([#1025](https://github.com/open-mmlab/mmaction2/pull/1025))
**Documentation**
- Add Chinese translation of feature_extraction.md ([#1020](https://github.com/open-mmlab/mmaction2/pull/1020))
- Fix the code snippet in getting_started.md ([#1023](https://github.com/open-mmlab/mmaction2/pull/1023))
- Fix TANet config table ([#1028](https://github.com/open-mmlab/mmaction2/pull/1028))
- Add description to PoseC3D dataset ([#1053](https://github.com/open-mmlab/mmaction2/pull/1053))
### 0.16.0 (01/07/2021)
**Highlights**
- Support using backbone from pytorch-image-models(timm)
- Support PIMS Decoder
- Demo for skeleton-based action recognition
- Support Timesformer
**New Features**
- Support using backbones from pytorch-image-models(timm) for TSN ([#880](https://github.com/open-mmlab/mmaction2/pull/880))
- Support torchvision transformations in preprocessing pipelines ([#972](https://github.com/open-mmlab/mmaction2/pull/972))
- Demo for skeleton-based action recognition ([#972](https://github.com/open-mmlab/mmaction2/pull/972))
- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839))
**Improvements**
- Add a tool to find invalid videos ([#907](https://github.com/open-mmlab/mmaction2/pull/907), [#950](https://github.com/open-mmlab/mmaction2/pull/950))
- Add an option to specify spectrogram_type ([#909](https://github.com/open-mmlab/mmaction2/pull/909))
- Add json output to video demo ([#906](https://github.com/open-mmlab/mmaction2/pull/906))
- Add MIM related docs ([#918](https://github.com/open-mmlab/mmaction2/pull/918))
- Rename lr to scheduler ([#916](https://github.com/open-mmlab/mmaction2/pull/916))
- Support `--cfg-options` for demos ([#911](https://github.com/open-mmlab/mmaction2/pull/911))
- Support number counting for flow-wise filename template ([#922](https://github.com/open-mmlab/mmaction2/pull/922))
- Add Chinese tutorial ([#941](https://github.com/open-mmlab/mmaction2/pull/941))
- Change ResNet3D default values ([#939](https://github.com/open-mmlab/mmaction2/pull/939))
- Adjust script structure ([#935](https://github.com/open-mmlab/mmaction2/pull/935))
- Add font color to args in long_video_demo ([#947](https://github.com/open-mmlab/mmaction2/pull/947))
- Polish code style with Pylint ([#908](https://github.com/open-mmlab/mmaction2/pull/908))
- Support PIMS Decoder ([#946](https://github.com/open-mmlab/mmaction2/pull/946))
- Improve Metafiles ([#956](https://github.com/open-mmlab/mmaction2/pull/956), [#979](https://github.com/open-mmlab/mmaction2/pull/979), [#966](https://github.com/open-mmlab/mmaction2/pull/966))
- Add links to download Kinetics400 validation ([#920](https://github.com/open-mmlab/mmaction2/pull/920))
- Audit the usage of shutil.rmtree ([#943](https://github.com/open-mmlab/mmaction2/pull/943))
- Polish localizer related codes([#913](https://github.com/open-mmlab/mmaction2/pull/913))
**Bug and Typo Fixes**
- Fix spatiotemporal detection demo ([#899](https://github.com/open-mmlab/mmaction2/pull/899))
- Fix docstring for 3D inflate ([#925](https://github.com/open-mmlab/mmaction2/pull/925))
- Fix bug of writing text to video with TextClip ([#952](https://github.com/open-mmlab/mmaction2/pull/952))
- Fix mmcv install in CI ([#977](https://github.com/open-mmlab/mmaction2/pull/977))
**ModelZoo**
- Add TSN with Swin Transformer backbone as an example for using pytorch-image-models(timm) backbones ([#880](https://github.com/open-mmlab/mmaction2/pull/880))
- Port CSN checkpoints from VMZ ([#945](https://github.com/open-mmlab/mmaction2/pull/945))
- Release various checkpoints for UCF101, HMDB51 and Sthv1 ([#938](https://github.com/open-mmlab/mmaction2/pull/938))
- Support Timesformer ([#839](https://github.com/open-mmlab/mmaction2/pull/839))
- Update TSM modelzoo ([#981](https://github.com/open-mmlab/mmaction2/pull/981))
### 0.15.0 (31/05/2021)
**Highlights**
- Support PoseC3D
- Support ACRN
- Support MIM
**New Features**
- Support PoseC3D ([#786](https://github.com/open-mmlab/mmaction2/pull/786), [#890](https://github.com/open-mmlab/mmaction2/pull/890))
- Support MIM ([#870](https://github.com/open-mmlab/mmaction2/pull/870))
- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891))
- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864))
**Improvements**
- Add `metric_options` for evaluation to docs ([#873](https://github.com/open-mmlab/mmaction2/pull/873))
- Support creating a new label map based on custom classes for demos about spatio temporal demo ([#879](https://github.com/open-mmlab/mmaction2/pull/879))
- Improve document about AVA dataset preparation ([#878](https://github.com/open-mmlab/mmaction2/pull/878))
- Provide a script to extract clip-level feature ([#856](https://github.com/open-mmlab/mmaction2/pull/856))
**Bug and Typo Fixes**
- Fix issues about resume ([#877](https://github.com/open-mmlab/mmaction2/pull/877), [#878](https://github.com/open-mmlab/mmaction2/pull/878))
- Correct the key name of `eval_results` dictionary for metric 'mmit_mean_average_precision' ([#885](https://github.com/open-mmlab/mmaction2/pull/885))
**ModelZoo**
- Support Jester dataset ([#864](https://github.com/open-mmlab/mmaction2/pull/864))
- Support ACRN and Focal Loss ([#891](https://github.com/open-mmlab/mmaction2/pull/891))
### 0.14.0 (30/04/2021)
**Highlights**
- Support TRN
- Support Diving48
**New Features**
- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755))
- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835))
- Support Webcam Demo for Spatio-temporal Action Detection Models ([#795](https://github.com/open-mmlab/mmaction2/pull/795))
**Improvements**
- Add softmax option for pytorch2onnx tool ([#781](https://github.com/open-mmlab/mmaction2/pull/781))
- Support TRN ([#755](https://github.com/open-mmlab/mmaction2/pull/755))
- Test with onnx models and TensorRT engines ([#758](https://github.com/open-mmlab/mmaction2/pull/758))
- Speed up AVA Testing ([#784](https://github.com/open-mmlab/mmaction2/pull/784))
- Add `self.with_neck` attribute ([#796](https://github.com/open-mmlab/mmaction2/pull/796))
- Update installation document ([#798](https://github.com/open-mmlab/mmaction2/pull/798))
- Use a random master port ([#809](https://github.com/open-mmlab/mmaction2/pull/809))
- Update AVA processing data document ([#801](https://github.com/open-mmlab/mmaction2/pull/801))
- Refactor spatio-temporal augmentation ([#782](https://github.com/open-mmlab/mmaction2/pull/782))
- Add QR code in CN README ([#812](https://github.com/open-mmlab/mmaction2/pull/812))
- Add Alternative way to download Kinetics ([#817](https://github.com/open-mmlab/mmaction2/pull/817), [#822](https://github.com/open-mmlab/mmaction2/pull/822))
- Refactor Sampler ([#790](https://github.com/open-mmlab/mmaction2/pull/790))
- Use EvalHook in MMCV with backward compatibility ([#793](https://github.com/open-mmlab/mmaction2/pull/793))
- Use MMCV Model Registry ([#843](https://github.com/open-mmlab/mmaction2/pull/843))
**Bug and Typo Fixes**
- Fix a bug in pytorch2onnx.py when `num_classes <= 4` ([#800](https://github.com/open-mmlab/mmaction2/pull/800), [#824](https://github.com/open-mmlab/mmaction2/pull/824))
- Fix `demo_spatiotemporal_det.py` error ([#803](https://github.com/open-mmlab/mmaction2/pull/803), [#805](https://github.com/open-mmlab/mmaction2/pull/805))
- Fix loading config bugs when resume ([#820](https://github.com/open-mmlab/mmaction2/pull/820))
- Make HMDB51 annotation generation more robust ([#811](https://github.com/open-mmlab/mmaction2/pull/811))
**ModelZoo**
- Update checkpoint for 256 height in something-V2 ([#789](https://github.com/open-mmlab/mmaction2/pull/789))
- Support Diving48 ([#835](https://github.com/open-mmlab/mmaction2/pull/835))
### 0.13.0 (31/03/2021)
**Highlights**
- Support LFB
- Support using backbone from MMCls/TorchVision
- Add Chinese documentation
**New Features**
- Support LFB ([#553](https://github.com/open-mmlab/mmaction2/pull/553))
- Support using backbones from MMCls for TSN ([#679](https://github.com/open-mmlab/mmaction2/pull/679))
- Support using backbones from TorchVision for TSN ([#720](https://github.com/open-mmlab/mmaction2/pull/720))
- Support Mixup and Cutmix for recognizers ([#681](https://github.com/open-mmlab/mmaction2/pull/681))
- Support Chinese documentation ([#665](https://github.com/open-mmlab/mmaction2/pull/665), [#680](https://github.com/open-mmlab/mmaction2/pull/680), [#689](https://github.com/open-mmlab/mmaction2/pull/689), [#701](https://github.com/open-mmlab/mmaction2/pull/701), [#702](https://github.com/open-mmlab/mmaction2/pull/702), [#703](https://github.com/open-mmlab/mmaction2/pull/703), [#706](https://github.com/open-mmlab/mmaction2/pull/706), [#716](https://github.com/open-mmlab/mmaction2/pull/716), [#717](https://github.com/open-mmlab/mmaction2/pull/717), [#731](https://github.com/open-mmlab/mmaction2/pull/731), [#733](https://github.com/open-mmlab/mmaction2/pull/733), [#735](https://github.com/open-mmlab/mmaction2/pull/735), [#736](https://github.com/open-mmlab/mmaction2/pull/736), [#737](https://github.com/open-mmlab/mmaction2/pull/737), [#738](https://github.com/open-mmlab/mmaction2/pull/738), [#739](https://github.com/open-mmlab/mmaction2/pull/739), [#740](https://github.com/open-mmlab/mmaction2/pull/740), [#742](https://github.com/open-mmlab/mmaction2/pull/742), [#752](https://github.com/open-mmlab/mmaction2/pull/752), [#759](https://github.com/open-mmlab/mmaction2/pull/759), [#761](https://github.com/open-mmlab/mmaction2/pull/761), [#772](https://github.com/open-mmlab/mmaction2/pull/772), [#775](https://github.com/open-mmlab/mmaction2/pull/775))
**Improvements**
- Add slowfast config/json/log/ckpt for training custom classes of AVA ([#678](https://github.com/open-mmlab/mmaction2/pull/678))
- Set RandAugment as Imgaug default transforms ([#585](https://github.com/open-mmlab/mmaction2/pull/585))
- Add `--test-last` & `--test-best` for `tools/train.py` to test checkpoints after training ([#608](https://github.com/open-mmlab/mmaction2/pull/608))
- Add fcn_testing in TPN ([#684](https://github.com/open-mmlab/mmaction2/pull/684))
- Remove redundant recall functions ([#741](https://github.com/open-mmlab/mmaction2/pull/741))
- Recursively remove pretrained step for testing ([#695](https://github.com/open-mmlab/mmaction2/pull/695))
- Improve demo by limiting inference fps ([#668](https://github.com/open-mmlab/mmaction2/pull/668))
**Bug and Typo Fixes**
- Fix a bug about multi-class in VideoDataset ([#723](https://github.com/open-mmlab/mmaction2/pull/723))
- Reverse key-value in anet filelist generation ([#686](https://github.com/open-mmlab/mmaction2/pull/686))
- Fix flow norm cfg typo ([#693](https://github.com/open-mmlab/mmaction2/pull/693))
**ModelZoo**
- Add LFB for AVA2.1 ([#553](https://github.com/open-mmlab/mmaction2/pull/553))
- Add TSN with ResNeXt-101-32x4d backbone as an example for using MMCls backbones ([#679](https://github.com/open-mmlab/mmaction2/pull/679))
- Add TSN with Densenet161 backbone as an example for using TorchVision backbones ([#720](https://github.com/open-mmlab/mmaction2/pull/720))
- Add slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb ([#690](https://github.com/open-mmlab/mmaction2/pull/690))
- Add slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb ([#704](https://github.com/open-mmlab/mmaction2/pull/704))
- Add slowonly_nl_kinetics_pretrained_r50_4x16x1(8x8x1)\_20e_ava_rgb ([#730](https://github.com/open-mmlab/mmaction2/pull/730))
### 0.12.0 (28/02/2021)
**Highlights**
- Support TSM-MobileNetV2
- Support TANet
- Support GPU Normalize
**New Features**
- Support TSM-MobileNetV2 ([#415](https://github.com/open-mmlab/mmaction2/pull/415))
- Support flip with label mapping ([#591](https://github.com/open-mmlab/mmaction2/pull/591))
- Add seed option for sampler ([#642](https://github.com/open-mmlab/mmaction2/pull/642))
- Support GPU Normalize ([#586](https://github.com/open-mmlab/mmaction2/pull/586))
- Support TANet ([#595](https://github.com/open-mmlab/mmaction2/pull/595))
**Improvements**
- Training custom classes of ava dataset ([#555](https://github.com/open-mmlab/mmaction2/pull/555))
- Add CN README in homepage ([#592](https://github.com/open-mmlab/mmaction2/pull/592), [#594](https://github.com/open-mmlab/mmaction2/pull/594))
- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625))
- Refactor config: Specify `train_cfg` and `test_cfg` in `model` ([#629](https://github.com/open-mmlab/mmaction2/pull/629))
- Provide an alternative way to download older kinetics annotations ([#597](https://github.com/open-mmlab/mmaction2/pull/597))
- Update FAQ for
- 1). data pipeline about video and frames ([#598](https://github.com/open-mmlab/mmaction2/pull/598))
- 2). how to show results ([#598](https://github.com/open-mmlab/mmaction2/pull/598))
- 3). batch size setting for batchnorm ([#657](https://github.com/open-mmlab/mmaction2/pull/657))
- 4). how to fix stages of backbone when finetuning models ([#658](https://github.com/open-mmlab/mmaction2/pull/658))
- Modify default value of `save_best` ([#600](https://github.com/open-mmlab/mmaction2/pull/600))
- Use BibTex rather than latex in markdown ([#607](https://github.com/open-mmlab/mmaction2/pull/607))
- Add warnings of uninstalling mmdet and supplementary documents ([#624](https://github.com/open-mmlab/mmaction2/pull/624))
- Support soft label for CrossEntropyLoss ([#625](https://github.com/open-mmlab/mmaction2/pull/625))
**Bug and Typo Fixes**
- Fix value of `pem_low_temporal_iou_threshold` in BSN ([#556](https://github.com/open-mmlab/mmaction2/pull/556))
- Fix ActivityNet download script ([#601](https://github.com/open-mmlab/mmaction2/pull/601))
**ModelZoo**
- Add TSM-MobileNetV2 for Kinetics400 ([#415](https://github.com/open-mmlab/mmaction2/pull/415))
- Add deeper SlowFast models ([#605](https://github.com/open-mmlab/mmaction2/pull/605))
### 0.11.0 (31/01/2021)
**Highlights**
- Support imgaug
- Support spatial temporal demo
- Refactor EvalHook, config structure, unittest structure
**New Features**
- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492))
- Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511))
- Add spatial temporal demo ([#547](https://github.com/open-mmlab/mmaction2/pull/547), [#566](https://github.com/open-mmlab/mmaction2/pull/566))
**Improvements**
- Refactor EvalHook ([#395](https://github.com/open-mmlab/mmaction2/pull/395))
- Refactor AVA hook ([#567](https://github.com/open-mmlab/mmaction2/pull/567))
- Add repo citation ([#545](https://github.com/open-mmlab/mmaction2/pull/545))
- Add dataset size of Kinetics400 ([#503](https://github.com/open-mmlab/mmaction2/pull/503))
- Add lazy operation docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504))
- Add class_weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509))
- Add some explanation about the resampling in slowfast ([#502](https://github.com/open-mmlab/mmaction2/pull/502))
- Modify paper title in README.md ([#512](https://github.com/open-mmlab/mmaction2/pull/512))
- Add alternative ways to download Kinetics ([#521](https://github.com/open-mmlab/mmaction2/pull/521))
- Add OpenMMLab projects link in README ([#530](https://github.com/open-mmlab/mmaction2/pull/530))
- Change the default preprocessing to resize the short edge to 256 ([#538](https://github.com/open-mmlab/mmaction2/pull/538))
- Add config tag in dataset README ([#540](https://github.com/open-mmlab/mmaction2/pull/540))
- Add solution for markdownlint installation issue ([#497](https://github.com/open-mmlab/mmaction2/pull/497))
- Add dataset overview in readthedocs ([#548](https://github.com/open-mmlab/mmaction2/pull/548))
- Modify the trigger mode of the warnings of missing mmdet ([#583](https://github.com/open-mmlab/mmaction2/pull/583))
- Refactor config structure ([#488](https://github.com/open-mmlab/mmaction2/pull/488), [#572](https://github.com/open-mmlab/mmaction2/pull/572))
- Refactor unittest structure ([#433](https://github.com/open-mmlab/mmaction2/pull/433))
**Bug and Typo Fixes**
- Fix a bug about ava dataset validation ([#527](https://github.com/open-mmlab/mmaction2/pull/527))
- Fix a bug about ResNet pretrain weight initialization ([#582](https://github.com/open-mmlab/mmaction2/pull/582))
- Fix a bug in CI due to MMCV index ([#495](https://github.com/open-mmlab/mmaction2/pull/495))
- Remove invalid links of MiT and MMiT ([#516](https://github.com/open-mmlab/mmaction2/pull/516))
- Fix frame rate bug for AVA preparation ([#576](https://github.com/open-mmlab/mmaction2/pull/576))
**ModelZoo**
### 0.10.0 (31/12/2020)
**Highlights**
- Support Spatio-Temporal Action Detection (AVA)
- Support precise BN
**New Features**
- Support precise BN ([#501](https://github.com/open-mmlab/mmaction2/pull/501/))
- Support Spatio-Temporal Action Detection (AVA) ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
- Support to return feature maps in `inference_recognizer` ([#458](https://github.com/open-mmlab/mmaction2/pull/458))
**Improvements**
- Add arg `stride` to long_video_demo.py, to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468))
- Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
- Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454))
- Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/225))
- Speed up confusion matrix calculation ([#465](https://github.com/open-mmlab/mmaction2/pull/465))
- Use title case in modelzoo statistics ([#456](https://github.com/open-mmlab/mmaction2/pull/456))
- Add FAQ documents for easy troubleshooting. ([#413](https://github.com/open-mmlab/mmaction2/pull/413), [#420](https://github.com/open-mmlab/mmaction2/pull/420), [#439](https://github.com/open-mmlab/mmaction2/pull/439))
- Support Spatio-Temporal Action Detection with context ([#471](https://github.com/open-mmlab/mmaction2/pull/471))
- Add class weight for CrossEntropyLoss and BCELossWithLogits ([#509](https://github.com/open-mmlab/mmaction2/pull/509))
- Add Lazy OPs docs ([#504](https://github.com/open-mmlab/mmaction2/pull/504))
**Bug and Typo Fixes**
- Fix typo in default argument of BaseHead ([#446](https://github.com/open-mmlab/mmaction2/pull/446))
- Fix potential bug about `output_config` overwrite ([#463](https://github.com/open-mmlab/mmaction2/pull/463))
**ModelZoo**
- Add SlowOnly, SlowFast for AVA2.1 ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
### 0.9.0 (30/11/2020)
**Highlights**
- Support GradCAM utils for recognizers
- Support ResNet Audio model
**New Features**
- Automatically add modelzoo statistics to readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327))
- Support GYM99 ([#331](https://github.com/open-mmlab/mmaction2/pull/331), [#336](https://github.com/open-mmlab/mmaction2/pull/336))
- Add AudioOnly Pathway from AVSlowFast. ([#355](https://github.com/open-mmlab/mmaction2/pull/355))
- Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324))
- Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345))
- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291))
**Improvements**
- Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312))
- Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274))
- Update docs about test crops ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
- Polish code format using pylint manually ([#338](https://github.com/open-mmlab/mmaction2/pull/338))
- Update unittest coverage ([#358](https://github.com/open-mmlab/mmaction2/pull/358), [#322](https://github.com/open-mmlab/mmaction2/pull/322), [#325](https://github.com/open-mmlab/mmaction2/pull/325))
- Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323))
- Update colab tutorial ([#367](https://github.com/open-mmlab/mmaction2/pull/367))
- Set default batch_size of evaluation and testing to 1 ([#250](https://github.com/open-mmlab/mmaction2/pull/250))
- Rename the preparation docs to `README.md` ([#388](https://github.com/open-mmlab/mmaction2/pull/388))
- Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329))
- Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310))
- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
**Bug and Typo Fixes**
- Fix rename Kinetics classnames bug ([#384](https://github.com/open-mmlab/mmaction2/pull/384))
- Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314))
- Fix a bug about `tmp_folder` in `OpenCVInit` ([#357](https://github.com/open-mmlab/mmaction2/pull/357))
- Fix `get_thread_id` when not using disk as backend ([#354](https://github.com/open-mmlab/mmaction2/pull/354), [#357](https://github.com/open-mmlab/mmaction2/pull/357))
- Fix the bug of HVU object `num_classes` from 1679 to 1678 ([#307](https://github.com/open-mmlab/mmaction2/pull/307))
- Fix typo in `export_model.md` ([#399](https://github.com/open-mmlab/mmaction2/pull/399))
- Fix OmniSource training configs ([#321](https://github.com/open-mmlab/mmaction2/pull/321))
- Fix Issue #306: Bug of SampleAVAFrames ([#317](https://github.com/open-mmlab/mmaction2/pull/317))
**ModelZoo**
- Add SlowOnly model for GYM99, both RGB and Flow ([#336](https://github.com/open-mmlab/mmaction2/pull/336))
- Add auto modelzoo statistics in readthedocs ([#327](https://github.com/open-mmlab/mmaction2/pull/327))
- Add TSN for HMDB51 pretrained on Kinetics400, Moments in Time and ImageNet. ([#372](https://github.com/open-mmlab/mmaction2/pull/372))
### v0.8.0 (31/10/2020)
**Highlights**
- Support [OmniSource](https://arxiv.org/abs/2003.13042)
- Support C3D
- Support video recognition with audio modality
- Support HVU
- Support X3D
**New Features**
- Support AVA dataset preparation ([#266](https://github.com/open-mmlab/mmaction2/pull/266))
- Support the training of video recognition dataset with multiple tag categories ([#235](https://github.com/open-mmlab/mmaction2/pull/235))
- Support joint training with multiple training datasets of multiple formats, including images, untrimmed videos, etc. ([#242](https://github.com/open-mmlab/mmaction2/pull/242))
- Support to specify a start epoch to conduct evaluation ([#216](https://github.com/open-mmlab/mmaction2/pull/216))
- Implement X3D models, support testing with model weights converted from SlowFast ([#288](https://github.com/open-mmlab/mmaction2/pull/288))
**Improvements**
- Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232))
- Extend HVU datatools to generate individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258))
- Support data preparation for Kinetics-600 and Kinetics-700 ([#254](https://github.com/open-mmlab/mmaction2/pull/254))
- Use `metric_dict` to replace hardcoded arguments in `evaluate` function ([#286](https://github.com/open-mmlab/mmaction2/pull/286))
- Add `cfg-options` in arguments to override some settings in the used config for convenience ([#212](https://github.com/open-mmlab/mmaction2/pull/212))
- Rename the old evaluating protocol `mean_average_precision` as `mmit_mean_average_precision` since it is only used on MMIT and is not the `mAP` we usually talk about. Add `mean_average_precision`, which is the real `mAP` ([#235](https://github.com/open-mmlab/mmaction2/pull/235))
- Add accurate setting (Three crop * 2 clip) and report corresponding performance for TSM model ([#241](https://github.com/open-mmlab/mmaction2/pull/241))
- Add citations in each preparing_dataset.md in `tools/data/dataset` ([#289](https://github.com/open-mmlab/mmaction2/pull/289))
- Update the performance of audio-visual fusion on Kinetics-400 ([#281](https://github.com/open-mmlab/mmaction2/pull/281))
- Support data preparation of OmniSource web datasets, including GoogleImage, InsImage, InsVideo and KineticsRawVideo ([#294](https://github.com/open-mmlab/mmaction2/pull/294))
- Use `metric_options` dict to provide metric args in `evaluate` ([#286](https://github.com/open-mmlab/mmaction2/pull/286))
**Bug Fixes**
- Register `FrameSelector` in `PIPELINES` ([#268](https://github.com/open-mmlab/mmaction2/pull/268))
- Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))
- Fix multi-node dist test ([#292](https://github.com/open-mmlab/mmaction2/pull/292))
- Fix the data preparation bug for `something-something` dataset ([#278](https://github.com/open-mmlab/mmaction2/pull/278))
- Fix the invalid config url in slowonly README data benchmark ([#249](https://github.com/open-mmlab/mmaction2/pull/249))
- Validate that the performance of models trained with videos has no significant difference compared to that of models trained with rawframes ([#256](https://github.com/open-mmlab/mmaction2/pull/256))
- Correct the `img_norm_cfg` used by TSN-3seg-R50 UCF-101 model, improve the Top-1 accuracy by 3% ([#273](https://github.com/open-mmlab/mmaction2/pull/273))
**ModelZoo**
- Add Baselines for Kinetics-600 and Kinetics-700, including TSN-R50-8seg and SlowOnly-R50-8x8 ([#259](https://github.com/open-mmlab/mmaction2/pull/259))
- Add OmniSource benchmark on MiniKineitcs ([#296](https://github.com/open-mmlab/mmaction2/pull/296))
- Add Baselines for HVU, including TSN-R18-8seg on 6 tag categories of HVU ([#287](https://github.com/open-mmlab/mmaction2/pull/287))
- Add X3D models ported from [SlowFast](https://github.com/facebookresearch/SlowFast/) ([#288](https://github.com/open-mmlab/mmaction2/pull/288))
### v0.7.0 (30/9/2020)
**Highlights**
- Support TPN
- Support JHMDB, UCF101-24, HVU dataset preparation
- Support onnx model conversion
**New Features**
- Support the data pre-processing pipeline for the HVU Dataset ([#277](https://github.com/open-mmlab/mmaction2/pull/227/))
- Support real-time action recognition from web camera ([#171](https://github.com/open-mmlab/mmaction2/pull/171))
- Support onnx ([#160](https://github.com/open-mmlab/mmaction2/pull/160))
- Support UCF101-24 preparation ([#219](https://github.com/open-mmlab/mmaction2/pull/219))
- Support evaluating mAP for ActivityNet with [CUHK17_activitynet_pred](http://activity-net.org/challenges/2017/evaluation.html) ([#176](https://github.com/open-mmlab/mmaction2/pull/176))
- Add the data pipeline for ActivityNet, including downloading videos, extracting RGB and Flow frames, finetuning TSN and extracting feature ([#190](https://github.com/open-mmlab/mmaction2/pull/190))
- Support JHMDB preparation ([#220](https://github.com/open-mmlab/mmaction2/pull/220))
**ModelZoo**
- Add finetuning setting for SlowOnly ([#173](https://github.com/open-mmlab/mmaction2/pull/173))
- Add TSN and SlowOnly models trained with [OmniSource](https://arxiv.org/abs/2003.13042), which achieve 75.7% Top-1 with TSN-R50-3seg and 80.4% Top-1 with SlowOnly-R101-8x8 ([#215](https://github.com/open-mmlab/mmaction2/pull/215))
**Improvements**
- Support demo with video url ([#165](https://github.com/open-mmlab/mmaction2/pull/165))
- Support multi-batch when testing ([#184](https://github.com/open-mmlab/mmaction2/pull/184))
- Add tutorial for adding a new learning rate updater ([#181](https://github.com/open-mmlab/mmaction2/pull/181))
- Add config name in meta info ([#183](https://github.com/open-mmlab/mmaction2/pull/183))
- Remove git hash in `__version__` ([#189](https://github.com/open-mmlab/mmaction2/pull/189))
- Check mmcv version ([#189](https://github.com/open-mmlab/mmaction2/pull/189))
- Update url with 'https://download.openmmlab.com' ([#208](https://github.com/open-mmlab/mmaction2/pull/208))
- Update Docker file to support PyTorch 1.6 and update `install.md` ([#209](https://github.com/open-mmlab/mmaction2/pull/209))
- Polish readthedocs display ([#217](https://github.com/open-mmlab/mmaction2/pull/217), [#229](https://github.com/open-mmlab/mmaction2/pull/229))
**Bug Fixes**
- Fix the bug when using OpenCV to extract only RGB frames with original shape ([#184](https://github.com/open-mmlab/mmaction2/pull/187))
- Fix the bug of sthv2 `num_classes` from 339 to 174 ([#174](https://github.com/open-mmlab/mmaction2/pull/174), [#207](https://github.com/open-mmlab/mmaction2/pull/207))
### v0.6.0 (2/9/2020)
**Highlights**
- Support TIN, CSN, SSN, NonLocal
- Support FP16 training
**New Features**
- Support NonLocal module and provide ckpt in TSM and I3D ([#41](https://github.com/open-mmlab/mmaction2/pull/41))
- Support SSN ([#33](https://github.com/open-mmlab/mmaction2/pull/33), [#37](https://github.com/open-mmlab/mmaction2/pull/37), [#52](https://github.com/open-mmlab/mmaction2/pull/52), [#55](https://github.com/open-mmlab/mmaction2/pull/55))
- Support CSN ([#87](https://github.com/open-mmlab/mmaction2/pull/87))
- Support TIN ([#53](https://github.com/open-mmlab/mmaction2/pull/53))
- Support HMDB51 dataset preparation ([#60](https://github.com/open-mmlab/mmaction2/pull/60))
- Support encoding videos from frames ([#84](https://github.com/open-mmlab/mmaction2/pull/84))
- Support FP16 training ([#25](https://github.com/open-mmlab/mmaction2/pull/25))
- Enhance demo by supporting rawframe inference ([#59](https://github.com/open-mmlab/mmaction2/pull/59)), output video/gif ([#72](https://github.com/open-mmlab/mmaction2/pull/72))
**ModelZoo**
- Update Slowfast modelzoo ([#51](https://github.com/open-mmlab/mmaction2/pull/51))
- Update TSN, TSM video checkpoints ([#50](https://github.com/open-mmlab/mmaction2/pull/50))
- Add data benchmark for TSN ([#57](https://github.com/open-mmlab/mmaction2/pull/57))
- Add data benchmark for SlowOnly ([#77](https://github.com/open-mmlab/mmaction2/pull/77))
- Add BSN/BMN performance results with feature extracted by our codebase ([#99](https://github.com/open-mmlab/mmaction2/pull/99))
**Improvements**
- Polish data preparation codes ([#70](https://github.com/open-mmlab/mmaction2/pull/70))
- Improve data preparation scripts ([#58](https://github.com/open-mmlab/mmaction2/pull/58))
- Improve unittest coverage and minor fix ([#62](https://github.com/open-mmlab/mmaction2/pull/62))
- Support PyTorch 1.6 in CI ([#117](https://github.com/open-mmlab/mmaction2/pull/117))
- Support `with_offset` for rawframe dataset ([#48](https://github.com/open-mmlab/mmaction2/pull/48))
- Support json annotation files ([#119](https://github.com/open-mmlab/mmaction2/pull/119))
- Support `multi-class` in TSMHead ([#104](https://github.com/open-mmlab/mmaction2/pull/104))
- Support using `val_step()` to validate data for each `val` workflow ([#123](https://github.com/open-mmlab/mmaction2/pull/123))
- Use `xxInit()` method to get `total_frames` and make `total_frames` a required key ([#90](https://github.com/open-mmlab/mmaction2/pull/90))
- Add paper introduction in model readme ([#140](https://github.com/open-mmlab/mmaction2/pull/140))
- Adjust the directory structure of `tools/` and rename some scripts files ([#142](https://github.com/open-mmlab/mmaction2/pull/142))
**Bug Fixes**
- Fix configs for localization test ([#67](https://github.com/open-mmlab/mmaction2/pull/67))
- Fix configs of SlowOnly by fixing lr to 8 gpus ([#136](https://github.com/open-mmlab/mmaction2/pull/136))
- Fix the bug in analyze_log ([#54](https://github.com/open-mmlab/mmaction2/pull/54))
- Fix the bug of generating HMDB51 class index file ([#69](https://github.com/open-mmlab/mmaction2/pull/69))
- Fix the bug of using `load_checkpoint()` in ResNet ([#93](https://github.com/open-mmlab/mmaction2/pull/93))
- Fix the bug of `--work-dir` when using slurm training script ([#110](https://github.com/open-mmlab/mmaction2/pull/110))
- Correct the sthv1/sthv2 rawframes filelist generate command ([#71](https://github.com/open-mmlab/mmaction2/pull/71))
- `CosineAnnealing` typo ([#47](https://github.com/open-mmlab/mmaction2/pull/47))
### v0.5.0 (9/7/2020)
**Highlights**
- MMAction2 is released
**New Features**
- Support various datasets: UCF101, Kinetics-400, Something-Something V1&V2, Moments in Time,
Multi-Moments in Time, THUMOS14
- Support various action recognition methods: TSN, TSM, R(2+1)D, I3D, SlowOnly, SlowFast, Non-local
- Support various action localization methods: BSN, BMN
- Colab demo for action recognition
# Copyright (c) OpenMMLab. All rights reserved.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import subprocess
import sys
import pytorch_sphinx_theme
sys.path.insert(0, os.path.abspath('..'))
# -- Project information -----------------------------------------------------
project = 'MMAction2'
copyright = '2020, OpenMMLab'
author = 'MMAction2 Authors'
version_file = '../../mmaction/version.py'
def get_version():
    with open(version_file, 'r') as f:
        exec(compile(f.read(), version_file, 'exec'))
    return locals()['__version__']
# The full version, including alpha/beta/rc tags
release = get_version()
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode',
'sphinx_markdown_tables', 'sphinx_copybutton', 'myst_parser'
]
# numpy and torch are required
autodoc_mock_imports = ['mmaction.version', 'PIL']
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_is_regexp = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'pytorch_sphinx_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
html_theme_options = {
# 'logo_url': 'https://mmaction2.readthedocs.io/en/latest/',
'menu': [
{
'name':
'Tutorial',
'url':
'https://colab.research.google.com/github/'
'open-mmlab/mmaction2/blob/master/demo/mmaction2_tutorial.ipynb'
},
{
'name': 'GitHub',
'url': 'https://github.com/open-mmlab/mmaction2'
},
{
'name':
'Upstream',
'children': [
{
'name': 'MMCV',
'url': 'https://github.com/open-mmlab/mmcv',
'description': 'Foundational library for computer vision'
},
{
'name':
'MMClassification',
'url':
'https://github.com/open-mmlab/mmclassification',
'description':
'Open source image classification toolbox based on PyTorch'
},
{
'name': 'MMDetection',
'url': 'https://github.com/open-mmlab/mmdetection',
'description': 'Object detection toolbox and benchmark'
},
]
},
],
# Specify the language of shared menu
'menu_lang':
'en'
}
language = 'en'
master_doc = 'index'
html_static_path = ['_static']
html_css_files = ['css/readthedocs.css']
myst_enable_extensions = ['colon_fence']
myst_heading_anchors = 3
def builder_inited_handler(app):
    subprocess.run(['./merge_docs.sh'])
    subprocess.run(['./stat.py'])


def setup(app):
    app.connect('builder-inited', builder_inited_handler)
# Data Preparation
We provide some tips for MMAction2 data preparation in this file.
<!-- TOC -->
- [Data Preparation](#data-preparation)
- [Notes on Video Data Format](#notes-on-video-data-format)
- [Getting Data](#getting-data)
- [Prepare videos](#prepare-videos)
- [Extract frames](#extract-frames)
- [Alternative to denseflow](#alternative-to-denseflow)
- [Generate file list](#generate-file-list)
- [Prepare audio](#prepare-audio)
<!-- TOC -->
## Notes on Video Data Format
MMAction2 supports two types of data formats: raw frames and videos. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks).
This is fast when an SSD is available but does not scale to fast-growing datasets.
(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.)
The latter saves much space but requires computation-intensive video decoding at execution time.
To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc.
## Getting Data
The following guide is helpful when you want to experiment with a custom dataset.
Similar to the datasets stated above, it is recommended to organize the data in `$MMACTION2/data/$DATASET`.
### Prepare videos
Please refer to the official website and/or the official script to prepare the videos.
Note that the videos should be arranged in either of the following layouts (see the example below):
1. A two-level directory organized as `${CLASS_NAME}/${VIDEO_ID}`, which is recommended for action recognition datasets (such as UCF101 and Kinetics).
2. A single-level directory, which is recommended for action detection datasets or those with multiple annotations per video (such as THUMOS14).
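For illustration, the two layouts look roughly like the sketch below; the class and video names are hypothetical examples chosen to match the typical naming of UCF101 and THUMOS14.
```shell
# (1) Two-level layout (class name / video id), e.g. for UCF101:
#     data/ucf101/videos/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi
# (2) Single-level layout, e.g. for THUMOS14:
#     data/thumos14/videos/video_validation_0000001.mp4
```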
### Extract frames
To extract both frames and optical flow, you can use the tool [denseflow](https://github.com/open-mmlab/denseflow) we wrote.
Since different frame extraction tools produce different numbers of frames,
it is beneficial to use the same tool for both frame extraction and flow computation, to avoid mismatched frame counts.
```shell
python build_rawframes.py ${SRC_FOLDER} ${OUT_FOLDER} [--task ${TASK}] [--level ${LEVEL}] \
[--num-worker ${NUM_WORKER}] [--flow-type ${FLOW_TYPE}] [--out-format ${OUT_FORMAT}] \
[--ext ${EXT}] [--new-width ${NEW_WIDTH}] [--new-height ${NEW_HEIGHT}] [--new-short ${NEW_SHORT}] \
[--resume] [--use-opencv] [--mixed-ext]
```
- `SRC_FOLDER`: Folder of the original videos.
- `OUT_FOLDER`: Root folder where the extracted frames and optical flow are stored.
- `TASK`: Extraction task indicating which kind of frames to extract. Allowed choices are `rgb`, `flow`, `both`.
- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory.
- `NUM_WORKER`: Number of workers to build rawframes.
- `FLOW_TYPE`: Flow type to extract, e.g., `None`, `tvl1`, `warp_tvl1`, `farn`, `brox`.
- `OUT_FORMAT`: Output format for extracted frames, e.g., `jpg`, `h5`, `png`.
- `EXT`: Video file extension, e.g., `avi`, `mp4`.
- `NEW_WIDTH`: Resized image width of output.
- `NEW_HEIGHT`: Resized image height of output.
- `NEW_SHORT`: Length of the short side of the resized images, keeping the aspect ratio.
- `--resume`: Whether to resume optical flow extraction instead of overwriting.
- `--use-opencv`: Whether to use OpenCV to extract rgb frames.
- `--mixed-ext`: Whether to process video files with mixed extensions.
The recommended practice is to
1. Set `$OUT_FOLDER` to a folder located on an SSD.
2. Symlink `$OUT_FOLDER` to `$MMACTION2/data/$DATASET/rawframes`.
3. Use `--new-short` instead of `--new-width` and `--new-height`.
```shell
ln -s ${YOUR_FOLDER} $MMACTION2/data/$DATASET/rawframes
```
#### Alternative to denseflow
In case your device doesn't fulfill the installation requirements of [denseflow](https://github.com/open-mmlab/denseflow) (e.g., the NVIDIA driver version), or you just want a quick demo of flow extraction, we provide the Python script `tools/misc/flow_extraction.py` as an alternative to denseflow. You can use it to extract RGB frames and optical flow from one or several videos. Note that the script is much slower than denseflow, since it runs the optical flow algorithms on CPU.
```shell
python tools/misc/flow_extraction.py --input ${INPUT} [--prefix ${PREFIX}] [--dest ${DEST}] [--rgb-tmpl ${RGB_TMPL}] \
[--flow-tmpl ${FLOW_TMPL}] [--start-idx ${START_IDX}] [--method ${METHOD}] [--bound ${BOUND}] [--save-rgb]
```
- `INPUT`: Videos for frame extraction; either a single video or a video list. The video list should be a txt file consisting of filenames only, without directories.
- `PREFIX`: The prefix of input videos, used when input is a video list.
- `DEST`: The destination to save extracted frames.
- `RGB_TMPL`: The template filename of rgb frames.
- `FLOW_TMPL`: The template filename of flow frames.
- `START_IDX`: The start index of extracted frames.
- `METHOD`: The method used to generate flow.
- `BOUND`: The maximum of optical flow.
- `--save-rgb`: Also save the extracted RGB frames.
### Generate file list
We provide a convenient script to generate annotation file lists. You can use the following command to generate file lists from extracted frames / downloaded videos.
```shell
cd $MMACTION2
python tools/data/build_file_list.py ${DATASET} ${SRC_FOLDER} [--rgb-prefix ${RGB_PREFIX}] \
[--flow-x-prefix ${FLOW_X_PREFIX}] [--flow-y-prefix ${FLOW_Y_PREFIX}] [--num-split ${NUM_SPLIT}] \
[--subset ${SUBSET}] [--level ${LEVEL}] [--format ${FORMAT}] [--out-root-path ${OUT_ROOT_PATH}] \
[--seed ${SEED}] [--shuffle]
```
- `DATASET`: Dataset to be prepared, e.g., `ucf101`, `kinetics400`, `thumos14`, `sthv1`, `sthv2`, etc.
- `SRC_FOLDER`: Folder of the corresponding data format:
- "$MMACTION2/data/$DATASET/rawframes" if `--format rawframes`.
- "$MMACTION2/data/$DATASET/videos" if `--format videos`.
- `RGB_PREFIX`: Name prefix of rgb frames.
- `FLOW_X_PREFIX`: Name prefix of x flow frames.
- `FLOW_Y_PREFIX`: Name prefix of y flow frames.
- `NUM_SPLIT`: Number of splits for the file list.
- `SUBSET`: Subset to generate the file list for. Allowed choices are `train`, `val`, `test`.
- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory.
- `FORMAT`: Source data format to generate file list. Allowed choices are `rawframes`, `videos`.
- `OUT_ROOT_PATH`: Root path for the output file lists.
- `SEED`: Random seed.
- `--shuffle`: Whether to shuffle the file list.
Now, you can go to [getting_started.md](getting_started.md) to train and test the model.
### Prepare audio
We also provide a simple script for audio waveform extraction and mel-spectrogram generation.
```shell
cd $MMACTION2
python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \
[--level ${LEVEL}]
```
- `ROOT`: The root directory of the videos.
- `DST_ROOT`: The destination root directory of the audios.
- `EXT`: Extension of the video files, e.g., `mp4`.
- `N_WORKERS`: Number of processes to be used.
- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory.
After extracting audios, you are free to decode and generate spectrograms on the fly, such as in [this config](/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py). As for the annotations, you can directly use those of the rawframes, as long as you keep the relative positions of the audio files the same as in the rawframes directory. However, extracting spectrograms on the fly is slow and bad for prototype iteration. Therefore, we also provide a script (and many useful tools to play with) to generate spectrograms offline.
```shell
cd $MMACTION2
python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \
[--ext $EXT] [--num-workers $N_WORKERS] [--part $PART]
```
- `AUDIO_HOME_PATH`: The root directory of the audio files.
- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features.
- `EXT`: Extension of the audio files, e.g., `m4a`.
- `N_WORKERS`: Number of processes to be used.
- `PART`: Determines how many parts the files are split into and which part to process, e.g., `2/5` means splitting all files into 5 parts and processing the 2nd one. This is useful when you have several machines.
The annotations for audio spectrogram features are identical to those of the rawframes. You can simply make a copy of `dataset_[train/val]_list_rawframes.txt` and rename it as `dataset_[train/val]_list_audio_feature.txt`.
# FAQ
## Outline
We list some common issues faced by many users and their corresponding solutions here.
- [FAQ](#faq)
- [Outline](#outline)
- [Installation](#installation)
- [Data](#data)
- [Training](#training)
- [Testing](#testing)
- [Deploying](#deploying)
Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them.
If the contents here do not cover your issue, please create an issue using the [provided templates](/.github/ISSUE_TEMPLATE/error-report.md) and make sure you fill in all required information in the template.
## Installation
- **"No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'"**
1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`
2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/#installation)
- **"OSError: MoviePy Error: creation of None failed because of the following error"**
Refer to [install.md](https://github.com/open-mmlab/mmaction2/blob/master/docs/en/install.md#requirements)
1. For Windows users, [ImageMagick](https://www.imagemagick.org/script/index.php) is not automatically detected by MoviePy. You need to modify the `moviepy/config_defaults.py` file to provide the path to the ImageMagick binary, named `magick`, like `IMAGEMAGICK_BINARY = "C:\\Program Files\\ImageMagick_VERSION\\magick.exe"`
2. For Linux users, if ImageMagick is not detected by MoviePy, you need to modify the `/etc/ImageMagick-6/policy.xml` file by commenting out `<policy domain="path" rights="none" pattern="@*" />`, i.e. changing it to `<!-- <policy domain="path" rights="none" pattern="@*" /> -->`.
- **"Why I got the error message 'Please install XXCODEBASE to use XXX' even if I have already installed XXCODEBASE?"**
You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv-full before you install them.
## Data
- **FileNotFound like `No such file or directory: xxx/xxx/img_00300.jpg`**
In our repo, we set `start_index=1` as the default value for rawframe datasets, and `start_index=0` as the default value for video datasets.
If you encounter a FileNotFound error for the first or last frame of the data, check whether the frame files start from offset 0 or 1,
i.e. `xxx_00000.jpg` or `xxx_00001.jpg`, and then change the `start_index` value in the config accordingly (see the minimal sketch below).
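As an illustration, here is a minimal sketch of the change, assuming your extracted frames start from `img_00000.jpg`; a real config contains more dataset fields than shown here.
```python
# A minimal sketch, assuming rawframes named starting from img_00000.jpg.
# Only the relevant field is shown; a real config contains more entries.
data = dict(
    train=dict(
        type='RawframeDataset',
        start_index=0))  # defaults: 1 for rawframe datasets, 0 for video datasets
```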
- **How should we preprocess the videos in the dataset? Resizing them to a fixed size (all videos with the same width-height ratio) like `340x256` (1), or resizing them so that the short edges of all videos have the same length, e.g. 256px or 320px (2)?**
We have tried both preprocessing approaches and found that (2) is the better solution in general, so we use (2) with a short edge length of 256px as the default preprocessing setting. We benchmarked these preprocessing approaches and you may find the results in the [TSN Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsn) and the [SlowOnly Data Benchmark](https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/slowonly).
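If you keep the original videos and prefer to resize on the fly instead of offline, the `Resize` pipeline step can fix the short edge while keeping the aspect ratio; a minimal sketch (surrounding pipeline steps omitted):
```python
# A minimal sketch: resize on the fly so that the short edge becomes 256
# while keeping the aspect ratio (scale=(-1, 256) means "short edge to 256").
train_pipeline = [
    # ... decoding and frame-sampling steps come before this
    dict(type='Resize', scale=(-1, 256)),
    # ... cropping, flipping and formatting steps follow
]
```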
- **Mismatched data pipeline items lead to errors like `KeyError: 'total_frames'`**
We have separate pipelines for processing videos and rawframes.
**For videos**, we have to decode them on the fly in the pipeline, so decoder pairs such as `DecordInit & DecordDecode`, `OpenCVInit & OpenCVDecode`, or `PyAVInit & PyAVDecode` should be used, as in [this example](https://github.com/open-mmlab/mmaction2/blob/023777cfd26bb175f85d78c455f6869673e0aa09/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py#L47-L49).
**For rawframes**, the images have already been decoded offline, so the pipeline item `RawFrameDecode` should be used, as in [this example](https://github.com/open-mmlab/mmaction2/blob/023777cfd26bb175f85d78c455f6869673e0aa09/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py#L49).
`KeyError: 'total_frames'` is caused by incorrectly using the `RawFrameDecode` step for videos, since when the input is a video, `total_frames` cannot be known beforehand. Minimal sketches of the two kinds of pipelines are shown below.
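The following minimal sketches show the difference; the sampling parameters are placeholders and the remaining augmentation steps are omitted.
```python
# For videos: initialize a decoder and decode frames on the fly
video_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),
    # ... resizing, cropping and formatting steps follow
]

# For rawframes: images were decoded offline, so only load them from disk
rawframe_pipeline = [
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='RawFrameDecode'),
    # ... resizing, cropping and formatting steps follow
]
```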
## Training
- **How to just use trained recognizer models for backbone pre-training?**
Refer to [Use Pre-Trained Model](https://github.com/open-mmlab/mmaction2/blob/master/docs/en/tutorials/2_finetune.md#use-pre-trained-model).
To use a pre-trained model for the whole network, the new config adds the link of the pre-trained model to `load_from`.
To use a trained recognizer only for backbone pre-training, change the `pretrained` value in the backbone dict of the config to the checkpoint path / URL.
When training, the unexpected keys will be ignored.
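A minimal sketch of the two options follows; the checkpoint URL and the backbone type/depth are placeholders, not real values.
```python
# Option 1: initialize the whole network (backbone + head) from a trained recognizer
load_from = 'https://example.com/path/to/recognizer_checkpoint.pth'  # placeholder URL

# Option 2: use the trained recognizer only to initialize the backbone;
# keys that do not match the backbone are ignored during loading
model = dict(
    backbone=dict(
        type='ResNet',
        depth=50,
        pretrained='https://example.com/path/to/recognizer_checkpoint.pth'))  # placeholder URL
```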
- **How to visualize the training accuracy/loss curves in real-time?**
Use `TensorboardLoggerHook` in `log_config` like
```python
log_config=dict(interval=20, hooks=[dict(type='TensorboardLoggerHook')])
```
You can refer to [tutorials/1_config.md](tutorials/1_config.md), [tutorials/7_customize_runtime.md](tutorials/7_customize_runtime.md#log-config), and [this](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py#L118).
- **In batchnorm.py: Expected more than 1 value per channel when training**
To use batchnorm, the batch size should be larger than 1. If `drop_last` is set to False when building the dataloaders, the last batch of an epoch may happen to contain only one sample (`batch_size==1`) and training will throw this error. You can set `drop_last=True` to avoid it:
```python
train_dataloader=dict(drop_last=True)
```
- **How to fix stages of backbone when finetuning a model?**
You can refer to [`def _freeze_stages()`](https://github.com/open-mmlab/mmaction2/blob/0149a0e8c1e0380955db61680c0006626fd008e9/mmaction/models/backbones/x3d.py#L458) and [`frozen_stages`](https://github.com/open-mmlab/mmaction2/blob/0149a0e8c1e0380955db61680c0006626fd008e9/mmaction/models/backbones/x3d.py#L183-L184),
and remember to set `find_unused_parameters = True` in the config files for distributed training or testing.
Users can set `frozen_stages` to freeze stages of the backbone for all models except C3D, since all backbones inheriting from `ResNet` and `ResNet3D` support the internal function `_freeze_stages()`. A minimal sketch is shown below.
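A minimal sketch; the backbone type, depth and number of frozen stages are illustrative only.
```python
# Freeze the stem and the first residual stage of a ResNet-style backbone
model = dict(
    backbone=dict(
        type='ResNet',
        depth=50,
        frozen_stages=1))

# Required for distributed training/testing when some parameters receive no gradients
find_unused_parameters = True
```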
- **How to set memcached setting in config files?**
In MMAction2, you can pass memcached kwargs to `class DecordInit` for video dataset or `RawFrameDecode` for rawframes dataset.
For more details, refer to [`class FileClient`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py) in MMCV.
Here is an example to use memcached for rawframes dataset:
```python
mc_cfg = dict(server_list_cfg='server_list_cfg', client_cfg='client_cfg', sys_path='sys_path')
train_pipeline = [
...
dict(type='RawFrameDecode', io_backend='memcached', **mc_cfg),
...
]
```
- **How to set `load_from` value in config files to finetune models?**
In MMAction2, we set `load_from=None` as the default in `configs/_base_/default_runtime.py`, and owing to the [inheritance design](/docs/en/tutorials/1_config.md),
users can directly change it by setting `load_from` in their configs, as in the sketch below.
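For example, in a config that inherits `configs/_base_/default_runtime.py`, this is a one-line override; the URL is a placeholder.
```python
# Overrides load_from=None inherited from configs/_base_/default_runtime.py
load_from = 'https://example.com/path/to/pretrained_checkpoint.pth'  # placeholder URL
```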
## Testing
- **How to make the predicted scores normalized by softmax within \[0, 1\]?**
Change this in the config: set `model['test_cfg'] = dict(average_clips='prob')`.
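In config form this is a minimal sketch; other `model` fields are merged from the base config.
```python
# Average the clip scores after softmax so the final scores lie in [0, 1]
model = dict(test_cfg=dict(average_clips='prob'))
```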
- **What if the model is too large and the GPU memory can not fit even only one testing sample?**
By default, the 3D models are tested with 10 clips x 3 crops, i.e. 30 views in total. For extremely large models, the GPU memory cannot hold even a single testing sample (because of the 30 views). To handle this, you can set `max_testing_views=n` in `model['test_cfg']` of the config file. Then n views will be used as a batch during forwarding to save GPU memory.
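For example, to forward at most 5 of the 30 views at a time; the value 5 is illustrative.
```python
# Forward at most 5 views per batch during testing to save GPU memory
model = dict(test_cfg=dict(max_testing_views=5))
```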
- **How to show test results?**
During testing, we can use the option `--out xxx.json/pkl/yaml` to output result files for checking. The testing output has exactly the same order as the test dataset.
Besides, we provide an analysis tool for evaluating a model using the output result files: [`tools/analysis/eval_metric.py`](/tools/analysis/eval_metric.py).
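Since the output order matches the test dataset, you can also inspect the dumped results directly; a minimal sketch, assuming testing was run with `--out results.pkl` (the filename is hypothetical).
```python
from mmcv import load

# One entry per test sample, in the same order as the test dataset
results = load('results.pkl')  # hypothetical output filename
print(len(results))
print(results[0])  # e.g. the predicted class scores of the first test sample
```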
## Deploying
- **Why does the ONNX model converted by MMAction2 throw an error when being converted to other frameworks such as TensorRT?**
For now, we can only make sure that models in MMAction2 are ONNX-compatible. However, some operations in ONNX may be unsupported by your target framework for deployment, e.g. TensorRT in [this issue](https://github.com/open-mmlab/mmaction2/issues/414). When such a situation occurs, we suggest you raise an issue and ask the community for help, as long as `pytorch2onnx.py` works well and the conversion is verified numerically.