# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import warnings
from functools import partial
from multiprocessing import Manager, Pool, cpu_count

import mmcv
import numpy as np
from mmcv import Config, DictAction

from mmaction.datasets import PIPELINES, build_dataset


def parse_args():
    """Parse the command line arguments of the video checking script.

    Returns:
        argparse.Namespace: The parsed arguments. ``eval_options`` is filled
            from the deprecated ``--options`` flag when only that one is
            given.

    Raises:
        ValueError: If both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(description='MMAction2 check datasets')
    parser.add_argument('config', help='test config file path')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecate), '
        'change to --eval-options instead.')
    # ``--eval-options`` has to be registered: the deprecation handling below
    # reads ``args.eval_options`` and would otherwise fail with an
    # AttributeError as soon as ``--options`` is passed.
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        default={},
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    parser.add_argument(
        '--output-file',
        default='invalid-video.txt',
        help='Output file path which keeps corrupted/missing video file paths')
    parser.add_argument(
        '--split',
        default='train',
        choices=['train', 'val', 'test'],
        help='Dataset split')
    parser.add_argument(
        '--decoder',
        default='decord',
        choices=['decord', 'opencv', 'pyav'],
        help='Video decoder type, should be one of [decord, opencv, pyav]')
    parser.add_argument(
        '--num-processes',
        type=int,
        # leave one core free, but never go below one process
        default=(cpu_count() - 1 or 1),
        help='Number of processes to check videos')
    parser.add_argument(
        '--remove-corrupted-videos',
        action='store_true',
        help='Whether to delete all corrupted videos')
    args = parser.parse_args()

    if args.options and args.eval_options:
        raise ValueError(
            '--options and --eval-options cannot be both '
            'specified, --options is deprecated in favor of --eval-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --eval-options')
        args.eval_options = args.options
    return args


@PIPELINES.register_module()
class RandomSampleFrames:
    """Pipeline step that picks a handful of frame indices to decode."""

    def __call__(self, results):
        """Select frames to verify.

        Select the first, last and three random frames. Required key is
        "total_frames", added or modified key is "frame_inds".

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.

        Returns:
            dict: The same ``results`` dict with "frame_inds" set.
        """
        total_frames = results['total_frames']
        # Kept as an assert so the exception type stays an AssertionError.
        assert total_frames > 0

        # first and last frames
        frame_inds = np.array([0, total_frames - 1])

        # choose 3 random frames strictly between the first and the last one
        # (``randint`` excludes the upper bound and may repeat an index)
        if total_frames > 2:
            frame_inds = np.concatenate(
                [frame_inds,
                 np.random.randint(1, total_frames - 1, 3)])

        results['frame_inds'] = frame_inds
        return results


def _do_check_videos(lock, dataset, output_file, idx):
    """Try to load one dataset item and record its path if loading fails.

    Args:
        lock: Lock shared between the worker processes; it serializes the
            appends to ``output_file``.
        dataset: Dataset that is indexed with ``idx`` and whose
            ``video_infos[idx]['filename']`` is written on failure.
        output_file (str): Path of the file collecting invalid video paths.
        idx (int): Index of the dataset item to check.
    """
    try:
        dataset[idx]
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that an interrupt or
        # interpreter exit is not recorded as a corrupted video.
        # The ``with`` statement releases the lock even if writing fails, so
        # one failing worker cannot block all the others.
        with lock:
            # save invalid video path to output file
            with open(output_file, 'a') as f:
                f.write(dataset.video_infos[idx]['filename'] + '\n')


def main():
    """Check every video of the chosen split and report the invalid ones."""
    args = parse_args()

    decoder_to_pipeline_prefix = dict(
        decord='Decord', opencv='OpenCV', pyav='PyAV')
    prefix = decoder_to_pipeline_prefix[args.decoder]

    # read config file
    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    # build dataset
    dataset_type = cfg.data[args.split].type
    if dataset_type != 'VideoDataset':
        # explicit raise: an ``assert`` would be skipped under ``python -O``
        raise ValueError(
            f'Only VideoDataset is supported, but got {dataset_type}')
    cfg.data[args.split].pipeline = [
        dict(type=prefix + 'Init'),
        dict(type='RandomSampleFrames'),
        dict(type=prefix + 'Decode')
    ]
    dataset = build_dataset(cfg.data[args.split],
                            dict(test_mode=(args.split != 'train')))

    # prepare for checking
    if os.path.exists(args.output_file):
        # remove existing output file
        os.remove(args.output_file)

    # start checking; the context managers clean up the worker and manager
    # processes even when a worker raises
    with Pool(args.num_processes) as pool, Manager() as manager:
        lock = manager.Lock()
        worker_fn = partial(_do_check_videos, lock, dataset,
                            args.output_file)
        prog_bar = mmcv.ProgressBar(len(dataset))
        for _ in pool.imap_unordered(worker_fn, range(len(dataset))):
            prog_bar.update()
        pool.close()
        pool.join()

    # the output file only exists if at least one video failed to load
    if os.path.exists(args.output_file):
        with open(args.output_file, 'r') as f:
            num_lines = sum(1 for _ in f)
        print(f'Checked {len(dataset)} videos, '
              f'{num_lines} are corrupted/missing.')
        if args.remove_corrupted_videos:
            print('Start deleting corrupted videos')
            cnt = 0
            with open(args.output_file, 'r') as f:
                for line in f:
                    video_path = line.strip()
                    # missing videos are listed too; only delete what exists
                    if os.path.exists(video_path):
                        os.remove(video_path)
                        cnt += 1
            print(f'Deleted {cnt} corrupted videos.')
    else:
        print(f'Checked {len(dataset)} videos, none are corrupted/missing')


if __name__ == '__main__':
    main()