_base_ = [
'../../_base_/models/i3d_r50.py', '../../_base_/schedules/sgd_100e.py',
'../../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
non_local=((0, 0, 0), (0, 1, 0, 1), (0, 1, 0, 1, 0, 1), (0, 0, 0)),
non_local_cfg=dict(
sub_sample=True,
use_scale=False,
norm_cfg=dict(type='BN3d', requires_grad=True),
mode='dot_product')))
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb/'
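# How a config like the one above is usually launched for training (a sketch,
# assuming the standard MMAction2 0.x entry points tools/train.py and
# tools/dist_train.sh; the config path matches the metafile entry further below):
#
#   # single GPU
#   python tools/train.py \
#       configs/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb.py
#
#   # 8 GPUs, as used for the reported results
#   bash tools/dist_train.sh \
#       configs/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb.py 8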
_base_ = ['./i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb.py']
# model settings
model = dict(
backbone=dict(
non_local_cfg=dict(
sub_sample=True,
use_scale=False,
norm_cfg=dict(type='BN3d', requires_grad=True),
mode='embedded_gaussian')))
# runtime settings
work_dir = './work_dirs/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb/' # noqa: E501
_base_ = ['./i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb.py']
# model settings
model = dict(
backbone=dict(
non_local_cfg=dict(
sub_sample=True,
use_scale=False,
norm_cfg=dict(type='BN3d', requires_grad=True),
mode='gaussian')))
# runtime settings
work_dir = './work_dirs/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb/'
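# The three non-local variants above differ only in non_local_cfg['mode']
# ('dot_product', 'embedded_gaussian', 'gaussian'); everything else is pulled in
# through _base_. A minimal sketch of how the delta merges (assuming mmcv's
# Config, which MMAction2 0.x uses for config inheritance):
#
#   from mmcv import Config
#
#   cfg = Config.fromfile(
#       'configs/recognition/i3d/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb.py')
#   print(cfg.model['backbone']['non_local_cfg']['mode'])  # 'gaussian' (overridden)
#   print(cfg.data['videos_per_gpu'])                      # 8 (inherited from base)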
_base_ = [
'../../_base_/models/i3d_r50.py', '../../_base_/schedules/sgd_100e.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/i3d_r50_32x2x1_100e_kinetics400_rgb/'
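# The "32x2x1" in the work_dir name encodes the SampleFrames settings above:
# clip_len=32, frame_interval=2, num_clips=1, i.e. one training clip of 32
# frames spanning 32 * 2 = 64 consecutive frames. A simplified sketch of the
# index arithmetic (the real SampleFrames also handles short videos and looping):
#
#   import numpy as np
#
#   def sample_clip(total_frames, clip_len=32, frame_interval=2):
#       span = clip_len * frame_interval
#       start = np.random.randint(0, max(1, total_frames - span + 1))
#       return start + np.arange(clip_len) * frame_interval  # 32 indices, 2 apart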
_base_ = ['./i3d_r50_32x2x1_100e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DenseSampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='DenseSampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='DenseSampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# runtime settings
work_dir = './work_dirs/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/'
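# DenseSampleFrames draws the clip from a small dense window of the video
# instead of from its full duration, following the dense sampling strategy of
# the Non-local paper; the window size (sample_range, 64 frames by default in
# MMAction2 0.x, an assumption worth checking against your version's docstring)
# is what distinguishes it from the plain SampleFrames used above.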
_base_ = ['./i3d_r50_32x2x1_100e_kinetics400_rgb.py']
# model settings
model = dict(
backbone=dict(
inflate=(1, 1, 1, 1),
conv1_stride_t=1,
pool1_stride_t=1,
with_pool2=True))
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# runtime settings
work_dir = './work_dirs/i3d_r50_heavy_8x8x1_100e_kinetics400_rgb/'
_base_ = ['./i3d_r50_32x2x1_100e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='RawFrameDecode', decoding_backend='turbojpeg'),
dict(type='Resize', scale=(-1, 256), lazy=True),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0,
lazy=True),
dict(type='Resize', scale=(224, 224), keep_ratio=False, lazy=True),
dict(type='Flip', flip_ratio=0.5, lazy=True),
dict(type='Fuse'),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode', decoding_backend='turbojpeg'),
dict(type='Resize', scale=(-1, 256), lazy=True),
dict(type='CenterCrop', crop_size=224, lazy=True),
dict(type='Flip', flip_ratio=0, lazy=True),
dict(type='Fuse'),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode', decoding_backend='turbojpeg'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# runtime settings
work_dir = './work_dirs/i3d_r50_lazy_32x2x1_100e_kinetics400_rgb/'
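# In this variant the geometric ops carry lazy=True: Resize, MultiScaleCrop and
# Flip only record their parameters, and Fuse applies the accumulated
# crop + resize + flip to the decoded frames in one pass, avoiding repeated
# per-frame image operations. A rough sketch of the idea (simplified; the real
# lazy ops track more state, e.g. the interpolation mode):
#
#   import cv2
#
#   def fuse(img, lazy):                           # lazy: dict filled by lazy ops
#       x1, y1, x2, y2 = lazy['crop_bbox']         # recorded by MultiScaleCrop
#       img = img[y1:y2, x1:x2]
#       img = cv2.resize(img, lazy['final_size'])  # recorded by the second Resize
#       if lazy['flip']:                           # recorded by Flip
#           img = img[:, ::-1]
#       return img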
_base_ = ['./i3d_r50_32x2x1_100e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# runtime settings
work_dir = './work_dirs/i3d_r50_video_3d_32x2x1_100e_kinetics400_rgb/'
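# This config reads frames straight from video files with the decord library
# (DecordInit opens the container, DecordDecode fetches the sampled indices),
# so no pre-extracted rawframes are needed. A minimal sketch of what the two
# steps amount to (assuming the decord package; the path is a placeholder):
#
#   from decord import VideoReader
#
#   vr = VideoReader('data/kinetics400/videos_train/some_video.mp4')
#   frames = vr.get_batch([0, 2, 4, 6]).asnumpy()  # (4, H, W, 3) uint8 frames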
_base_ = ['./i3d_r50_heavy_8x8x1_100e_kinetics400_rgb.py']
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# runtime settings
work_dir = './work_dirs/i3d_r50_video_heavy_8x8x1_100e_kinetics400_rgb/'
_base_ = ['../../_base_/models/i3d_r50.py']
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(
type='Imgaug',
transforms=[
dict(type='Fliplr', p=0.5),
dict(type='Rotate', rotate=(-20, 20)),
dict(type='Dropout', p=(0, 0.05))
]),
# dict(type='Imgaug', transforms='default'),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
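# MMAction2 configs follow the linear scaling rule: the lr above is tuned for
# 8 GPUs x 8 videos_per_gpu, so rescale it proportionally to your total batch
# size (a sketch of the arithmetic):
#
#   base_lr, base_batch = 0.01, 8 * 8
#   my_batch = num_gpus * videos_per_gpu      # e.g. 4 * 8 = 32
#   lr = base_lr * my_batch / base_batch      # e.g. 0.005 for 4 GPUs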
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/i3d_r50_video_3d_32x2x1_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
_base_ = ['../../_base_/models/i3d_r50.py']
# dataset settings
dataset_type = 'VideoDataset'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
test_pipeline = [
dict(type='DecordInit', num_threads=1),
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=1,
workers_per_gpu=2,
test=dict(
type=dataset_type,
ann_file=None,
data_prefix=None,
pipeline=test_pipeline))
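# This last config keeps only the model and a single-clip test pipeline, with
# ann_file and data_prefix left as None, which suggests standalone inference
# rather than benchmark testing. A minimal sketch (assuming MMAction2 0.x's
# high-level API; exact signatures vary slightly across versions, and the
# paths are placeholders):
#
#   from mmaction.apis import init_recognizer, inference_recognizer
#
#   model = init_recognizer('path/to/this_config.py',
#                           'path/to/checkpoint.pth', device='cuda:0')
#   results = inference_recognizer(model, 'demo.mp4')  # top scoring classes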
Collections:
- Name: I3D
README: configs/recognition/i3d/README.md
Paper:
URL: https://arxiv.org/abs/1705.07750
Title: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
Models:
- Config: configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.68
Top 5 Accuracy: 90.78
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth
- Config: configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.27
Top 5 Accuracy: 90.92
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/20200725_031555.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/20200725_031555.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth
- Config: configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_video_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.85
Top 5 Accuracy: 90.75
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb/20200706_143014.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb/20200706_143014.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb/i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth
- Config: configs/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: i3d_r50_dense_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.77
Top 5 Accuracy: 90.57
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/i3d_r50_dense_32x2x1_100e_kinetics400_rgb_20200616-2bbb4361.pth
- Config: configs/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_dense_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.48
Top 5 Accuracy: 91.0
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/20200725_031604.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/20200725_031604.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb_20200725-24eb54cc.pth
- Config: configs/recognition/i3d/i3d_r50_lazy_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_lazy_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.32
Top 5 Accuracy: 90.72
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/i3d_r50_fast_32x2x1_100e_kinetics400_rgb_20200612-000e4d2a.pth
- Config: configs/recognition/i3d/i3d_r50_lazy_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 43564040192
Parameters: 28043472
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: i3d_r50_lazy_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.24
Top 5 Accuracy: 90.99
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/20200725_031457.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/20200725_031457.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb_20200817-4e90d1d5.pth
- Config: configs/recognition/i3d/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 54334488576
Parameters: 35397840
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 74.71
Top 5 Accuracy: 91.81
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb/20200813_034054.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb/20200813_034054.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb/i3d_nl_embedded_gaussian_r50_32x2x1_100e_kinetics400_rgb_20200813-6e6aef1b.pth
- Config: configs/recognition/i3d/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 48962109440
Parameters: 31723728
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.37
Top 5 Accuracy: 91.26
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb/20200813_034909.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb/20200813_034909.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb/i3d_nl_gaussian_r50_32x2x1_100e_kinetics400_rgb_20200815-17f84aa2.pth
- Config: configs/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb.py
In Collection: I3D
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 54334488576
Parameters: 35397840
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: RGB
Name: i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.92
Top 5 Accuracy: 91.59
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb/20200814_044208.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb/20200814_044208.log
Weights: https://download.openmmlab.com/mmaction/recognition/i3d/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb/i3d_nl_dot_product_r50_32x2x1_100e_kinetics400_rgb_20200814-7c30d5bb.pth
# Omni-sourced Webly-supervised Learning for Video Recognition
[Omni-sourced Webly-supervised Learning for Video Recognition](https://arxiv.org/abs/2003.13042)
[Dataset](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link)
## Abstract
<!-- [ABSTRACT] -->
We introduce OmniSource, a novel framework for leveraging web data to train video recognition models. OmniSource overcomes the barriers between data formats, such as images, short videos, and long untrimmed videos, for webly-supervised learning. First, data samples with multiple formats, curated by task-specific data collection and automatically filtered by a teacher model, are transformed into a unified form. Then a joint-training strategy is proposed to deal with the domain gaps between multiple data sources and formats in webly-supervised learning. Several good practices, including data balancing, resampling, and cross-dataset mixup, are adopted in joint training. Experiments show that by utilizing data from multiple sources and formats, OmniSource is more data-efficient in training. With only 3.5M images and 800K minutes of videos crawled from the internet without human labeling (less than 2% of prior works), our models learned with OmniSource improve the Top-1 accuracy of 2D- and 3D-ConvNet baseline models by 3.0% and 3.9%, respectively, on the Kinetics-400 benchmark. With OmniSource, we establish new records with different pretraining strategies for video recognition. Our best models achieve 80.4%, 80.5%, and 83.6% Top-1 accuracies on the Kinetics-400 benchmark for training-from-scratch, ImageNet pre-training, and IG-65M pre-training, respectively.
<!-- [IMAGE] -->
<div align=center>
<img src="https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/omnisource/pipeline.png" width="800"/>
</div>
## Results and Models
### Kinetics-400 Model Release
We have currently released 4 models trained with the OmniSource framework, covering both 2D and 3D architectures. The following table compares the performance of models trained with and without OmniSource.
| Model | Modality | Pretrained | Backbone | Input | Resolution | Top-1 (Baseline / OmniSource (Delta)) | Top-5 (Baseline / OmniSource (Delta)) | Download |
| :------: | :------: | :--------: | :-------: | :---: | :------------: | :-----------------------------------: | :------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| TSN | RGB | ImageNet | ResNet50 | 3seg | 340x256 | 70.6 / 73.6 (+ 3.0) | 89.4 / 91.0 (+ 1.6) | [Baseline](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_imagenet_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-54192355.pth) |
| TSN | RGB | IG-1B | ResNet50 | 3seg | short-side 320 | 73.1 / 75.7 (+ 2.6) | 90.4 / 91.9 (+ 1.5) | [Baseline](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_without_omni_1x1x3_kinetics400_rgb_20200926-c133dd49.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-2863fed0.pth) |
| SlowOnly | RGB | Scratch | ResNet50 | 4x16 | short-side 320 | 72.9 / 76.8 (+ 3.9) | 90.9 / 92.5 (+ 1.6) | [Baseline](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth) |
| SlowOnly | RGB | Scratch | ResNet101 | 8x8 | short-side 320 | 76.5 / 80.4 (+ 3.9) | 92.7 / 94.4 (+ 1.7) | [Baseline](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_20200926-0c730aef.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth) |
1. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index'; see the parsing sketch below) and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
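Since each line of the data list follows this format, it can be loaded with a few lines of Python (a sketch that assumes whitespace-separated fields; adjust the `split` if the released list uses commas):

```python
def load_val_list(path):
    """Parse a Kinetics-400 list: 'video_id num_frames label_index' per line."""
    items = []
    with open(path) as f:
        for line in f:
            video_id, num_frames, label = line.split()
            items.append((video_id, int(num_frames), int(label)))
    return items
```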
## Benchmark on Mini-Kinetics
We release a subset of the web dataset used in the OmniSource paper. Specifically, we release the web data in the 200 classes of [Mini-Kinetics](https://arxiv.org/pdf/1712.04851.pdf). The statistics of those datasets are detailed in [preparing_omnisource](/tools/data/omnisource/README.md). To obtain those data, you need to fill in a [data request form](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link). After we receive your request, the download link of these data will be sent to you. For more details on the released OmniSource web dataset, please refer to [preparing_omnisource](/tools/data/omnisource/README.md).
We benchmark the OmniSource framework on the released subset; the results (Top-1 and Top-5 accuracy on the Mini-Kinetics validation set) are listed in the following table. The benchmark can be used as a baseline for video recognition with web data.
### TSN-8seg-ResNet50
| Model | Modality | Pretrained | Backbone | Input | Resolution | top1 acc | top5 acc | ckpt | json | log |
| :-------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -------- | ---------- | -------- | ----- | -------------- | :------: | :------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [tsn_r50_1x1x8_100e_minikinetics_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 77.4 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030-b4eaf92b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.0 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030-23966b4b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_webimage_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.6 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030-66f5e046.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 80.6 | 95.0 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030-011f984d.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.6 | 93.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030-59f5d064.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 81.3 | 94.8 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030-0f56ef51.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.log) |
### SlowOnly-8x8-ResNet50
| Model | Modality | Pretrained | Backbone | Input | Resolution | top1 acc | top5 acc | ckpt | json | log |
| :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -------- | ---------- | -------- | ----- | -------------- | :------: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowonly_r50_8x8x1_256e_minikinetics_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 78.6 | 93.9 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030-168eb098.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 80.8 | 95.0 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030-7da6dfc3.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 81.3 | 95.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030-c36616e9.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 82.4 | 95.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030-e2890e8d.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 80.3 | 94.5 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030-62974bac.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 82.9 | 95.8 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030-284cfd3b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.log) |
For comparison, we also list the benchmark results from the original paper, which were obtained on Kinetics-400:
| Model | Baseline | +GG-img | +\[GG-IG\]-img | +IG-vid | +KRaw | OmniSource |
| :--------------------: | :---------: | :---------: | :------------: | :---------: | :---------: | :---------: |
| TSN-3seg-ResNet50 | 70.6 / 89.4 | 71.5 / 89.5 | 72.0 / 90.0 | 72.0 / 90.3 | 71.7 / 89.6 | 73.6 / 91.0 |
| SlowOnly-4x16-ResNet50 | 73.8 / 90.9 | 74.5 / 91.4 | 75.2 / 91.6 | 75.2 / 91.7 | 74.5 / 91.1 | 76.6 / 92.5 |
## Citation
<!-- [ALGORITHM] -->
```BibTeX
@article{duan2020omni,
title={Omni-sourced Webly-supervised Learning for Video Recognition},
author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua},
journal={arXiv preprint arXiv:2003.13042},
year={2020}
}
```
# Omni-sourced Webly-supervised Learning for Video Recognition
[Haodong Duan](https://github.com/kennymckormick), [Yue Zhao](https://github.com/zhaoyue-zephyrus), [Yuanjun Xiong](https://github.com/yjxiong), Wentao Liu, [Dahua Lin](https://github.com/lindahua)
In ECCV, 2020. [Paper](https://arxiv.org/abs/2003.13042), [Dataset](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link)
![pipeline](https://github.com/open-mmlab/mmaction2/blob/master/configs/recognition/omnisource/pipeline.png?raw=true)
## Model Zoo
### Kinetics-400
MMAction2 currently releases 4 models trained with the OmniSource framework, covering both 2D and 3D architectures. The following table compares the Kinetics-400 accuracy of models trained with and without the OmniSource framework:
| Model | Modality | Pretrained | Backbone | Input | Resolution | Top-1 accuracy (Baseline / OmniSource (Delta)) | Top-5 accuracy (Baseline / OmniSource (Delta)) | Download |
| :------: | :--: | :------: | :-------: | :--: | :------------: | :-----------------------------------------: | :-----------------------------------------: | :------: |
| TSN | RGB | ImageNet | ResNet50 | 3seg | 340x256 | 70.6 / 73.6 (+ 3.0) | 89.4 / 91.0 (+ 1.6) | [Baseline](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_imagenet_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-54192355.pth) |
| TSN | RGB | IG-1B | ResNet50 | 3seg | short-side 320 | 73.1 / 75.7 (+ 2.6) | 90.4 / 91.9 (+ 1.5) | [Baseline](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_without_omni_1x1x3_kinetics400_rgb_20200926-c133dd49.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-2863fed0.pth) |
| SlowOnly | RGB | None | ResNet50 | 4x16 | short-side 320 | 72.9 / 76.8 (+ 3.9) | 90.9 / 92.5 (+ 1.6) | [Baseline](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth) |
| SlowOnly | RGB | None | ResNet101 | 8x8 | short-side 320 | 76.5 / 80.4 (+ 3.9) | 92.7 / 94.4 (+ 1.7) | [Baseline](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_20200926-0c730aef.pth) / [OmniSource](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth) |
1. The Kinetics400 validation set we used consists of 19796 videos, which can be downloaded from [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) (class index to class name) are also provided.
## Benchmark on Mini-Kinetics
The OmniSource project currently releases a subset of the collected web data, covering the 200 action classes of [Mini-Kinetics](https://arxiv.org/pdf/1712.04851.pdf). Detailed statistics of these datasets are recorded in [preparing_omnisource](/tools/data/omnisource/README_zh-CN.md). To obtain the data, users need to fill in a [data request form](https://docs.google.com/forms/d/e/1FAIpQLSd8_GlmHzG8FcDbW-OEu__G7qLgOSYZpH-i5vYVJcu7wcb_TQ/viewform?usp=sf_link); after the form is submitted, the download link will be sent to the user's email. For more details on the released OmniSource web dataset, please refer to [preparing_omnisource](/tools/data/omnisource/README_zh-CN.md).
MMAction2 benchmarks the OmniSource framework on the released datasets; the table below records the detailed results (accuracy on the Mini-Kinetics validation set), which can serve as baselines for video recognition with web data.
### TSN-8seg-ResNet50
| Model | Modality | Pretrained | Backbone | Input | Resolution | top1 acc | top5 acc | ckpt | json | log |
| :---: | :--: | :------: | :------: | :--: | :------------: | :------: | :------: | :--: | :--: | :--: |
| [tsn_r50_1x1x8_100e_minikinetics_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 77.4 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030-b4eaf92b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.0 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030-23966b4b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_webimage_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.6 | 93.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030-66f5e046.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 80.6 | 95.0 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030-011f984d.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 78.6 | 93.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030-59f5d064.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.log) |
| [tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb](/configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb.py) | RGB | ImageNet | ResNet50 | 8seg | short-side 320 | 81.3 | 94.8 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030-0f56ef51.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.log) |
### SlowOnly-8x8-ResNet50
| Model | Modality | Pretrained | Backbone | Input | Resolution | Top-1 Accuracy | Top-5 Accuracy | ckpt | json | log |
| :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--: | :----: | :------: | :--: | :------------: | :----------: | :----------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [slowonly_r50_8x8x1_256e_minikinetics_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 78.6 | 93.9 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030-168eb098.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 80.8 | 95.0 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030-7da6dfc3.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 81.3 | 95.2 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030-c36616e9.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 82.4 | 95.6 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030-e2890e8d.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 80.3 | 94.5 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030-62974bac.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.log) |
| [slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb](/configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb.py) | RGB | None | ResNet50 | 8x8 | short-side 320 | 82.9 | 95.8 | [ckpt](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030-284cfd3b.pth) | [json](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.json) | [log](https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.log) |
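Any checkpoint listed above can be loaded through the high-level recognizer API. The snippet below is a minimal sketch assuming the MMAction2 0.x `mmaction.apis` interface; the config/checkpoint pair comes from the last SlowOnly row, and running it downloads the released weights.
```python
from mmaction.apis import init_recognizer

# Config and released checkpoint from the SlowOnly OmniSource row above.
config = ('configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/'
          'slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb.py')
checkpoint = ('https://download.openmmlab.com/mmaction/recognition/omnisource/'
              'slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/'
              'slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030-284cfd3b.pth')

# Build the recognizer and load the weights.
model = init_recognizer(config, checkpoint, device='cpu')
```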
For reference, the table below lists the benchmarking results on Kinetics-400 reported in the original paper:
| Model | Baseline | +GG-img | +\[GG-IG\]-img | +IG-vid | +KRaw | OmniSource |
| :--------------------: | :---------: | :---------: | :------------: | :---------: | :---------: | :---------: |
| TSN-3seg-ResNet50 | 70.6 / 89.4 | 71.5 / 89.5 | 72.0 / 90.0 | 72.0 / 90.3 | 71.7 / 89.6 | 73.6 / 91.0 |
| SlowOnly-4x16-ResNet50 | 73.8 / 90.9 | 74.5 / 91.4 | 75.2 / 91.6 | 75.2 / 91.7 | 74.5 / 91.1 | 76.6 / 92.5 |
## Notes
If the OmniSource project helps your research, please cite it with the following BibTeX entry:
<!-- [ALGORITHM] -->
```BibTeX
@article{duan2020omni,
title={Omni-sourced Webly-supervised Learning for Video Recognition},
author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua},
journal={arXiv preprint arXiv:2003.13042},
year={2020}
}
```
Collections:
- Name: OmniSource
README: configs/recognition/omnisource/README.md
Paper:
URL: https://arxiv.org/abs/2003.13042
Title: Omni-sourced Webly-supervised Learning for Video Recognition
Models:
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 77.4
Top 5 Accuracy: 93.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/baseline/tsn_r50_1x1x8_100e_minikinetics_rgb_20201030-b4eaf92b.pth
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 78.0
Top 5 Accuracy: 93.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/googleimage/tsn_r50_1x1x8_100e_minikinetics_googleimage_rgb_20201030-23966b4b.pth
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_webimage_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 78.6
Top 5 Accuracy: 93.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/webimage/tsn_r50_1x1x8_100e_minikinetics_webimage_rgb_20201030-66f5e046.pth
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 80.6
Top 5 Accuracy: 95.0
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/insvideo/tsn_r50_1x1x8_100e_minikinetics_insvideo_rgb_20201030-011f984d.pth
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 78.6
Top 5 Accuracy: 93.2
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/kineticsraw/tsn_r50_1x1x8_100e_minikinetics_kineticsraw_rgb_20201030-59f5d064.pth
- Config: configs/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134526976000
Input: 8seg
Modality: RGB
Parameters: 23917832
Pretrained: ImageNet
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 81.3
Top 5 Accuracy: 94.8
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/tsn_r50_1x1x8_100e_minikinetics_rgb/omnisource/tsn_r50_1x1x8_100e_minikinetics_omnisource_rgb_20201030-0f56ef51.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 78.6
Top 5 Accuracy: 93.9
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/baseline/slowonly_r50_8x8x1_256e_minikinetics_rgb_20201030-168eb098.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 80.8
Top 5 Accuracy: 95.0
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/googleimage/slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb_20201030-7da6dfc3.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 81.3
Top 5 Accuracy: 95.2
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/webimage/slowonly_r50_8x8x1_256e_minikinetics_webimage_rgb_20201030-c36616e9.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 82.4
Top 5 Accuracy: 95.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/insvideo/slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb_20201030-e2890e8d.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 80.3
Top 5 Accuracy: 94.5
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/kineticsraw/slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb_20201030-62974bac.pth
- Config: configs/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 256
FLOPs: 54860070912
Input: 8x8
Modality: RGB
Parameters: 32044296
Pretrained: None
Resolution: short-side 320
Training Data: MiniKinetics
Modality: RGB
Name: slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb
Results:
- Dataset: MiniKinetics
Metrics:
Top 1 Accuracy: 82.9
Top 5 Accuracy: 95.8
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.json
Training Log: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030.log
Weights: https://download.openmmlab.com/mmaction/recognition/omnisource/slowonly_r50_8x8x1_256e_minikinetics_rgb/omnisource/slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb_20201030-284cfd3b.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Modality: RGB
Name: tsn_omnisource_r50_1x1x3_100e_kinetics_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/tsn_OmniSource_kinetics400_se_rgb_r50_seg3_f1s1_imagenet-4066cb7e.pth
Code: https://github.com/open-mmlab/mmaction
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.6
Top 5 Accuracy: 91.0
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_imagenet_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-54192355.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: IG-1B
Resolution: short-side 320
Training Data: Kinetics-400
Modality: RGB
Name: tsn_IG1B_pretrained_omnisource_r50_1x1x3_100e_kinetics_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/tsn_OmniSource_kinetics400_se_rgb_r50_seg3_f1s1_IG1B-25fc136b.pth
Code: https://github.com/open-mmlab/mmaction/
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 75.7
Top 5 Accuracy: 91.9
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-2863fed0.pth
- Config: configs/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 256
FLOPs: 27430649856
Parameters: 32454096
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Modality: RGB
Name: slowonly_r50_omnisource_4x16x1_256e_kinetics400_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/slowonly_OmniSource_kinetics400_se_rgb_r50_seg1_4x16_scratch-71f7b8ee.pth
Code: https://github.com/open-mmlab/mmaction/
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 76.8
Top 5 Accuracy: 92.5
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth
- Config: configs/recognition/slowonly/slowonly_r101_8x8x1_196e_kinetics400_rgb.py
In Collection: OmniSource
Metadata:
Architecture: ResNet101
Batch Size: 8
Epochs: 196
FLOPs: 112063447040
Parameters: 60359120
Pretrained: None
Resolution: short-side 320
Training Data: Kinetics-400
Modality: RGB
Name: slowonly_r101_omnisource_8x8x1_196e_kinetics400_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/slowonly_OmniSource_kinetics400_se_rgb_r101_seg1_8x8_scratch-2f838cb0.pth
Code: https://github.com/open-mmlab/mmaction/
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 80.4
Top 5 Accuracy: 94.4
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth
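The metafile above is the machine-readable index of this model zoo, so the same information can be queried programmatically. A minimal sketch, assuming PyYAML is available and the index is saved locally as `metafile.yml` (the filename is an assumption):
```python
import yaml

# List every model in the index together with its reported Top-1 accuracy.
with open('metafile.yml') as f:
    meta = yaml.safe_load(f)

for model in meta['Models']:
    top1 = model['Results'][0]['Metrics']['Top 1 Accuracy']
    print(f"{model['Name']}: {top1}")
```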
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# This flag enables OmniSource joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
gg_root = 'data/OmniSource/googleimage_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_gg = ('data/OmniSource/annotations/googleimage_200/'
'tsn_8seg_googleimage_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
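    # '8x8' sampling: one clip of 8 frames taken with a frame interval of 8.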
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_gg_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
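    # BuildPseudoClip repeats each augmented web image clip_len times, so
    # still images share the NCTHW clip format of the video branch.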
dict(type='BuildPseudoClip', clip_len=8),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_gg,
data_prefix=gg_root,
pipeline=train_gg_pipeline)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_googleimage_rgb')
find_unused_parameters = False
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# This flag enables OmniSource joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
iv_root = 'data/OmniSource/insvideo_200'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
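            # Class-aware sampling for the web videos: a class is picked
            # first, then a clip from it; power=0.5 smooths the long-tailed
            # class distribution so rare classes are sampled more often.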
num_classes=200,
sample_by_class=True,
power=0.5)
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_insvideo_rgb')
find_unused_parameters = False
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# This flag enables OmniSource joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
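            # Raw (untrimmed) videos are stored as pre-cut clips named
            # part_0.mp4, part_1.mp4, ...; the 'positive' strategy samples
            # only clips marked as containing the target action.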
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_kineticsraw_rgb')
find_unused_parameters = False
_base_ = [
'../../../_base_/models/slowonly_r50.py',
'../../../_base_/default_runtime.py'
]
# model settings
model = dict(backbone=dict(pretrained=None), cls_head=dict(num_classes=200))
# dataset settings
dataset_type = 'VideoDataset'
# This flag enables OmniSource joint training
omnisource = True
data_root = 'data/OmniSource/kinetics_200_train'
data_root_val = 'data/OmniSource/kinetics_200_val'
web_root = 'data/OmniSource/'
iv_root = 'data/OmniSource/insvideo_200'
kraw_root = 'data/OmniSource/kinetics_raw_200_train'
ann_file_train = 'data/OmniSource/annotations/kinetics_200/k200_train.txt'
ann_file_web = ('data/OmniSource/annotations/webimage_200/'
'tsn_8seg_webimage_200_wodup.txt')
ann_file_iv = ('data/OmniSource/annotations/insvideo_200/'
'slowonly_8x8_insvideo_200_wodup.txt')
ann_file_kraw = ('data/OmniSource/annotations/kinetics_raw_200/'
'slowonly_8x8_kinetics_raw_200.json')
ann_file_val = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
ann_file_test = 'data/OmniSource/annotations/kinetics_200/k200_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_web_pipeline = [
dict(type='ImageDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='BuildPseudoClip', clip_len=8),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_iv_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
train_kraw_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train_ratio=[2, 1, 1, 1],
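    # Per-iteration sampling ratio over the four training sources below:
    # labeled videos : web images : Instagram videos : raw videos = 2:1:1:1.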
train=[
dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
dict(
type='ImageDataset',
ann_file=ann_file_web,
data_prefix=web_root,
pipeline=train_web_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type=dataset_type,
ann_file=ann_file_iv,
data_prefix=iv_root,
pipeline=train_iv_pipeline,
num_classes=200,
sample_by_class=True,
power=0.5),
dict(
type='RawVideoDataset',
ann_file=ann_file_kraw,
data_prefix=kraw_root,
pipeline=train_kraw_pipeline,
clipname_tmpl='part_{}.mp4',
sampling_strategy='positive')
],
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=8, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
type='SGD', lr=0.15, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='CosineAnnealing', min_lr=0)
# runtime settings
total_epochs = 256
checkpoint_config = dict(interval=8)
work_dir = ('./work_dirs/omnisource/'
'slowonly_r50_8x8x1_256e_minikinetics_omnisource_rgb')
find_unused_parameters = False