Commit 5b3e36dc authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add model TSM

parents
Pipeline #315 failed with stages
in 0 seconds
# TSN test-time config: Kinetics-400 rawframes (short-side 320p),
# 25 single-frame segments, TenCrop @ 224.
_base_ = ['../../../_base_/models/tsn_r50.py']

# dataset settings
dataset_type = 'RawframeDataset'
data_root_val = 'data/kinetics400/rawframes_val_320p'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_320p.txt'

# Per-channel normalization stats; to_bgr=False keeps RGB channel order.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 25,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'TenCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}

dist_params = {'backend': 'nccl'}
# TSN test-time config: Kinetics-400 rawframes (short-side 320p),
# 25 single-frame segments, ThreeCrop @ 256.
_base_ = ['../../../_base_/models/tsn_r50.py']

# dataset settings
dataset_type = 'RawframeDataset'
data_root_val = 'data/kinetics400/rawframes_val_320p'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_320p.txt'

# Per-channel normalization stats; to_bgr=False keeps RGB channel order.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 25,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}

dist_params = {'backend': 'nccl'}
# TSN test-time config: Kinetics-400 rawframes (default resolution),
# 25 single-frame segments, TenCrop @ 224.
_base_ = ['../../../_base_/models/tsn_r50.py']

# dataset settings
dataset_type = 'RawframeDataset'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'

# Per-channel normalization stats; to_bgr=False keeps RGB channel order.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 25,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'TenCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}

dist_params = {'backend': 'nccl'}
# TSN test-time config: Kinetics-400 rawframes (default resolution),
# 25 single-frame segments, ThreeCrop @ 256.
_base_ = ['../../../_base_/models/tsn_r50.py']

# dataset settings
dataset_type = 'RawframeDataset'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'

# Per-channel normalization stats; to_bgr=False keeps RGB channel order.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

test_pipeline = [
    {
        'type': 'SampleFrames',
        'clip_len': 1,
        'frame_interval': 1,
        'num_clips': 25,
        'test_mode': True,
    },
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}

dist_params = {'backend': 'nccl'}
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'action' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'action'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'attribute' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'attribute'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'concept' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'concept'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'event' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'event'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'object' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'object'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
# TSN (ResNet-18 backbone, 1 frame x 8 segments, 100 epochs) on the
# HVU 'scene' tag category, RGB modality, multi-label classification.
_base_ = [
    '../../../_base_/models/tsn_r50.py',
    '../../../_base_/schedules/sgd_100e.py',
    '../../../_base_/default_runtime.py'
]
# model settings
# Tag counts per HVU category; the classifier head is sized for the
# category selected by `target_cate` below.
category_nums = dict(
    action=739, attribute=117, concept=291, event=69, object=1678, scene=248)
target_cate = 'scene'
model = dict(
    # Override the R50 base model with ResNet-18.
    backbone=dict(pretrained='torchvision://resnet18', depth=18),
    cls_head=dict(
        in_channels=512,  # ResNet-18 final feature width
        num_classes=category_nums[target_cate],
        multi_class=True,  # multi-label: per-tag sigmoid + BCE below
        loss_cls=dict(type='BCELossWithLogits', loss_weight=333.)))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/hvu/videos_train'
data_root_val = 'data/hvu/videos_val'
ann_file_train = f'data/hvu/hvu_{target_cate}_train.json'
ann_file_val = f'data/hvu/hvu_{target_cate}_val.json'
ann_file_test = f'data/hvu/hvu_{target_cate}_val.json'  # test on the val split
# to_bgr=False keeps RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Train: 8 single-frame segments, random resized crop -> 224, horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Val: 8 segments, deterministic center crop @ 256.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: denser sampling (25 segments) with ThreeCrop @ 256.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # train batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # batch size 1 at test time
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=category_nums[target_cate]))
# mAP every 2 epochs (multi-label metric).
evaluation = dict(interval=2, metrics=['mean_average_precision'])
# runtime settings
work_dir = f'./work_dirs/tsn_r18_1x1x8_100e_hvu_{target_cate}_rgb/'
Collections:
- Name: TSN
README: configs/recognition/tsn/README.md
Paper:
URL: https://arxiv.org/abs/1608.00859
Title: "Temporal Segment Networks: Towards Good Practices for Deep Action Recognition"
Models:
- Config: configs/recognition/tsn/tsn_r50_1x1x3_75e_ucf101_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 75
FLOPs: 134526773248
Parameters: 23714981
Pretrained: ImageNet
Training Data: UCF101
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x3_75e_ucf101_rgb
Results:
- Dataset: UCF101
Metrics:
Top 1 Accuracy: 83.03
Top 5 Accuracy: 96.78
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_75e_ucf101_rgb/tsn_r50_1x1x3_75e_ucf101_rgb_20201023.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_75e_ucf101_rgb/tsn_r50_1x1x3_75e_ucf101_rgb_20201023.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_75e_ucf101_rgb/tsn_r50_1x1x3_75e_ucf101_rgb_20201023-d85ab600.pth
- Config: configs/recognition/tsn/tsn_r50_video_1x1x8_100e_diving48_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 100
FLOPs: 32959107072
Parameters: 23606384
Pretrained: ImageNet
Training Data: Diving48
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_video_1x1x8_100e_diving48_rgb
Results:
- Dataset: Diving48
Metrics:
Top 1 Accuracy: 71.27
Top 5 Accuracy: 95.74
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_diving48_rgb/20210426_014138.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_diving48_rgb/20210426_014138.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_diving48_rgb/tsn_r50_video_1x1x8_100e_diving48_rgb_20210426-6dde0185.pth
- Config: configs/recognition/tsn/tsn_r50_video_1x1x16_100e_diving48_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 100
FLOPs: 32959107072
Parameters: 23606384
Pretrained: ImageNet
Training Data: Diving48
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_video_1x1x16_100e_diving48_rgb
Results:
- Dataset: Diving48
Metrics:
Top 1 Accuracy: 76.75
Top 5 Accuracy: 96.95
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x16_100e_diving48_rgb/20210426_014103.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x16_100e_diving48_rgb/20210426_014103.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x16_100e_diving48_rgb/tsn_r50_video_1x1x16_100e_diving48_rgb_20210426-63c5f2f7.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 50
FLOPs: 43048605696
Parameters: 23612531
Pretrained: ImageNet
Training Data: HMDB51
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb
Results:
- Dataset: HMDB51
Metrics:
Top 1 Accuracy: 48.95
Top 5 Accuracy: 80.19
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb/20201025_231108.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb/20201025_231108.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb_20201123-ce6c27ed.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 50
FLOPs: 43048605696
Parameters: 23612531
Pretrained: Kinetics400
Training Data: HMDB51
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb
Results:
- Dataset: HMDB51
Metrics:
Top 1 Accuracy: 56.08
Top 5 Accuracy: 84.31
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb/20201108_190805.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb/20201108_190805.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb_20201123-7f84701b.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_mit_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Epochs: 50
FLOPs: 43048605696
Parameters: 23612531
Pretrained: Moments
Training Data: HMDB51
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_50e_hmdb51_mit_rgb
Results:
- Dataset: HMDB51
Metrics:
Top 1 Accuracy: 54.25
Top 5 Accuracy: 83.86
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_mit_rgb/20201112_170135.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_mit_rgb/20201112_170135.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_mit_rgb/tsn_r50_1x1x8_50e_hmdb51_mit_rgb_20201123-01526d41.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.6
Top 5 Accuracy: 89.26
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.42
Top 5 Accuracy: 89.03
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/tsn_r50_256p_1x1x3_100e_kinetics400_rgb_20200725-22592236.pth
- Config: configs/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 16
Epochs: 100
FLOPs: 32959827968
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: tsn_r50_dense_1x1x5_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.18
Top 5 Accuracy: 89.1
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/tsn_r50_dense_1x1x5_100e_kinetics400_rgb_20200627-a063165f.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 134527385600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r50_320p_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.91
Top 5 Accuracy: 89.51
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 110
FLOPs: 109881868800
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: Flow
Name: tsn_r50_320p_1x1x3_110e_kinetics400_flow
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 55.7
Top 5 Accuracy: 79.85
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_320p_1x1x3_110e_kinetics400_flow_20200705-3036bab6.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134527385600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.8
Top 5 Accuracy: 90.17
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/20200815_173413.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/20200815_173413.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134527385600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 24 GPUs
Modality: RGB
Name: tsn_r50_320p_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.41
Top 5 Accuracy: 90.55
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_320p_1x1x8_100e_kinetics400_rgb_20200702-ef80e3d7.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 110
FLOPs: 109881868800
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 32 GPUs
Modality: Flow
Name: tsn_r50_320p_1x1x8_110e_kinetics400_flow
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 57.76
Top 5 Accuracy: 80.99
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_320p_1x1x8_110e_kinetics400_flow_20200705-1f39486b.pth
- Config: configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.11
Top 5 Accuracy: 90.04
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb_20201014.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb_20201014.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb_20201014-5ae1ee79.pth
- Config: configs/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 32959827968
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_dense_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.77
Top 5 Accuracy: 89.3
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/20200606_003901.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/20200606_003901.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_dense_1x1x8_100e_kinetics400_rgb_20200606-e925e6e3.pth
- Config: configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 134527385600
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_video_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 71.14
Top 5 Accuracy: 89.63
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_100e_kinetics400_rgb.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_100e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_1x1x8_100e_kinetics400_rgb_20200702-568cde33.pth
- Config: configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 32959827968
Parameters: 24327632
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-400
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 70.4
Top 5 Accuracy: 89.12
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_dense_100e_kinetics400_rgb.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_dense_100e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb_20200703-0f19175f.pth
- Config: configs/recognition/tsn/custom_backbones/tsn_rn101_32x4d_320p_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNeXt101-32x4d [[MMCls](https://github.com/open-mmlab/mmclassification/tree/master/configs/resnext)]
Batch Size: 16
Epochs: 100
FLOPs: 262238208000
Parameters: 42948304
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_rn101_32x4d_320p_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.43
Top 5 Accuracy: 91.01
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_rn101_32x4d_320p_1x1x3_100e_kinetics400_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_rn101_32x4d_320p_1x1x3_100e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_rn101_32x4d_320p_1x1x3_100e_kinetics400_rgb-16a8b561.pth
- Config: configs/recognition/tsn/custom_backbones/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
    Architecture: Densenet-161 [[TorchVision](https://github.com/pytorch/vision/)]
Batch Size: 12
Epochs: 100
FLOPs: 255225561600
Parameters: 27355600
Pretrained: ImageNet
Resolution: short-side 320
Training Data: Kinetics-400
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_dense161_320p_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 72.78
Top 5 Accuracy: 90.75
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb/tsn_dense161_320p_1x1x3_100e_kinetics400_rgb-cbe85332.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: ImageNet
Resolution: 340x256
Training Data: Kinetics-400
Modality: RGB
Name: tsn_omnisource_r50_1x1x3_100e_kinetics_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/tsn_OmniSource_kinetics400_se_rgb_r50_seg3_f1s1_imagenet-4066cb7e.pth
Code: https://github.com/open-mmlab/mmaction
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.6
Top 5 Accuracy: 91.0
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_imagenet_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-54192355.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: IG-1B
Resolution: short-side 320
Training Data: Kinetics-400
Modality: RGB
Name: tsn_IG1B_pretrained_r50_1x1x3_100e_kinetics_rgb
Converted From:
Weights: https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmaction/models/kinetics400/omnisource/tsn_OmniSource_kinetics400_se_rgb_r50_seg3_f1s1_IG1B-25fc136b.pth
Code: https://github.com/open-mmlab/mmaction/
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 73.1
Top 5 Accuracy: 90.4
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_without_omni_1x1x3_kinetics400_rgb_20200926-c133dd49.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 32
Epochs: 100
FLOPs: 102997721600
Parameters: 24327632
Pretrained: IG-1B
Resolution: short-side 320
Training Data: Kinetics-400
Modality: RGB
Name: tsn_IG1B_pretrained_omnisource_r50_1x1x3_100e_kinetics_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 75.7
Top 5 Accuracy: 91.9
Task: Action Recognition
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/omni/tsn_1G1B_pretrained_r50_omni_1x1x3_kinetics400_rgb_20200926-2863fed0.pth
- Config: configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134527795200
Parameters: 24737432
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-600
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r50_video_1x1x8_100e_kinetics600_rgb
Results:
- Dataset: Kinetics-600
Metrics:
Top 1 Accuracy: 74.8
Top 5 Accuracy: 92.3
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015-4db3c461.pth
- Config: configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 12
Epochs: 100
FLOPs: 134528000000
Parameters: 24942332
Pretrained: ImageNet
Resolution: short-side 256
Training Data: Kinetics-700
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r50_video_1x1x8_100e_kinetics700_rgb
Results:
- Dataset: Kinetics-700
Metrics:
Top 1 Accuracy: 61.7
Top 5 Accuracy: 83.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015-e381a6c7.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 16
Epochs: 50
FLOPs: 32781541376
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 18.55
Top 5 Accuracy: 44.8
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_r50_f8_sthv1_18.1_45.0.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_sthv1.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_r50_1x1x8_50e_sthv1_rgb_20200618-061b9195.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 50
FLOPs: 32781541376
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 100
Training Data: SthV1
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x16_50e_sthv1_rgb
Results:
- Dataset: SthV1
Metrics:
Top 1 Accuracy: 15.77
Top 5 Accuracy: 39.85
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/20200614_211932.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/20200614_211932.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/tsn_r50_1x1x16_50e_sthv1_rgb_20200614-7e2fe4f1.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 16
Epochs: 50
FLOPs: 32959365120
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 256
Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x8_50e_sthv2_rgb
Results:
- Dataset: SthV2
Metrics:
Top 1 Accuracy: 28.59
Top 5 Accuracy: 59.56
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/20210816_221116.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/20210816_221116.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/tsn_r50_1x1x8_50e_sthv2_rgb_20210816-1aafee8f.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 4
Epochs: 50
FLOPs: 65918373888
Parameters: 23864558
Pretrained: ImageNet
Resolution: height 256
Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_1x1x16_50e_sthv2_rgb
Results:
- Dataset: SthV2
Metrics:
Top 1 Accuracy: 20.89
Top 5 Accuracy: 49.16
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/20210816_225256.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/20210816_225256.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/tsn_r50_1x1x16_50e_sthv2_rgb_20210816-5d23ac6e.pth
- Config: configs/recognition/tsn/tsn_r50_1x1x6_100e_mit_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 16
Epochs: 100
FLOPs: 32287070208
Parameters: 24202643
Pretrained: ImageNet
Resolution: short-side 256
Training Data: MiT
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r50_1x1x6_100e_mit_rgb
Results:
- Dataset: MiT
Metrics:
Top 1 Accuracy: 26.84
Top 5 Accuracy: 51.6
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x6_100e_mit_rgb/tsn_r50_f6_mit_26.8_51.6.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x6_100e_mit_rgb/tsn_mit.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_1x1x6_100e_mit_rgb/tsn_r50_1x1x6_100e_mit_rgb_20200618-d512ab1b.pth
- Config: configs/recognition/tsn/tsn_r101_1x1x5_50e_mmit_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet101
Batch Size: 16
Epochs: 50
FLOPs: 51249301504
Parameters: 43141497
Pretrained: ImageNet
Resolution: short-side 256
Training Data: MMiT
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r101_1x1x5_50e_mmit_rgb
Results:
- Dataset: MMiT
Metrics:
mAP: 61.09
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r101_1x1x5_50e_mmit_rgb/tsn_r101_f6_mmit_61.1.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r101_1x1x5_50e_mmit_rgb/tsn_mmit.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r101_1x1x5_50e_mmit_rgb/tsn_r101_1x1x5_50e_mmit_rgb_20200618-642f450d.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 134526976000
Parameters: 23917832
Pretrained: Kinetics400
Resolution: short-side 320
Training Data: ActivityNet v1.3
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_320p_1x1x8_50e_activitynet_video_rgb
Results:
- Dataset: ActivityNet v1.3
Metrics:
Top 1 Accuracy: 73.93
Top 5 Accuracy: 93.44
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb/20210228_223327.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb/20210228_223327.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb_20210301-7f8da0c6.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 50
FLOPs: 134526976000
Parameters: 23917832
Pretrained: Kinetics400
Resolution: short-side 320
Training Data: ActivityNet v1.3
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb
Results:
- Dataset: ActivityNet v1.3
Metrics:
Top 1 Accuracy: 76.9
Top 5 Accuracy: 94.47
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb/20210217_181313.log.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb/20210217_181313.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb_20210301-c0f04a7e.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 150
FLOPs: 109881459200
Parameters: 23939784
Pretrained: Kinetics400
Resolution: 340x256
Training Data: ActivityNet v1.3
Training Resources: 16 GPUs
Modality: Flow
Name: tsn_r50_320p_1x1x8_150e_activitynet_video_flow
Results:
- Dataset: ActivityNet v1.3
Metrics:
Top 1 Accuracy: 57.51
Top 5 Accuracy: 83.02
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow/tsn_r50_320p_1x1x8_150e_activitynet_video_flow_20200804.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow/tsn_r50_320p_1x1x8_150e_activitynet_video_flow_20200804.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow/tsn_r50_320p_1x1x8_150e_activitynet_video_flow_20200804-13313f52.pth
- Config: configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow.py
In Collection: TSN
Metadata:
Architecture: ResNet50
Batch Size: 8
Epochs: 150
FLOPs: 109881459200
Parameters: 23939784
Pretrained: Kinetics400
Resolution: 340x256
Training Data: ActivityNet v1.3
Training Resources: 16 GPUs
Modality: Flow
Name: tsn_r50_320p_1x1x8_150e_activitynet_clip_flow
Results:
- Dataset: ActivityNet v1.3
Metrics:
Top 1 Accuracy: 59.51
Top 5 Accuracy: 82.69
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow_20200804.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow_20200804.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow_20200804-8622cf38.pth
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_action_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59483309568
Parameters: 11555619
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 16 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_action_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 57.5
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/action/tsn_r18_1x1x8_100e_hvu_action_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/action/tsn_r18_1x1x8_100e_hvu_action_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/action/tsn_r18_1x1x8_100e_hvu_action_rgb_20201027-011b282b.pth
tag category: action
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_scene_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59483058176
Parameters: 11303736
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_scene_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 55.2
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/scene/tsn_r18_1x1x8_100e_hvu_scene_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/scene/tsn_r18_1x1x8_100e_hvu_scene_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/scene/tsn_r18_1x1x8_100e_hvu_scene_rgb_20201027-00e5748d.pth
tag category: scene
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_object_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59483790336
Parameters: 12037326
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_object_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 45.7
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/object/tsn_r18_1x1x8_100e_hvu_object_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/object/tsn_r18_1x1x8_100e_hvu_object_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/object/tsn_r18_1x1x8_100e_hvu_object_rgb_20201102-24a22f30.pth
tag category: object
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_event_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59482966528
Parameters: 11211909
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_event_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 63.7
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/event/tsn_r18_1x1x8_100e_hvu_event_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/event/tsn_r18_1x1x8_100e_hvu_event_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/event/tsn_r18_1x1x8_100e_hvu_event_rgb_20201027-dea8cd71.pth
tag category: event
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_concept_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59483790336
Parameters: 12037326
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_concept_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 47.5
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/concept/tsn_r18_1x1x8_100e_hvu_concept_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/concept/tsn_r18_1x1x8_100e_hvu_concept_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/concept/tsn_r18_1x1x8_100e_hvu_concept_rgb_20201027-fc1dd8e3.pth
tag category: concept
- Config: configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_attribute_rgb.py
In Collection: TSN
Metadata:
Architecture: ResNet18
Batch Size: 32
Epochs: 100
FLOPs: 59482991104
Parameters: 11236533
Pretrained: ImageNet
Resolution: short-side 256
Training Data: HVU
Training Resources: 8 GPUs
Modality: RGB
Name: tsn_r18_1x1x8_100e_hvu_attribute_rgb
Results:
- Dataset: HVU
Metrics:
mAP: 46.1
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/attribute/tsn_r18_1x1x8_100e_hvu_attribute_rgb_20201027.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/attribute/tsn_r18_1x1x8_100e_hvu_attribute_rgb_20201027.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/hvu/attribute/tsn_r18_1x1x8_100e_hvu_attribute_rgb_20201027-0b3b49d2.pth
tag category: attribute
- Config: configs/recognition/tsn/custom_backbones/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb.py
In Collection: TSN
Metadata:
Architecture: Swin Transformer
Batch Size: 24
Epochs: 100
Parameters: 87153224
Pretrained: ImageNet
Resolution: short-side 320
    Training Data: Kinetics-400
Training Resources: 8 GPUs
Name: tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb
Results:
- Dataset: Kinetics-400
Metrics:
Top 1 Accuracy: 77.51
Top 5 Accuracy: 92.92
Task: Action Recognition
Training Json Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb.json
Training Log: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb.log
Weights: https://download.openmmlab.com/mmaction/recognition/tsn/custom_backbones/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb/tsn_swin_transformer_video_320p_1x1x3_100e_kinetics400_rgb-805380f6.pth
# TSN trained with mixed precision (fp16): ResNet-50 backbone, 3 single-frame
# segments per video (1x1x3), 100 epochs on Kinetics-400 raw frames.
# Model, LR schedule and runtime defaults are inherited from the _base_ files.
_base_ = [
    '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py',
    '../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
# ImageNet mean/std; to_bgr=False keeps the RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 3 segments, multi-scale crop + random horizontal flip augmentation.
train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Validation: 3 segments, deterministic 224 center crop, no augmentation.
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=3,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: 25 segments, each expanded to 10 spatial crops (TenCrop).
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=25,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='TenCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,  # training batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),  # heavy 25x10-crop test inference
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# fp16 settings
fp16 = dict()  # empty dict enables mixed-precision training with default options
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_fp16_r50_1x1x3_100e_kinetics400_rgb/'
# TSN with a ResNet-101 backbone on Multi-Moments in Time (MMiT):
# multi-label recognition over 313 classes, 5 single-frame segments per video
# (1x1x5), 50 epochs. Schedule and runtime come from the _base_ configs.
_base_ = [
    '../../_base_/schedules/sgd_tsm_50e.py', '../../_base_/default_runtime.py'
]
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNet',
        pretrained='torchvision://resnet101',
        depth=101,
        norm_eval=False),
    cls_head=dict(
        type='TSNHead',
        num_classes=313,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        # Multi-label task: per-class sigmoid + BCE instead of softmax CE.
        loss_cls=dict(type='BCELossWithLogits', loss_weight=160.0),
        dropout_ratio=0.5,
        init_std=0.01,
        multi_class=True,
        label_smooth_eps=0),
    train_cfg=None,
    test_cfg=dict(average_clips=None))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/mmit/videos'
# BUGFIX: was '/data/mmit/videos' (stray leading slash made it an absolute
# path); val/test videos live under the same relative prefix as training.
data_root_val = 'data/mmit/videos'
ann_file_train = 'data/mmit/mmit_train_list_videos.txt'
ann_file_val = 'data/mmit/mmit_val_list_videos.txt'
ann_file_test = 'data/mmit/mmit_val_list_videos.txt'
# ImageNet mean/std; to_bgr=False keeps the RGB channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 5 segments, multi-scale crop + random horizontal flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
# Validation: 5 segments, deterministic 224 center crop.
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=5,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Test: same as validation but with a larger 256 center crop.
test_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=5,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=16,  # training batch size per GPU
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline,
        multi_class=True,
        num_classes=313),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        multi_class=True,
        num_classes=313),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        multi_class=True,
        num_classes=313))
evaluation = dict(interval=5, metrics=['mmit_mean_average_precision'])
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_r101_1x1x5_50e_mmit_rgb/'
_base_ = ['./tsn_r50_1x1x8_50e_sthv1_rgb.py']
# model settings
model = dict(cls_head=dict(init_std=0.001))
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sthv1/rawframes'
data_root_val = 'data/sthv1/rawframes'
ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt'
ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 16 single-frame segments (this is the 1x1x16 variant of the
# inherited 1x1x8 base config).
train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1,
        num_fixed_crops=13),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        # Fixed: was 8, i.e. identical to the inherited 1x1x8 base and
        # inconsistent with the 16-segment training above. This 1x1x16
        # variant evaluates with 16 segments as well (cf. the sthv2
        # 1x1x16 config in this series).
        num_clips=16,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        # Fixed: was 8 — see the note in val_pipeline.
        num_clips=16,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=4,
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        filename_tmpl='{:05}.jpg',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        filename_tmpl='{:05}.jpg',
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        filename_tmpl='{:05}.jpg',
        pipeline=test_pipeline))
evaluation = dict(
    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9,
    weight_decay=0.0005)  # this lr is used for 8 gpus
# runtime settings
checkpoint_config = dict(interval=1)
work_dir = './work_dirs/tsn_r50_1x1x16_50e_sthv1_rgb/'
_base_ = ['./tsn_r50_1x1x8_50e_sthv2_rgb.py']

# model settings: only tighten the classifier init over the 1x1x8 base.
model = {'cls_head': {'init_std': 0.001}}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sthv2/rawframes'
data_root_val = 'data/sthv2/rawframes'
ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt'
ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt'
ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 16 single-frame segments with multi-scale cropping.
# NOTE(review): there is deliberately no Flip step here — presumably because
# Something-Something labels are direction-sensitive; confirm before adding.
train_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 16},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'MultiScaleCrop',
     'input_size': 224,
     'scales': (1, 0.875, 0.75, 0.66),
     'random_crop': False,
     'max_wh_scale_gap': 1},
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: deterministic 16-segment sampling, 224x224 center crop.
val_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 16, 'test_mode': True},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: deterministic 16-segment sampling with three-crop augmentation.
test_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 16, 'test_mode': True},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 4,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 2,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'type': 'SGD',
    'lr': 0.005,  # this lr is used for 8 gpus
    'momentum': 0.9,
    'weight_decay': 0.0005,
}
# optimizer config
optimizer_config = {'grad_clip': {'max_norm': 20, 'norm_type': 2}}
# runtime settings
checkpoint_config = {'interval': 1}
work_dir = './work_dirs/tsn_r50_1x1x16_50e_sthv2_rgb/'
_base_ = [
'../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=3,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='TenCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=2,
test_dataloader=dict(videos_per_gpu=1),
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_r50_1x1x3_100e_kinetics400_rgb/'
_base_ = ['../../_base_/models/tsn_r50.py', '../../_base_/default_runtime.py']

# model settings: 101 UCF classes, tighter classifier init for fine-tuning.
model = {'cls_head': {'num_classes': 101, 'init_std': 0.001}}

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/ucf101/rawframes/'
data_root_val = 'data/ucf101/rawframes/'
split = 1  # official train/test splits. valid numbers: 1, 2, 3
ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt'
ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}

# Training: 3 single-frame segments, random resized crop + horizontal flip.
train_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 3},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'RandomResizedCrop'},
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
    {'type': 'Flip', 'flip_ratio': 0.5},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs', 'label']},
]

# Validation: deterministic 3-segment sampling, 256x256 center crop.
val_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 3, 'test_mode': True},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'CenterCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

# Testing: dense 25-segment sampling with three-crop augmentation.
test_pipeline = [
    {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
     'num_clips': 25, 'test_mode': True},
    {'type': 'RawFrameDecode'},
    {'type': 'Resize', 'scale': (-1, 256)},
    {'type': 'ThreeCrop', 'crop_size': 256},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]

data = {
    'videos_per_gpu': 32,
    'workers_per_gpu': 2,
    'test_dataloader': {'videos_per_gpu': 1},
    'train': {
        'type': dataset_type,
        'ann_file': ann_file_train,
        'data_prefix': data_root,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': dataset_type,
        'ann_file': ann_file_val,
        'data_prefix': data_root_val,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': dataset_type,
        'ann_file': ann_file_test,
        'data_prefix': data_root_val,
        'pipeline': test_pipeline,
    },
}
evaluation = {
    'interval': 5,
    'metrics': ['top_k_accuracy', 'mean_class_accuracy'],
}

# optimizer
optimizer = {
    'type': 'SGD',
    'lr': 0.00128,  # this lr is used for 8 gpus
    'momentum': 0.9,
    'weight_decay': 0.0005,
}
optimizer_config = {'grad_clip': {'max_norm': 40, 'norm_type': 2}}
# learning policy: constant lr (no decay steps) over the 75 epochs.
lr_config = {'policy': 'step', 'step': []}
total_epochs = 75
# runtime settings
checkpoint_config = {'interval': 5}
work_dir = f'./work_dirs/tsn_r50_1x1x3_75e_ucf101_split_{split}_rgb/'
_base_ = [
    '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_100e.py',
    '../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=339))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/mit/videos/training'
# Fixed: was the absolute path '/data/mit/videos/validation/'; every other
# path in this config is relative to the working directory.
data_root_val = 'data/mit/videos/validation/'
ann_file_train = 'data/mit/mit_train_list_videos.txt'
ann_file_val = 'data/mit/mit_val_list_videos.txt'
ann_file_test = 'data/mit/mit_val_list_videos.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 6 single-frame segments, multi-scale crop + random flip.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=6),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.8),
        random_crop=False,
        max_wh_scale_gap=0),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=6,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    # Fixed: the first step was 'DecordDecode', which cannot run before the
    # video reader is initialised; it must be 'DecordInit' (as in
    # train_pipeline/val_pipeline), followed by 'DecordDecode' after sampling.
    dict(type='DecordInit'),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=6,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=16,
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
# optimizer
optimizer = dict(
    type='SGD', lr=0.005, momentum=0.9,
    weight_decay=0.0001)  # this lr is used for 8 gpus
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_r50_1x1x6_100e_mit_rgb'
_base_ = [
    '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_50e.py',
    '../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=51))
# dataset settings
split = 1
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = f'data/hmdb51/hmdb51_train_split_{split}_rawframes.txt'
ann_file_val = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
ann_file_test = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 8 single-frame segments, random resized crop + horizontal flip.
train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        # Fixed: was ann_file_val; use the dedicated test annotation variable
        # (same file today, but this keeps the config consistent with the
        # other configs and correct if ann_file_test ever diverges).
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=1, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
# optimizer
optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001)
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb/'
gpu_ids = range(0, 1)
_base_ = [
    '../../_base_/models/tsn_r50.py', '../../_base_/schedules/sgd_50e.py',
    '../../_base_/default_runtime.py'
]
# model settings
model = dict(cls_head=dict(num_classes=51))
# dataset settings
split = 1
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = f'data/hmdb51/hmdb51_train_split_{split}_rawframes.txt'
ann_file_val = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
ann_file_test = f'data/hmdb51/hmdb51_val_split_{split}_rawframes.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Training: 8 single-frame segments, random resized crop + horizontal flip.
train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=32,
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        # Fixed: was ann_file_val; use the dedicated test annotation variable
        # (same file today, but this keeps the config consistent with the
        # other configs and correct if ann_file_test ever diverges).
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=1, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
# optimizer
optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001)
# runtime settings
checkpoint_config = dict(interval=5)
work_dir = './work_dirs/tsn_r50_1x1x8_50e_hmdb51_kinetics400_rgb/'
# Initialise from the Kinetics-400 pre-trained TSN checkpoint.
load_from = 'https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth'  # noqa: E501
gpu_ids = range(0, 1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment