# model settings
#
# `model` is a nested configuration dict with a 2D recognizer at the top
# level, composed of a backbone, a neck, and a classification head, plus
# train/test settings. Every `type` string names a component that is resolved
# outside this file.
# NOTE(review): presumably consumed by an MMAction2-style registry/builder --
# confirm against the code that loads this config.
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        # Two backbone stages are exposed; the neck below declares two
        # matching input widths.
        out_indices=(2, 3),
        norm_eval=False,
        shift_div=8),
    neck=dict(
        type='TPN',
        # NOTE(review): presumably the channel widths of the two stages
        # selected by `out_indices` above -- confirm against the backbone.
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            # Same value as `cls_head.in_channels` below (2048).
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        # Same value as `cls_head.num_classes` below (174).
        aux_head_cfg=dict(out_channels=174, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=174,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob', fcn_test=True))