# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
from unittest.mock import Mock, patch

import numpy as np
import pytest
import torch
import torch.nn as nn

import mmaction
from mmaction.models import (ACRNHead, AudioTSNHead, BBoxHeadAVA, FBOHead,
                             I3DHead, LFBInferHead, SlowFastHead, STGCNHead,
                             TimeSformerHead, TPNHead, TRNHead, TSMHead,
                             TSNHead, X3DHead)
from .base import generate_backbone_demo_inputs


def test_i3d_head():
    """Test loss method, layer construction, attributes and forward function
    in i3d head."""
    i3d_head = I3DHead(num_classes=4, in_channels=2048)
    i3d_head.init_weights()

    assert i3d_head.num_classes == 4
    assert i3d_head.dropout_ratio == 0.5
    assert i3d_head.in_channels == 2048
    assert i3d_head.init_std == 0.01

    assert isinstance(i3d_head.dropout, nn.Dropout)
    assert i3d_head.dropout.p == i3d_head.dropout_ratio

    assert isinstance(i3d_head.fc_cls, nn.Linear)
    assert i3d_head.fc_cls.in_features == i3d_head.in_channels
    assert i3d_head.fc_cls.out_features == i3d_head.num_classes

    assert isinstance(i3d_head.avg_pool, nn.AdaptiveAvgPool3d)
    assert i3d_head.avg_pool.output_size == (1, 1, 1)

    input_shape = (3, 2048, 4, 7, 7)
    feat = torch.rand(input_shape)

    # i3d head inference
    cls_scores = i3d_head(feat)
    assert cls_scores.shape == torch.Size([3, 4])
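
# A minimal, illustrative sketch (plain torch, not mmaction internals, and not
# collected by pytest) of the pooling + classification pattern that
# test_i3d_head above exercises: a (N, C, T, H, W) feature map is averaged
# over (T, H, W) and then classified per clip, which is why the head maps a
# (3, 2048, 4, 7, 7) input to (3, 4) scores.
def _i3d_head_shape_sketch():
    feat = torch.rand(3, 2048, 4, 7, 7)
    pooled = nn.AdaptiveAvgPool3d((1, 1, 1))(feat)  # -> (3, 2048, 1, 1, 1)
    scores = nn.Linear(2048, 4)(pooled.flatten(1))  # -> (3, 4)
    assert scores.shape == torch.Size([3, 4])
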
def test_bbox_head_ava():
    """Test loss method, layer construction, attributes and forward function
    in bbox head."""
    with pytest.raises(TypeError):
        # topk must be None, int or tuple[int]
        BBoxHeadAVA(topk=0.1)

    with pytest.raises(AssertionError):
        # topk should be smaller than num_classes
        BBoxHeadAVA(num_classes=5, topk=(3, 5))

    bbox_head = BBoxHeadAVA(in_channels=10, num_classes=4, topk=1)
    input = torch.randn([3, 10, 2, 2, 2])
    ret, _ = bbox_head(input)
    assert ret.shape == (3, 4)

    cls_score = torch.tensor(
        [[0.568, -0.162, 0.273, -0.390, 0.447, 0.102, -0.409],
         [2.388, 0.609, 0.369, 1.630, -0.808, -0.212, 0.296],
         [0.252, -0.533, -0.644, -0.591, 0.148, 0.963, -0.525],
         [0.134, -0.311, -0.764, -0.752, 0.656, -1.517, 0.185]])
    labels = torch.tensor([[0., 0., 1., 0., 0., 1., 0.],
                           [0., 0., 0., 1., 0., 0., 0.],
                           [0., 1., 0., 0., 1., 0., 1.],
                           [0., 0., 1., 1., 0., 0., 1.]])
    label_weights = torch.tensor([1., 1., 1., 1.])

    # Test topk_to_matrix()
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 1),
        torch.tensor([[0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0],
                      [0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 2),
        torch.tensor([[0, 1, 0, 1, 0, 0], [1, 0, 1, 0, 0, 0],
                      [0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 0, 1]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 3),
        torch.tensor([[0, 1, 0, 1, 1, 0], [1, 1, 1, 0, 0, 0],
                      [0, 0, 0, 1, 1, 1], [1, 0, 0, 1, 0, 1]],
                     dtype=bool))
    assert torch.equal(
        BBoxHeadAVA.topk_to_matrix(cls_score[:, 1:], 6),
        torch.ones([4, 6], dtype=bool))

    # Test Multi-Label Loss
    # default pooling settings; checks that construction and init_weights work
    bbox_head = BBoxHeadAVA()
    bbox_head.init_weights()
    bbox_head = BBoxHeadAVA(temporal_pool_type='max', spatial_pool_type='avg')
    bbox_head.init_weights()
    losses = bbox_head.loss(
        cls_score=cls_score,
        bbox_pred=None,
        rois=None,
        labels=labels,
        label_weights=label_weights)
    assert torch.isclose(losses['loss_action_cls'], torch.tensor(0.7162495))
    assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.6666666))
    assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.4791665))
    assert torch.isclose(losses['recall@top3'], torch.tensor(0.75))
    assert torch.isclose(losses['prec@top3'], torch.tensor(0.5))
    assert torch.isclose(losses['recall@top5'], torch.tensor(1.0))
    assert torch.isclose(losses['prec@top5'], torch.tensor(0.45))

    # Test Single-Label Loss
    bbox_head = BBoxHeadAVA(multilabel=False)
    losses = bbox_head.loss(
        cls_score=cls_score,
        bbox_pred=None,
        rois=None,
        labels=labels,
        label_weights=label_weights)
    assert torch.isclose(losses['loss_action_cls'], torch.tensor(1.639561))
    assert torch.isclose(losses['recall@thr=0.5'], torch.tensor(0.25))
    assert torch.isclose(losses['prec@thr=0.5'], torch.tensor(0.25))
    assert torch.isclose(losses['recall@top3'], torch.tensor(0.75))
    assert torch.isclose(losses['prec@top3'], torch.tensor(0.5))
    assert torch.isclose(losses['recall@top5'], torch.tensor(1.0))
    assert torch.isclose(losses['prec@top5'], torch.tensor(0.45))

    # Test ROI
    rois = torch.tensor([[0.0, 0.1, 0.2, 0.3, 0.4],
                         [0.0, 0.5, 0.6, 0.7, 0.8]])
    rois[1::2] *= 380
    rois[2::2] *= 220
    crop_quadruple = np.array([0.1, 0.2, 0.8, 0.7])
    cls_score = torch.tensor([0.995, 0.728])
    img_shape = (320, 480)
    flip = True

    bbox_head = BBoxHeadAVA(multilabel=True)
    bboxes, scores = bbox_head.get_det_bboxes(
        rois=rois,
        cls_score=cls_score,
        img_shape=img_shape,
        flip=flip,
        crop_quadruple=crop_quadruple)
    assert torch.all(
        torch.isclose(
            bboxes,
            torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500],
                          [0.45499998, 0.69875002, 0.58166665, 0.86499995]])))
    assert torch.all(
        torch.isclose(scores, torch.tensor([0.73007441, 0.67436624])))

    bbox_head = BBoxHeadAVA(multilabel=False)
    bboxes, scores = bbox_head.get_det_bboxes(
        rois=rois,
        cls_score=cls_score,
        img_shape=img_shape,
        flip=flip,
        crop_quadruple=crop_quadruple)
    assert torch.all(
        torch.isclose(
            bboxes,
            torch.tensor([[0.89783341, 0.20043750, 0.89816672, 0.20087500],
                          [0.45499998, 0.69875002, 0.58166665, 0.86499995]])))
    assert torch.all(torch.isclose(scores, torch.tensor([0.56636, 0.43364])))


def test_x3d_head():
    """Test loss method, layer construction, attributes and forward function
    in x3d head."""
    x3d_head = X3DHead(in_channels=432, num_classes=4, fc1_bias=False)
    x3d_head.init_weights()

    assert x3d_head.num_classes == 4
    assert x3d_head.dropout_ratio == 0.5
    assert x3d_head.in_channels == 432
    assert x3d_head.init_std == 0.01

    assert isinstance(x3d_head.dropout, nn.Dropout)
    assert x3d_head.dropout.p == x3d_head.dropout_ratio

    assert isinstance(x3d_head.fc1, nn.Linear)
    assert x3d_head.fc1.in_features == x3d_head.in_channels
    assert x3d_head.fc1.out_features == x3d_head.mid_channels
    assert x3d_head.fc1.bias is None

    assert isinstance(x3d_head.fc2, nn.Linear)
    assert x3d_head.fc2.in_features == x3d_head.mid_channels
    assert x3d_head.fc2.out_features == x3d_head.num_classes

    assert isinstance(x3d_head.pool, nn.AdaptiveAvgPool3d)
    assert x3d_head.pool.output_size == (1, 1, 1)

    input_shape = (3, 432, 4, 7, 7)
    feat = torch.rand(input_shape)

    # x3d head inference
    cls_scores = x3d_head(feat)
    assert cls_scores.shape == torch.Size([3, 4])
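
# Hedged sketch (plain torch, not the SlowFastHead implementation, not
# collected by pytest) of why test_slowfast_head below builds the head with
# in_channels=2304: the pooled slow-pathway (2048-ch) and fast-pathway
# (256-ch) features are concatenated before the final classifier.
def _slowfast_fusion_sketch():
    feat_slow = torch.rand(3, 2048, 32, 7, 7)
    feat_fast = torch.rand(3, 256, 4, 7, 7)
    pool = nn.AdaptiveAvgPool3d((1, 1, 1))
    fused = torch.cat((pool(feat_slow), pool(feat_fast)), dim=1)  # 2048 + 256
    scores = nn.Linear(2304, 4)(fused.flatten(1))
    assert scores.shape == torch.Size([3, 4])
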
def test_slowfast_head():
    """Test loss method, layer construction, attributes and forward function
    in slowfast head."""
    sf_head = SlowFastHead(num_classes=4, in_channels=2304)
    sf_head.init_weights()

    assert sf_head.num_classes == 4
    assert sf_head.dropout_ratio == 0.8
    assert sf_head.in_channels == 2304
    assert sf_head.init_std == 0.01

    assert isinstance(sf_head.dropout, nn.Dropout)
    assert sf_head.dropout.p == sf_head.dropout_ratio

    assert isinstance(sf_head.fc_cls, nn.Linear)
    assert sf_head.fc_cls.in_features == sf_head.in_channels
    assert sf_head.fc_cls.out_features == sf_head.num_classes

    assert isinstance(sf_head.avg_pool, nn.AdaptiveAvgPool3d)
    assert sf_head.avg_pool.output_size == (1, 1, 1)

    input_shape = (3, 2048, 32, 7, 7)
    feat_slow = torch.rand(input_shape)

    input_shape = (3, 256, 4, 7, 7)
    feat_fast = torch.rand(input_shape)

    sf_head = SlowFastHead(num_classes=4, in_channels=2304)
    cls_scores = sf_head((feat_slow, feat_fast))
    assert cls_scores.shape == torch.Size([3, 4])


def test_tsn_head():
    """Test loss method, layer construction, attributes and forward function
    in tsn head."""
    tsn_head = TSNHead(num_classes=4, in_channels=2048)
    tsn_head.init_weights()

    assert tsn_head.num_classes == 4
    assert tsn_head.dropout_ratio == 0.4
    assert tsn_head.in_channels == 2048
    assert tsn_head.init_std == 0.01
    assert tsn_head.consensus.dim == 1
    assert tsn_head.spatial_type == 'avg'

    assert isinstance(tsn_head.dropout, nn.Dropout)
    assert tsn_head.dropout.p == tsn_head.dropout_ratio

    assert isinstance(tsn_head.fc_cls, nn.Linear)
    assert tsn_head.fc_cls.in_features == tsn_head.in_channels
    assert tsn_head.fc_cls.out_features == tsn_head.num_classes

    assert isinstance(tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert tsn_head.avg_pool.output_size == (1, 1)

    input_shape = (8, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # tsn head inference
    num_segs = input_shape[0]
    cls_scores = tsn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])

    # Test multi-class recognition
    multi_tsn_head = TSNHead(
        num_classes=4,
        in_channels=2048,
        loss_cls=dict(type='BCELossWithLogits', loss_weight=160.0),
        multi_class=True,
        label_smooth_eps=0.01)
    multi_tsn_head.init_weights()
    assert multi_tsn_head.num_classes == 4
    assert multi_tsn_head.dropout_ratio == 0.4
    assert multi_tsn_head.in_channels == 2048
    assert multi_tsn_head.init_std == 0.01
    assert multi_tsn_head.consensus.dim == 1

    assert isinstance(multi_tsn_head.dropout, nn.Dropout)
    assert multi_tsn_head.dropout.p == multi_tsn_head.dropout_ratio

    assert isinstance(multi_tsn_head.fc_cls, nn.Linear)
    assert multi_tsn_head.fc_cls.in_features == multi_tsn_head.in_channels
    assert multi_tsn_head.fc_cls.out_features == multi_tsn_head.num_classes

    assert isinstance(multi_tsn_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert multi_tsn_head.avg_pool.output_size == (1, 1)

    input_shape = (8, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # multi-class tsn head inference
    num_segs = input_shape[0]
    cls_scores = multi_tsn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])


def test_tsn_head_audio():
    """Test loss method, layer construction, attributes and forward function
    in audio tsn head."""
    tsn_head_audio = AudioTSNHead(num_classes=4, in_channels=5)
    tsn_head_audio.init_weights()

    assert tsn_head_audio.num_classes == 4
    assert tsn_head_audio.dropout_ratio == 0.4
    assert tsn_head_audio.in_channels == 5
    assert tsn_head_audio.init_std == 0.01
    assert tsn_head_audio.spatial_type == 'avg'

    assert isinstance(tsn_head_audio.dropout, nn.Dropout)
    assert tsn_head_audio.dropout.p == tsn_head_audio.dropout_ratio

    assert isinstance(tsn_head_audio.fc_cls, nn.Linear)
    assert tsn_head_audio.fc_cls.in_features == tsn_head_audio.in_channels
    assert tsn_head_audio.fc_cls.out_features == tsn_head_audio.num_classes

    assert isinstance(tsn_head_audio.avg_pool, nn.AdaptiveAvgPool2d)
    assert tsn_head_audio.avg_pool.output_size == (1, 1)

    input_shape = (8, 5, 7, 7)
    feat = torch.rand(input_shape)

    # audio tsn head inference
    cls_scores = tsn_head_audio(feat)
    assert cls_scores.shape == torch.Size([8, 4])
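
# Hedged sketch (plain torch, not the mmaction implementation, not collected
# by pytest) of the segment consensus used by the TSN/TSM heads above and
# below: per-segment scores from an (8, C, 7, 7) input are reshaped to
# (batch, num_segs, num_classes) and averaged over the segment axis, which is
# why the heads return a batch of 1 when num_segs equals the input batch size.
def _segment_consensus_sketch():
    num_segs, num_classes = 8, 4
    feat = torch.rand(8, 2048, 7, 7)  # 1 clip x 8 segments
    pooled = nn.AdaptiveAvgPool2d((1, 1))(feat).flatten(1)  # (8, 2048)
    seg_scores = nn.Linear(2048, num_classes)(pooled)  # (8, 4)
    clip_scores = seg_scores.view(-1, num_segs, num_classes).mean(dim=1)
    assert clip_scores.shape == torch.Size([1, num_classes])
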
def test_tsm_head():
    """Test loss method, layer construction, attributes and forward function
    in tsm head."""
    tsm_head = TSMHead(num_classes=4, in_channels=2048)
    tsm_head.init_weights()

    assert tsm_head.num_classes == 4
    assert tsm_head.dropout_ratio == 0.8
    assert tsm_head.in_channels == 2048
    assert tsm_head.init_std == 0.001
    assert tsm_head.consensus.dim == 1
    assert tsm_head.spatial_type == 'avg'

    assert isinstance(tsm_head.dropout, nn.Dropout)
    assert tsm_head.dropout.p == tsm_head.dropout_ratio

    assert isinstance(tsm_head.fc_cls, nn.Linear)
    assert tsm_head.fc_cls.in_features == tsm_head.in_channels
    assert tsm_head.fc_cls.out_features == tsm_head.num_classes

    assert isinstance(tsm_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert tsm_head.avg_pool.output_size == 1

    input_shape = (8, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # tsm head inference with no init
    num_segs = input_shape[0]
    cls_scores = tsm_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])

    # tsm head inference with init
    tsm_head = TSMHead(num_classes=4, in_channels=2048, temporal_pool=True)
    tsm_head.init_weights()
    cls_scores = tsm_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([2, 4])


def test_trn_head():
    """Test loss method, layer construction, attributes and forward function
    in trn head."""
    from mmaction.models.heads.trn_head import (RelationModule,
                                                RelationModuleMultiScale)
    trn_head = TRNHead(num_classes=4, in_channels=2048, relation_type='TRN')
    trn_head.init_weights()

    assert trn_head.num_classes == 4
    assert trn_head.dropout_ratio == 0.8
    assert trn_head.in_channels == 2048
    assert trn_head.init_std == 0.001
    assert trn_head.spatial_type == 'avg'

    relation_module = trn_head.consensus
    assert isinstance(relation_module, RelationModule)
    assert relation_module.hidden_dim == 256
    assert isinstance(relation_module.classifier[3], nn.Linear)
    assert relation_module.classifier[3].out_features == trn_head.num_classes

    assert trn_head.dropout.p == trn_head.dropout_ratio
    assert isinstance(trn_head.dropout, nn.Dropout)

    assert isinstance(trn_head.fc_cls, nn.Linear)
    assert trn_head.fc_cls.in_features == trn_head.in_channels
    assert trn_head.fc_cls.out_features == trn_head.hidden_dim

    assert isinstance(trn_head.avg_pool, nn.AdaptiveAvgPool2d)
    assert trn_head.avg_pool.output_size == 1

    input_shape = (8, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # trn head inference with no init
    num_segs = input_shape[0]
    cls_scores = trn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])

    # trn head inference with init
    trn_head = TRNHead(
        num_classes=4,
        in_channels=2048,
        num_segments=8,
        relation_type='TRNMultiScale')
    trn_head.init_weights()
    assert isinstance(trn_head.consensus, RelationModuleMultiScale)
    assert trn_head.consensus.scales == range(8, 1, -1)
    cls_scores = trn_head(feat, num_segs)
    assert cls_scores.shape == torch.Size([1, 4])

    with pytest.raises(ValueError):
        trn_head = TRNHead(
            num_classes=4,
            in_channels=2048,
            num_segments=8,
            relation_type='RelationModlue')
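
# Hedged sketch (not collected by pytest) of the multi-scale relation check in
# test_trn_head above (trn_head.consensus.scales == range(8, 1, -1)): with 8
# segments, relations are formed over subsets of 8, 7, ..., 2 frames. This is
# illustrative only; the real RelationModuleMultiScale also subsamples the
# frame combinations and learns a separate MLP per scale.
def _trn_scales_sketch():
    from itertools import combinations
    num_segments = 8
    scales = range(num_segments, 1, -1)
    n_subsets = [
        len(list(combinations(range(num_segments), k))) for k in scales
    ]
    assert list(scales) == [8, 7, 6, 5, 4, 3, 2]
    assert n_subsets[0] == 1  # only one relation uses all 8 frames
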
def test_timesformer_head():
    """Test loss method, layer construction, attributes and forward function
    in timesformer head."""
    timesformer_head = TimeSformerHead(num_classes=4, in_channels=64)
    timesformer_head.init_weights()

    assert timesformer_head.num_classes == 4
    assert timesformer_head.in_channels == 64
    assert timesformer_head.init_std == 0.02

    input_shape = (2, 64)
    feat = torch.rand(input_shape)

    cls_scores = timesformer_head(feat)
    assert cls_scores.shape == torch.Size([2, 4])


@patch.object(mmaction.models.LFBInferHead, '__del__', Mock)
def test_lfb_infer_head():
    """Test layer construction, attributes and forward function in lfb infer
    head."""
    with tempfile.TemporaryDirectory() as tmpdir:
        lfb_infer_head = LFBInferHead(
            lfb_prefix_path=tmpdir, use_half_precision=True)
        lfb_infer_head.init_weights()

        st_feat_shape = (3, 16, 1, 8, 8)
        st_feat = generate_backbone_demo_inputs(st_feat_shape)
        rois = torch.cat(
            (torch.tensor([0, 1, 0]).float().view(3, 1), torch.randn(3, 4)),
            dim=1)
        img_metas = [
            dict(img_key='video_1,777'),
            dict(img_key='video_2, 888')
        ]
        result = lfb_infer_head(st_feat, rois, img_metas)
        assert st_feat.equal(result)
        assert len(lfb_infer_head.all_features) == 3
        assert lfb_infer_head.all_features[0].shape == (16, 1, 1, 1)


def test_fbo_head():
    """Test layer construction, attributes and forward function in fbo
    head."""
    lfb_prefix_path = osp.normpath(
        osp.join(osp.dirname(__file__), '../data/lfb'))

    st_feat_shape = (1, 16, 1, 8, 8)
    st_feat = generate_backbone_demo_inputs(st_feat_shape)
    rois = torch.randn(1, 5)
    rois[0][0] = 0
    img_metas = [dict(img_key='video_1, 930')]

    # non local fbo
    fbo_head = FBOHead(
        lfb_cfg=dict(
            lfb_prefix_path=lfb_prefix_path,
            max_num_sampled_feat=5,
            window_size=60,
            lfb_channels=16,
            dataset_modes=('unittest'),
            device='cpu'),
        fbo_cfg=dict(
            type='non_local',
            st_feat_channels=16,
            lt_feat_channels=16,
            latent_channels=8,
            num_st_feat=1,
            num_lt_feat=5 * 60,
        ))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 24, 1, 1, 1)

    # avg fbo
    fbo_head = FBOHead(
        lfb_cfg=dict(
            lfb_prefix_path=lfb_prefix_path,
            max_num_sampled_feat=5,
            window_size=60,
            lfb_channels=16,
            dataset_modes=('unittest'),
            device='cpu'),
        fbo_cfg=dict(type='avg'))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 32, 1, 1, 1)

    # max fbo
    fbo_head = FBOHead(
        lfb_cfg=dict(
            lfb_prefix_path=lfb_prefix_path,
            max_num_sampled_feat=5,
            window_size=60,
            lfb_channels=16,
            dataset_modes=('unittest'),
            device='cpu'),
        fbo_cfg=dict(type='max'))
    fbo_head.init_weights()
    out = fbo_head(st_feat, rois, img_metas)
    assert out.shape == (1, 32, 1, 1, 1)


def test_tpn_head():
    """Test loss method, layer construction, attributes and forward function
    in tpn head."""
    tpn_head = TPNHead(num_classes=4, in_channels=2048)
    tpn_head.init_weights()

    assert hasattr(tpn_head, 'avg_pool2d')
    assert hasattr(tpn_head, 'avg_pool3d')
    assert isinstance(tpn_head.avg_pool3d, nn.AdaptiveAvgPool3d)
    assert tpn_head.avg_pool3d.output_size == (1, 1, 1)
    assert tpn_head.avg_pool2d is None

    input_shape = (4, 2048, 7, 7)
    feat = torch.rand(input_shape)

    # tpn head inference with num_segs
    num_segs = 2
    cls_scores = tpn_head(feat, num_segs)
    assert isinstance(tpn_head.avg_pool2d, nn.AvgPool3d)
    assert tpn_head.avg_pool2d.kernel_size == (1, 7, 7)
    assert cls_scores.shape == torch.Size([2, 4])

    # tpn head inference with no num_segs
    input_shape = (2, 2048, 3, 7, 7)
    feat = torch.rand(input_shape)
    cls_scores = tpn_head(feat)
    assert isinstance(tpn_head.avg_pool2d, nn.AvgPool3d)
    assert tpn_head.avg_pool2d.kernel_size == (1, 7, 7)
    assert cls_scores.shape == torch.Size([2, 4])
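
# Hedged note (sketch only, not collected by pytest) on the RoI layout used by
# test_acrn_head below and by the AVA bbox test earlier: each row is
# (batch_index, x1, y1, x2, y2), so the first column maps every box onto one
# sample of the batched feature map.
def _roi_batch_index_sketch():
    rois = torch.tensor([[0., 1., 1., 5., 5.],
                         [1., 2., 2., 6., 6.]])
    batch_idx = rois[:, 0].long()  # which feature-map sample each box targets
    assert batch_idx.tolist() == [0, 1]
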
def test_acrn_head():
    """Test layer construction and forward function in acrn head."""
    roi_feat = torch.randn(4, 16, 1, 7, 7)
    feat = torch.randn(2, 16, 1, 16, 16)
    rois = torch.Tensor([[0, 2.2268, 0.5926, 10.6142, 8.0029],
                         [0, 2.2577, 0.1519, 11.6451, 8.9282],
                         [1, 1.9874, 1.0000, 11.1585, 8.2840],
                         [1, 3.3338, 3.7166, 8.4174, 11.2785]])

    acrn_head = ACRNHead(32, 16)
    acrn_head.init_weights()
    new_feat = acrn_head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 16, 16)

    acrn_head = ACRNHead(32, 16, stride=2)
    new_feat = acrn_head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 8, 8)

    acrn_head = ACRNHead(32, 16, stride=2, num_convs=2)
    new_feat = acrn_head(roi_feat, feat, rois)
    assert new_feat.shape == (4, 16, 1, 8, 8)


def test_stgcn_head():
    """Test loss method, layer construction, attributes and forward function
    in stgcn head."""
    with pytest.raises(NotImplementedError):
        # spatial_type not in ['avg', 'max']
        stgcn_head = STGCNHead(
            num_classes=60, in_channels=256, spatial_type='min')
        stgcn_head.init_weights()

    # spatial_type='avg'
    stgcn_head = STGCNHead(
        num_classes=60, in_channels=256, spatial_type='avg')
    stgcn_head.init_weights()

    assert stgcn_head.num_classes == 60
    assert stgcn_head.in_channels == 256

    input_shape = (2, 256, 75, 17)
    feat = torch.rand(input_shape)

    cls_scores = stgcn_head(feat)
    assert cls_scores.shape == torch.Size([1, 60])

    # spatial_type='max'
    stgcn_head = STGCNHead(
        num_classes=60, in_channels=256, spatial_type='max')
    stgcn_head.init_weights()

    assert stgcn_head.num_classes == 60
    assert stgcn_head.in_channels == 256

    input_shape = (2, 256, 75, 17)
    feat = torch.rand(input_shape)

    cls_scores = stgcn_head(feat)
    assert cls_scores.shape == torch.Size([1, 60])
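
# Hedged sketch (plain torch, not the STGCNHead implementation, not collected
# by pytest) of why test_stgcn_head above maps a (2, 256, 75, 17) input to a
# (1, 60) score: assuming the default of 2 person instances per sample, the
# leading axis is N * M, so the head pools over frames/joints and then
# averages the person instances of each sample before classification.
def _stgcn_person_pool_sketch():
    num_person, num_classes = 2, 60
    feat = torch.rand(2, 256, 75, 17)  # (N * M, C, T, V) with N=1, M=2
    pooled = feat.mean(dim=(2, 3))  # (N * M, C)
    per_sample = pooled.view(-1, num_person, 256).mean(dim=1)  # (N, C)
    scores = nn.Linear(256, num_classes)(per_sample)
    assert scores.shape == torch.Size([1, num_classes])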