Commit 00af501a authored by zhe chen's avatar zhe chen
Browse files

Release detection models

parent 0dec0215
...@@ -112,8 +112,12 @@ Prepare datasets according to the guidelines in [MMDetection v2.28.1](https://gi ...@@ -112,8 +112,12 @@ Prepare datasets according to the guidelines in [MMDetection v2.28.1](https://gi
| :--------: | :--------------: | :--: | :-----: | :----: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | :--------: | :--------------: | :--: | :-----: | :----: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| DINO | InternImage-T | 1x | 53.9 | 49M | [config](./configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) | | DINO | InternImage-T | 1x | 53.9 | 49M | [config](./configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| DINO | InternImage-L | 1x | 57.6 | 241M | [config](./configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) | | DINO | InternImage-L | 1x | 57.6 | 241M | [config](./configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
| DINO | CB-InternImage-H | 1x | 64.5 | 2.18B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) | | DINO | InternImage-H | 1x | 63.4 | 1.1B | [config](./configs/coco/dino_4scale_internimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_h_objects365_coco.pth) |
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.18B | - | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) | | DINO | CB-InternImage-H | 1x | 64.5 | 2.2B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.2B | TODO | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
| DINO | InternImage-G | 1x | 64.2 | 3.1B | [config](./configs/coco/dino_4scale_internimage_g_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_g_objects365_coco.pth) |
| DINO | CB-InternImage-G | 1x | 65.1 | 6B | TODO | TODO |
| DINO (TTA) | CB-InternImage-G | 1x | 65.3 | 6B | TODO | TODO |
</div> </div>
......
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [ _base_ = [
'../_base_/datasets/coco_detection.py', '../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py' '../_base_/default_runtime.py'
...@@ -122,7 +127,7 @@ model = dict( ...@@ -122,7 +127,7 @@ model = dict(
snip_cfg=dict( snip_cfg=dict(
type='v3', type='v3',
weight=0.1)), weight=0.1)),
test_cfg=dict(max_per_img=300)) # TODO: Originally 100 test_cfg=dict(max_per_img=300))
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
......
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py'
]
# DINO detector with an InternImage backbone.  The backbone widths here
# (channels=512 -> stage outputs 1024/2048/4096, matching the neck's
# in_channels) correspond to the largest released scale — presumably
# InternImage-G; confirm against the released config name.
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',  # deformable convolution v3 as the core operator
        channels=512,
        depths=[2, 2, 48, 4],
        groups=[16, 32, 64, 128],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=True,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=False,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29, 35, 41, 47],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,  # activation checkpointing to reduce memory
        out_indices=(1, 2, 3),  # feed the last three stages to the neck
        init_cfg=None  # dict(type='Pretrained', checkpoint=pretrained)
    ),
    neck=dict(
        type='ChannelMapper',
        # Channel counts of backbone stages 1/2/3 (512 * 2, * 4, * 8).
        in_channels=[1024, 2048, 4096],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,  # COCO
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        # Contrastive denoising query settings (DINO's CDN branch).
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    # Split FFN with activation checkpointing (see
                    # efficient_ffn.py in this release).
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))  # keep up to 300 detections per image
# Normalisation statistics (ImageNet mean/std); images are converted to RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# NOTE: img_scale and the Pad size_divisor differ from mmdet's defaults.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    # Multi-scale training in 'range' mode between (2000, 600) and
    # (2000, 1800), keeping the aspect ratio.
    {'type': 'Resize',
     'img_scale': [(2000, 600), (2000, 1800)],
     'multiscale_mode': 'range',
     'keep_ratio': True},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Single-scale testing at (2000, 1000) without flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2000, 1000),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]
# One image per GPU; training samples without GT boxes are filtered out.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {'filter_empty_gt': True, 'pipeline': train_pipeline},
    'val': {'pipeline': test_pipeline},
    'test': {'pipeline': test_pipeline},
}
# Optimizer: AdamW with layer-wise LR decay over the 56 backbone layers
# (2 + 2 + 48 + 4) and a scaled-down LR for the DCN offset parameters.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 56,
        'layer_decay_rate': 0.94,
        'depths': [2, 2, 48, 4],
        'offset_lr_scale': 1e-3,
    },
}
# Gradient clipping for DETR-style training stability.
optimizer_config = {'grad_clip': {'max_norm': 0.1, 'norm_type': 2}}
# learning policy: step schedule with linear warmup; no decay steps listed.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [],
}
runner = {'type': 'IterBasedRunner', 'max_iters': 20000}
checkpoint_config = {'interval': 200, 'max_keep_ckpts': 3}
evaluation = {'interval': 200, 'save_best': 'auto'}
# resume_from = None
# custom_hooks = [
# dict(
# type='ExpMomentumEMAHook',
# resume_from=resume_from,
# momentum=0.0003,
# priority=49)
# ]
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py'
]
# DINO detector with an InternImage backbone.  The backbone widths here
# (channels=320 -> stage outputs 640/1280/2560, matching the neck's
# in_channels) correspond to a larger H-style scale — confirm against the
# released config name.
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',  # deformable convolution v3 as the core operator
        channels=320,
        depths=[6, 6, 32, 6],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=False,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=True,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,  # activation checkpointing to reduce memory
        out_indices=(1, 2, 3),  # feed the last three stages to the neck
        init_cfg=None  # dict(type='Pretrained', checkpoint=pretrained)
    ),
    neck=dict(
        type='ChannelMapper',
        # Channel counts of backbone stages 1/2/3 (320 * 2, * 4, * 8).
        in_channels=[640, 1280, 2560],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,  # COCO
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        # Contrastive denoising query settings (DINO's CDN branch).
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    # NOTE(review): plain 'FFN' here (vs 'EfficientFFN' in the
                    # sibling config) may silently ignore use_checkpoint —
                    # confirm which FFN class is intended.
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))  # keep up to 300 detections per image
# Normalisation statistics (ImageNet mean/std); images are converted to RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# NOTE: img_scale and the Pad size_divisor differ from mmdet's defaults.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    # Multi-scale training in 'range' mode between (2000, 600) and
    # (2000, 1800), keeping the aspect ratio.
    {'type': 'Resize',
     'img_scale': [(2000, 600), (2000, 1800)],
     'multiscale_mode': 'range',
     'keep_ratio': True},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Single-scale testing at (2000, 1000) without flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2000, 1000),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]
# One image per GPU; training samples without GT boxes are filtered out.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {'filter_empty_gt': True, 'pipeline': train_pipeline},
    'val': {'pipeline': test_pipeline},
    'test': {'pipeline': test_pipeline},
}
# Optimizer: AdamW with layer-wise LR decay over the 50 backbone layers
# (6 + 6 + 32 + 6) and a scaled-down LR for the DCN offset parameters.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 50,
        'layer_decay_rate': 0.94,
        'depths': [6, 6, 32, 6],
        'offset_lr_scale': 1e-3,
    },
}
# Gradient clipping for DETR-style training stability.
optimizer_config = {'grad_clip': {'max_norm': 0.1, 'norm_type': 2}}
# learning policy: step schedule with linear warmup; no decay steps listed.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [],
}
runner = {'type': 'IterBasedRunner', 'max_iters': 20000}
checkpoint_config = {'interval': 200, 'max_keep_ckpts': 3}
evaluation = {'interval': 200, 'save_best': 'auto'}
# resume_from = None
# custom_hooks = [
# dict(
# type='ExpMomentumEMAHook',
# resume_from=resume_from,
# momentum=0.0003,
# priority=49)
# ]
...@@ -4,11 +4,14 @@ ...@@ -4,11 +4,14 @@
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
import torch
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .custom_layer_decay_optimizer_constructor import \ from .custom_layer_decay_optimizer_constructor import \
CustomLayerDecayOptimizerConstructor CustomLayerDecayOptimizerConstructor
from .efficient_ffn import EfficientFFN
__all__ = ['CustomLayerDecayOptimizerConstructor'] __all__ = ['CustomLayerDecayOptimizerConstructor', 'EfficientFFN']
if torch.__version__.startswith('1.11'): if torch.__version__.startswith('1.11'):
......
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch.nn.init import normal_

from mmcv.cnn import (build_activation_layer, build_conv_layer,
                      build_norm_layer, xavier_init)
from mmcv.cnn.bricks.drop import build_dropout
from mmcv.cnn.bricks.registry import (FEEDFORWARD_NETWORK, TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                         TransformerLayerSequence,
                                         build_transformer_layer_sequence)
from mmcv.runner import force_fp32
from mmcv.runner.base_module import BaseModule
from mmcv.utils import deprecated_api_warning, to_2tuple

from mmdet.models.utils.builder import TRANSFORMER
@FEEDFORWARD_NETWORK.register_module()
class EfficientFFN(BaseModule):
    """Memory-efficient FFN whose hidden layer is split into parallel branches.

    A vanilla FFN projects ``embed_dims -> feedforward_channels ->
    embed_dims``. Here the hidden dimension is partitioned into ``split``
    branches of width ``feedforward_channels // split``: each branch has its
    own input projection ``fc1_i`` (with bias) and bias-free output
    projection ``fc2_i``; the branch outputs are summed and a single shared
    output bias ``fc2_bias`` is added, which is the same parameterization as
    the unsplit FFN. With ``use_checkpoint=True`` each branch is recomputed
    during backward via activation checkpointing, so only one branch's
    activations need to be held in memory at a time.

    Args:
        embed_dims (int): Input/output feature dimension. Default: 256.
        feedforward_channels (int): Total hidden dimension summed over all
            branches; should be divisible by ``split``. Default: 1024.
        num_fcs (int): Kept for API compatibility with mmcv's ``FFN``; must
            be at least 2 (two linear layers per branch). Default: 2.
        act_cfg (dict): Config of the hidden activation layer.
        ffn_drop (float): Dropout rate applied after the activation and after
            each branch's output projection. Default: 0.
        dropout_layer (dict | None): Config of the dropout/drop-path applied
            to the summed output; ``None`` means identity.
        add_identity (bool): Whether to add the residual identity to the
            output. Default: True.
        init_cfg (dict | None): Initialization config for ``BaseModule``.
        split (int): Number of parallel hidden branches. Default: 4.
        use_checkpoint (bool): Wrap each branch in
            ``torch.utils.checkpoint.checkpoint``. Default: False.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='EfficientFFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 split=4,
                 use_checkpoint=False,
                 **kwargs):
        super(EfficientFFN, self).__init__(init_cfg)
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        self.activate = build_activation_layer(act_cfg)
        self.drop = nn.Dropout(ffn_drop)
        self.use_checkpoint = use_checkpoint
        self.split = split
        # Per-branch input projections: embed_dims -> hidden // split.
        for i in range(split):
            fc1 = nn.Linear(embed_dims, feedforward_channels // self.split,
                            bias=True)
            setattr(self, f'fc1_{i}', fc1)
        # Per-branch output projections are bias-free; the shared fc2_bias
        # below plays the role of the single output bias of an unsplit FFN.
        for i in range(split):
            fc2 = nn.Linear(feedforward_channels // self.split, embed_dims,
                            bias=False)
            setattr(self, f'fc2_{i}', fc2)
        self.fc2_bias = nn.Parameter(torch.zeros(
            (embed_dims)), requires_grad=True)
        # NOTE(review): fc2_bias is zero-initialized; the fan-in uniform init
        # from the original release is kept for reference:
        # fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.fc2_0.weight)
        # bound = 1 / math.sqrt(fan_in)
        # torch.nn.init.uniform_(self.fc2_bias, -bound, bound)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else torch.nn.Identity()
        self.add_identity = add_identity

    # Fixed: the deprecation warning previously reported cls_name='FFN',
    # which mislabeled warnings emitted for this class.
    @deprecated_api_warning({'residual': 'identity'}, cls_name='EfficientFFN')
    def forward(self, x, identity=None):
        """Run the split FFN.

        Args:
            x (Tensor): Input features of shape ``(..., embed_dims)``.
            identity (Tensor, optional): Residual tensor; defaults to ``x``
                when ``add_identity`` is set.

        Returns:
            Tensor: Output features of shape ``(..., embed_dims)``.
        """

        def _inner_forward(x, i):
            # One branch: fc1_i -> activation -> dropout -> fc2_i -> dropout.
            fc1 = getattr(self, f'fc1_{i}')
            x = fc1(x)
            x = self.activate(x)
            x = self.drop(x)
            fc2 = getattr(self, f'fc2_{i}')
            x = fc2(x)
            x = self.drop(x)
            return x

        out = 0
        for i in range(self.split):
            # checkpoint() only helps (and only works) when grads are needed.
            if self.use_checkpoint and x.requires_grad:
                out = out + checkpoint.checkpoint(_inner_forward, x, i)
            else:
                out = out + _inner_forward(x, i)
        out = out + self.fc2_bias
        if not self.add_identity:
            return self.dropout_layer(out)
        if identity is None:
            identity = x
        return identity + self.dropout_layer(out)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment