Commit 41b18fd8 authored by zhe chen's avatar zhe chen

Use pre-commit to reformat code
parent ff20ea39
......@@ -4,16 +4,14 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import DCNv3
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
import DCNv3
class DCNv3Function(Function):
......@@ -88,6 +86,7 @@ class DCNv3Function(Function):
im2col_step_i=int(im2col_step),
)
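An aside on the `custom_fwd`/`custom_bwd` imports used by this autograd `Function`: they keep a custom op numerically safe under `torch.cuda.amp` autocast. A minimal sketch of the pattern (a toy op for illustration, not the actual DCNv3 kernel):

```python
import torch
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd


class ScaleBy2(Function):
    """Toy op showing the AMP-safe custom autograd Function pattern."""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)  # run in fp32 inside autocast regions
    def forward(ctx, x):
        return x * 2

    @staticmethod
    @custom_bwd  # backward runs under the same autocast state as forward
    def backward(ctx, grad_output):
        return grad_output * 2
```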
def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
_, H_, W_, _ = spatial_shapes
H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
......
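The `H_out` line above is the standard dilated-convolution output-size formula (effective kernel size `dilation * (kernel - 1) + 1`); a quick numeric check with hypothetical values:

```python
# Hypothetical sizes: 16-pixel input, 3x3 kernel, dilation 2, stride 1, no padding.
H_, kernel_h, dilation_h, stride_h = 16, 3, 2, 1
effective_k = dilation_h * (kernel_h - 1) + 1      # 5
H_out = (H_ - effective_k) // stride_h + 1         # (16 - 5) // 1 + 1 = 12
assert H_out == 12
```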
......@@ -4,4 +4,4 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3 import DCNv3, DCNv3_pytorch
\ No newline at end of file
from .dcnv3 import DCNv3, DCNv3_pytorch
......@@ -4,22 +4,24 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import warnings
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from torch import nn
from torch.nn.init import constant_, xavier_uniform_
from ..functions import DCNv3Function, dcnv3_core_pytorch
try:
from DCNv4.functions import DCNv4Function
except ImportError:
    warnings.warn('DCNv4 is not available; falling back to the DCNv3 operator.')
import math
class to_channels_first(nn.Module):
def __init__(self):
......@@ -76,7 +78,7 @@ def build_act_layer(act_layer):
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
"invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
'invalid input for _is_power_of_2: {} (type: {})'.format(n, type(n)))
return (n & (n - 1) == 0) and n != 0
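The `n & (n - 1)` test works because a positive power of two has exactly one set bit; subtracting one clears that bit and sets all lower bits, so the AND is zero only for powers of two (the `n != 0` guard excludes zero). For example:

```python
# 8 -> 0b1000, 7 -> 0b0111, 8 & 7 == 0           -> power of two
# 6 -> 0b0110, 5 -> 0b0101, 6 & 5 == 0b0100 != 0 -> not a power of two
for n in (1, 2, 6, 8, 12, 16):
    print(n, (n & (n - 1) == 0) and n != 0)
# 1 True, 2 True, 6 False, 8 True, 12 False, 16 True
```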
......@@ -128,7 +130,7 @@ class DCNv3_pytorch(nn.Module):
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
......@@ -165,7 +167,7 @@ class DCNv3_pytorch(nn.Module):
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
......@@ -234,7 +236,7 @@ class DCNv3(nn.Module):
norm_layer='LN',
center_feature_scale=False,
use_dcn_v4_op=False,
):
):
"""
DCNv3 Module
:param channels
......@@ -257,7 +259,7 @@ class DCNv3(nn.Module):
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
......@@ -270,7 +272,7 @@ class DCNv3(nn.Module):
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.use_dcn_v4_op = use_dcn_v4_op
self.dw_conv = nn.Sequential(
......@@ -296,7 +298,7 @@ class DCNv3(nn.Module):
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
......@@ -329,7 +331,7 @@ class DCNv3(nn.Module):
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
if not self.use_dcn_v4_op:
mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)
x = DCNv3Function.apply(
......@@ -349,12 +351,12 @@ class DCNv3(nn.Module):
mask = mask.view(N, H, W, self.group, -1)
offset_mask = torch.cat([offset, mask], -1).view(N, H, W, -1).contiguous()
# For efficiency, the last dimension of the offset_mask tensor in dcnv4 is a multiple of 8.
K3 = offset_mask.size(-1)
K3_pad = int(math.ceil(K3/8)*8)
K3_pad = int(math.ceil(K3 / 8) * 8)
pad_dim = K3_pad - K3
offset_mask = torch.cat([offset_mask, offset_mask.new_zeros([*offset_mask.size()[:3], pad_dim])], -1)
x = DCNv4Function.apply(
x, offset_mask,
self.kernel_size, self.kernel_size,
......
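The padding logic above rounds the last dimension of `offset_mask` up to the next multiple of 8, as the preceding comment requires for the DCNv4 kernel. A standalone check of the arithmetic with hypothetical shapes:

```python
import math

import torch

# Hypothetical: group=4, 3x3 kernel -> (2 offsets + 1 mask) per sampling point.
K3 = 4 * 3 * 3 * 3                      # 108
K3_pad = int(math.ceil(K3 / 8) * 8)     # 112
pad_dim = K3_pad - K3                   # 4
offset_mask = torch.randn(2, 8, 8, K3)
offset_mask = torch.cat(
    [offset_mask, offset_mask.new_zeros(2, 8, 8, pad_dim)], -1)
assert offset_mask.size(-1) % 8 == 0    # padded to 112
```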
......@@ -4,39 +4,34 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import glob
import os
import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
requirements = ['torch', 'torchvision']
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
extensions_dir = os.path.join(this_dir, 'src')
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp'))
source_cpu = glob.glob(os.path.join(extensions_dir, 'cpu', '*.cpp'))
source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
extra_compile_args = {'cxx': []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
define_macros += [('WITH_CUDA', None)]
extra_compile_args['nvcc'] = [
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
......@@ -49,7 +44,7 @@ def get_extensions():
include_dirs = [extensions_dir]
ext_modules = [
extension(
"DCNv3",
'DCNv3',
sources,
include_dirs=include_dirs,
define_macros=define_macros,
......@@ -60,16 +55,16 @@ def get_extensions():
setup(
name="DCNv3",
version="1.0",
author="InternImage",
url="https://github.com/OpenGVLab/InternImage",
name='DCNv3',
version='1.0',
author='InternImage',
url='https://github.com/OpenGVLab/InternImage',
description=
"PyTorch Wrapper for CUDA Functions of DCNv3",
'PyTorch Wrapper for CUDA Functions of DCNv3',
packages=find_packages(exclude=(
"configs",
"tests",
'configs',
'tests',
)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
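Once the extension is built (the repo's `make.sh` runs this `setup.py`), importing it is a quick sanity check; a minimal sketch, assuming the build succeeded:

```python
# Hypothetical smoke test for the compiled extension.
import torch  # import torch first so the extension can find libtorch symbols

import DCNv3  # the CppExtension/CUDAExtension defined above

# The module should expose the forward/backward bindings compiled from src/.
print([name for name in dir(DCNv3) if not name.startswith('_')])
```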
......@@ -171,4 +171,4 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
} else {
return {grad_input, grad_offset, grad_mask};
}
}
\ No newline at end of file
}
......@@ -1042,4 +1042,4 @@ void dcnv3_col2im_cuda(
if (err != cudaSuccess) {
printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
\ No newline at end of file
}
......@@ -4,17 +4,15 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import math
import time
import torch
import torch.nn as nn
import math
from torch.autograd import gradcheck
from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
from torch.autograd import gradcheck
H_in, W_in = 8, 8
N, M, D = 2, 4, 16
......
......@@ -22,4 +22,4 @@ srun -p ${PARTITION} \
--kill-on-bad-exit=1 \
--quotatype=spot \
${SRUN_ARGS} \
python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
\ No newline at end of file
python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
......@@ -11,6 +11,8 @@ import time
import warnings
import mmcv
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
......@@ -21,8 +23,6 @@ from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
import mmdet_custom # noqa: F401,F403
import mmcv_custom # noqa: F401,F403
def parse_args():
......
import argparse
import concurrent.futures
import json
import os
import pickle as pkl
import numpy as np
import random
from PIL import Image
import concurrent.futures
import json
import mmcv
import numpy as np
from PIL import Image
def parse_args():
parser = argparse.ArgumentParser(description='Generate MMDetection Annotations for Crowdhuman-like dataset')
......@@ -16,6 +18,7 @@ def parse_args():
args = parser.parse_args()
return args.dataset, args.dataset_split
def load_func(fpath):
assert os.path.exists(fpath)
with open(fpath, 'r') as fid:
......@@ -23,6 +26,7 @@ def load_func(fpath):
records = [json.loads(line.strip('\n')) for line in lines]
return records
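For context, each line of a CrowdHuman-style `.odgt` file is a self-contained JSON object, which is why `load_func` can parse it line by line. A hypothetical record (field names follow the CrowdHuman convention; values are made up):

```python
import json

# One .odgt line: an image ID plus its ground-truth boxes.
line = json.dumps({
    'ID': 'some_image_id',
    'gtboxes': [
        {'tag': 'person',
         'fbox': [72, 202, 163, 503],   # full-body box (x, y, w, h)
         'vbox': [72, 202, 163, 398],   # visible-region box
         'hbox': [171, 208, 62, 83]},   # head box
    ],
})
record = json.loads(line.strip('\n'))
print(record['gtboxes'][0]['tag'])  # person
```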
def decode_annotations(records, dataset_path):
rec_ids = list(range(len(records)))
img_list = []
......@@ -80,16 +84,17 @@ def decode_annotations(records, dataset_path):
)
return json_dict
if __name__ == "__main__":
if __name__ == '__main__':
dataset_name, dataset_type = parse_args()
dataset_path = 'data/%s/' % dataset_name
ch_file_path = dataset_path + 'annotations/annotation_%s.odgt' % dataset_type
json_file_path = dataset_path + 'annotations/annotation_%s.json' % dataset_type
records = load_func(ch_file_path)
print("Loading Annotations Done")
print('Loading Annotations Done')
json_dict = decode_annotations(records, dataset_path)
print("Parsing Bbox Number: %d" % len(json_dict['annotations']))
print('Parsing Bbox Number: %d' % len(json_dict['annotations']))
mmcv.dump(json_dict, json_file_path)
from .compute_APMR import compute_APMR
from .compute_JI import compute_JI_with_ignore
\ No newline at end of file
from .compute_JI import compute_JI_with_ignore
......@@ -12,12 +12,13 @@ import time
import warnings
import mmcv
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcv.utils import get_git_hash
from mmdet import __version__
from mmdet.apis import init_random_seed, set_random_seed, train_detector
from mmdet.datasets import build_dataset
......@@ -25,8 +26,6 @@ from mmdet.models import build_detector
from mmdet.utils import (collect_env, get_device, get_root_logger,
replace_cfg_vals, setup_multi_processes,
update_data_root)
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
def parse_args():
......@@ -244,4 +243,4 @@ def main():
if __name__ == '__main__':
main()
\ No newline at end of file
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import pickle
import shutil
import tempfile
import time
import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
import mmcv
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
def prompt_sam_with_bboxes(sam_predictor, data, box_result):
# process detector prediction
# (x1, y1, x2, y2), rescaled in original image space
bboxes = np.concatenate(box_result, axis=0)[..., :4]
if len(bboxes) == 0:
return [[] for _ in range(len(box_result))]
labels = np.concatenate([[i] * len(boxes) for i, boxes in enumerate(box_result)])
# prepare shapes
img_metas = data['img_metas'][0].data[0][0]
original_size = img_metas['ori_shape'][:2]
# prepare input img of sam
sam_predictor.reset_image()
# the image has already been normalized (NOTE: mmdet 2.x normalizes images in the pipeline)
img = data['img'][0].to(sam_predictor.model.device)
# resize the longer side to 1024, keeping aspect ratio (ViT image encoder constraint)
target_size = sam_predictor.transform.get_preprocess_shape(
img.shape[2], img.shape[3],
sam_predictor.transform.target_length)
try:
# `antialias=True` is provided in official implementation of SAM,
# which may raise TypeError in PyTorch of previous versions.
transformed_img = F.interpolate(
img, target_size, mode="bilinear",
align_corners=False, antialias=True)
except TypeError:
transformed_img = F.interpolate(
img, target_size, mode="bilinear", align_corners=False)
# Pad to 1024 x 1024
h, w = transformed_img.shape[-2:]
pad_h = sam_predictor.model.image_encoder.img_size - h
pad_w = sam_predictor.model.image_encoder.img_size - w
transformed_img = F.pad(transformed_img, (0, pad_w, 0, pad_h))
# extract img feature
sam_predictor.features = sam_predictor.model.image_encoder(
transformed_img).to(sam_predictor.model.device)
# set attributes
sam_predictor.original_size = original_size
sam_predictor.input_size = tuple(transformed_img.shape[-2:])
sam_predictor.is_image_set = True
# prepare bboxes and rescale bboxes to relative coordinates
bboxes_tensor = torch.from_numpy(bboxes).to(sam_predictor.model.device)
transformed_boxes = sam_predictor.transform.apply_boxes_torch(bboxes_tensor, original_size)
# prompt with bboxes
batch_masks, _, _ = sam_predictor.predict_torch(
point_coords=None,
point_labels=None,
boxes=transformed_boxes,
multimask_output=False)
batch_masks = batch_masks.squeeze(1).cpu().numpy()
mask_results = [[*batch_masks[labels == i]] for i in range(len(box_result))]
return mask_results
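The `get_preprocess_shape` call above scales the longer image side to SAM's `target_length` (1024 for the ViT encoder) while keeping the aspect ratio; the remainder is then zero-padded to a square. A standalone re-derivation of that arithmetic, assuming the rounding used by SAM's resize transform:

```python
def preprocess_shape(oldh: int, oldw: int, long_side: int = 1024):
    """Scale so max(h, w) == long_side, preserving aspect ratio."""
    scale = long_side / max(oldh, oldw)
    return int(oldh * scale + 0.5), int(oldw * scale + 0.5)

h, w = preprocess_shape(800, 1333)  # a typical detector input size
print(h, w)                         # 615 1024
print(1024 - h, 1024 - w)           # pad_h = 409, pad_w = 0
```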
def single_gpu_test(model,
sam_predictor,
data_loader,
show=False,
out_dir=None,
show_score_thr=0.3):
model.eval()
results = []
dataset = data_loader.dataset
PALETTE = getattr(dataset, 'PALETTE', None)
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
# For an instance segmentor, only the box results are used in the
# second stage (prompting SAM with boxes). NOTE the mask head is still
# computed, so the reported FPS/FLOPs may be inaccurate.
result = model(return_loss=False, rescale=True, **data)
if getattr(model.module, 'with_mask', False):
box_result = result[0][0] # simple_test supported
mask_result = prompt_sam_with_bboxes(sam_predictor, data, box_result)
result = [(box_result, mask_result)]
else:
raise NotImplementedError('WIP!')
batch_size = len(result)
if show or out_dir:
if batch_size == 1 and isinstance(data['img'][0], torch.Tensor):
img_tensor = data['img'][0]
else:
img_tensor = data['img'][0].data[0]
img_metas = data['img_metas'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
assert len(imgs) == len(img_metas)
for i, (img, img_meta) in enumerate(zip(imgs, img_metas)):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
ori_h, ori_w = img_meta['ori_shape'][:-1]
img_show = mmcv.imresize(img_show, (ori_w, ori_h))
if out_dir:
out_file = osp.join(out_dir, img_meta['ori_filename'])
else:
out_file = None
model.module.show_result(
img_show,
result[i],
bbox_color=PALETTE,
text_color=PALETTE,
mask_color=PALETTE,
show=show,
out_file=out_file,
score_thr=show_score_thr)
# encode mask results
if isinstance(result[0], tuple):
result = [(bbox_results, encode_mask_results(mask_results))
for bbox_results, mask_results in result]
# This logic is only used in panoptic segmentation test.
elif isinstance(result[0], dict) and 'ins_results' in result[0]:
for j in range(len(result)):
bbox_results, mask_results = result[j]['ins_results']
result[j]['ins_results'] = (bbox_results,
encode_mask_results(mask_results))
results.extend(result)
for _ in range(batch_size):
prog_bar.update()
return results
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
def prompt_sam_with_bboxes(sam_predictor, data, box_result):
# process detector prediction
# (x1, y1, x2, y2), rescaled in original image space
bboxes = np.concatenate(box_result, axis=0)[..., :4]
if len(bboxes) == 0:
return [[] for _ in range(len(box_result))]
labels = np.concatenate([[i] * len(boxes) for i, boxes in enumerate(box_result)])
# prepare shapes
img_metas = data['img_metas'][0].data[0][0]
original_size = img_metas['ori_shape'][:2]
# prepare input img of sam
sam_predictor.reset_image()
# the image has already been normalized (NOTE: mmdet 2.x normalizes images in the pipeline)
img = data['img'][0].to(sam_predictor.model.device)
# resize the longer side to 1024, keeping aspect ratio (ViT image encoder constraint)
target_size = sam_predictor.transform.get_preprocess_shape(
img.shape[2], img.shape[3],
sam_predictor.transform.target_length)
try:
# `antialias=True` is provided in official implementation of SAM,
# which may raise TypeError in PyTorch of previous versions.
transformed_img = F.interpolate(
img, target_size, mode='bilinear',
align_corners=False, antialias=True)
except TypeError:
transformed_img = F.interpolate(
img, target_size, mode='bilinear', align_corners=False)
# Pad to 1024 x 1024
h, w = transformed_img.shape[-2:]
pad_h = sam_predictor.model.image_encoder.img_size - h
pad_w = sam_predictor.model.image_encoder.img_size - w
transformed_img = F.pad(transformed_img, (0, pad_w, 0, pad_h))
# extract img feature
sam_predictor.features = sam_predictor.model.image_encoder(
transformed_img).to(sam_predictor.model.device)
# set attributes
sam_predictor.original_size = original_size
sam_predictor.input_size = tuple(transformed_img.shape[-2:])
sam_predictor.is_image_set = True
# prepare bboxes and rescale bboxes to relative coordinates
bboxes_tensor = torch.from_numpy(bboxes).to(sam_predictor.model.device)
transformed_boxes = sam_predictor.transform.apply_boxes_torch(bboxes_tensor, original_size)
# prompt with bboxes
batch_masks, _, _ = sam_predictor.predict_torch(
point_coords=None,
point_labels=None,
boxes=transformed_boxes,
multimask_output=False)
batch_masks = batch_masks.squeeze(1).cpu().numpy()
mask_results = [[*batch_masks[labels == i]] for i in range(len(box_result))]
return mask_results
def single_gpu_test(model,
sam_predictor,
data_loader,
show=False,
out_dir=None,
show_score_thr=0.3):
model.eval()
results = []
dataset = data_loader.dataset
PALETTE = getattr(dataset, 'PALETTE', None)
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
# For an instance segmentor, only the box results are used in the
# second stage (prompting SAM with boxes). NOTE the mask head is still
# computed, so the reported FPS/FLOPs may be inaccurate.
result = model(return_loss=False, rescale=True, **data)
if getattr(model.module, 'with_mask', False):
box_result = result[0][0] # simple_test supported
mask_result = prompt_sam_with_bboxes(sam_predictor, data, box_result)
result = [(box_result, mask_result)]
else:
raise NotImplementedError('WIP!')
batch_size = len(result)
if show or out_dir:
if batch_size == 1 and isinstance(data['img'][0], torch.Tensor):
img_tensor = data['img'][0]
else:
img_tensor = data['img'][0].data[0]
img_metas = data['img_metas'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
assert len(imgs) == len(img_metas)
for i, (img, img_meta) in enumerate(zip(imgs, img_metas)):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
ori_h, ori_w = img_meta['ori_shape'][:-1]
img_show = mmcv.imresize(img_show, (ori_w, ori_h))
if out_dir:
out_file = osp.join(out_dir, img_meta['ori_filename'])
else:
out_file = None
model.module.show_result(
img_show,
result[i],
bbox_color=PALETTE,
text_color=PALETTE,
mask_color=PALETTE,
show=show,
out_file=out_file,
score_thr=show_score_thr)
# encode mask results
if isinstance(result[0], tuple):
result = [(bbox_results, encode_mask_results(mask_results))
for bbox_results, mask_results in result]
# This logic is only used in panoptic segmentation test.
elif isinstance(result[0], dict) and 'ins_results' in result[0]:
for j in range(len(result)):
bbox_results, mask_results = result[j]['ins_results']
result[j]['ins_results'] = (bbox_results,
encode_mask_results(mask_results))
results.extend(result)
for _ in range(batch_size):
prog_bar.update()
return results
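For reference, the `(box_result, mask_result)` pairs assembled above follow the MMDetection 2.x result convention: one `(n, 5)` array of `[x1, y1, x2, y2, score]` boxes per class, with a parallel list of binary masks. A small sketch of that structure (hypothetical values):

```python
import numpy as np

num_classes = 2
# box_result[c]: (n_c, 5) array of [x1, y1, x2, y2, score] for class c.
box_result = [np.array([[10., 20., 50., 80., 0.9]]),  # class 0: one detection
              np.zeros((0, 5))]                       # class 1: none
# mask_result[c]: list of per-instance binary masks for class c.
mask_result = [[np.zeros((480, 640), dtype=bool)], []]
result = [(box_result, mask_result)]  # one image in the batch
assert len(result[0][0]) == len(result[0][1]) == num_classes
```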
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
from mmdet.apis import multi_gpu_test
import detection.mmdet_custom # noqa: F401,F403
import detection.mmcv_custom # noqa: F401,F403
from segment_anything import sam_model_registry, SamPredictor
try:
from .engine import single_gpu_test
except ImportError:
from sam.engine import single_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description='Zero-shot instance segmentation evaluation for '
'SAM prompted by MMDet detector')
parser.add_argument('detector_cfg_path',
help='test config file path of MMDet detector')
parser.add_argument('detector_ckpt_path',
help='checkpoint file path of MMDet detector')
parser.add_argument('sam_ckpt_path',
help='checkpoint file path of SAM')
parser.add_argument('--sam_type', default='vit_b',
help='model type of SAM (e.g., vit_b, vit_l, vit_h)')
parser.add_argument('--data_type', default='test', choices=['val', 'test'],
help='run val set or test set')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument('--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--format-only',
action='store_true',
help='Format the output results without performing evaluation. It is '
'useful when you want to format the result to a specific format and '
'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depend on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument('--show-dir',
help='directory where painted images will be saved')
parser.add_argument('--show-score-thr',
type=float,
default=0.3,
help='score threshold (default: 0.3)')
parser.add_argument('--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for the dataset.evaluate() function (deprecated); '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both '
'specified, --options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
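The `--cfg-options` flag above uses mmcv's standard dotted-key override: `DictAction` parses each `a.b.c=value` pair into a flat dict, and `Config.merge_from_dict` folds it into the nested config. A minimal sketch:

```python
from mmcv import Config

# Hypothetical config plus a dotted-key override as produced by DictAction.
cfg = Config(dict(model=dict(backbone=dict(with_cp=False))))
cfg.merge_from_dict({'model.backbone.with_cp': True})
print(cfg.model.backbone.with_cp)  # True
```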
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save, eval, format, or show '
'the results) with the argument "--out", "--eval", '
'"--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.detector_cfg_path)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
if cfg.model.get('neck'):
if isinstance(cfg.model.neck, list):
for neck_cfg in cfg.model.neck:
if neck_cfg.get('rfp_backbone'):
if neck_cfg.rfp_backbone.get('pretrained'):
neck_cfg.rfp_backbone.pretrained = None
elif cfg.model.neck.get('rfp_backbone'):
if cfg.model.neck.rfp_backbone.get('pretrained'):
cfg.model.neck.rfp_backbone.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1)
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distributed testing.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
rank, _ = get_dist_info()
# the work_dir is optional; create it only when specified
if args.work_dir is not None and rank == 0:
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
# build the dataloader
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False)
# build the detector and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.detector_ckpt_path, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints; this workaround is
# for backward compatibility
if 'CLASSES' in checkpoint.get('meta', {}):
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = dataset.CLASSES
if not distributed:
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
# The SamPredictor becomes invalid if the model is wrapped with MMDataParallel.
# A better implementation would avoid the provided SamPredictor API.
sam = sam_model_registry[args.sam_type](
checkpoint=args.sam_ckpt_path).to(
list(model.module.parameters())[0].device)
sam_predictor = SamPredictor(sam)
outputs = single_gpu_test(model, sam_predictor, data_loader, args.show,
args.show_dir, args.show_score_thr)
else:
raise NotImplementedError
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = multi_gpu_test(model, data_loader, args.tmpdir,
args.gpu_collect)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.format_only:
dataset.format_results(outputs, **kwargs)
if args.eval:
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule', 'dynamic_intervals'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
metric = dataset.evaluate(outputs, **eval_kwargs)
print(metric)
metric_dict = dict(config=args.detector_cfg_path, metric=metric)
if args.work_dir is not None and rank == 0:
mmcv.dump(metric_dict, json_file)
if __name__ == '__main__':
main()
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet.apis import multi_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
from segment_anything import SamPredictor, sam_model_registry
import detection.mmcv_custom # noqa: F401,F403
import detection.mmdet_custom # noqa: F401,F403
try:
from .engine import single_gpu_test
except ImportError:
from sam.engine import single_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description='Zero-shot instance segmentation evaluation for '
'SAM prompted by MMDet detector')
parser.add_argument('detector_cfg_path',
help='test config file path of MMDet detector')
parser.add_argument('detector_ckpt_path',
help='checkpoint file path of MMDet detector')
parser.add_argument('sam_ckpt_path',
help='checkpoint file path of SAM')
parser.add_argument('--sam_type', default='vit_b',
help='model type of SAM (e.g., vit_b, vit_l, vit_h)')
parser.add_argument('--data_type', default='test', choices=['val', 'test'],
help='run val set or test set')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument('--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--format-only',
action='store_true',
help='Format the output results without performing evaluation. It is '
'useful when you want to format the result to a specific format and '
'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depend on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument('--show-dir',
help='directory where painted images will be saved')
parser.add_argument('--show-score-thr',
type=float,
default=0.3,
help='score threshold (default: 0.3)')
parser.add_argument('--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for the dataset.evaluate() function (deprecated); '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both '
'specified, --options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save, eval, format, or show '
'the results) with the argument "--out", "--eval", '
'"--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.detector_cfg_path)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
if cfg.model.get('neck'):
if isinstance(cfg.model.neck, list):
for neck_cfg in cfg.model.neck:
if neck_cfg.get('rfp_backbone'):
if neck_cfg.rfp_backbone.get('pretrained'):
neck_cfg.rfp_backbone.pretrained = None
elif cfg.model.neck.get('rfp_backbone'):
if cfg.model.neck.rfp_backbone.get('pretrained'):
cfg.model.neck.rfp_backbone.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1)
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distributed testing.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
rank, _ = get_dist_info()
# the work_dir is optional; create it only when specified
if args.work_dir is not None and rank == 0:
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
# build the dataloader
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False)
# build the detector and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.detector_ckpt_path, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints; this workaround is
# for backward compatibility
if 'CLASSES' in checkpoint.get('meta', {}):
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = dataset.CLASSES
if not distributed:
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
# The SamPredictor becomes invalid if the model is wrapped with MMDataParallel.
# A better implementation would avoid the provided SamPredictor API.
sam = sam_model_registry[args.sam_type](
checkpoint=args.sam_ckpt_path).to(
list(model.module.parameters())[0].device)
sam_predictor = SamPredictor(sam)
outputs = single_gpu_test(model, sam_predictor, data_loader, args.show,
args.show_dir, args.show_score_thr)
else:
raise NotImplementedError
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = multi_gpu_test(model, data_loader, args.tmpdir,
args.gpu_collect)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.format_only:
dataset.format_results(outputs, **kwargs)
if args.eval:
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule', 'dynamic_intervals'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
metric = dataset.evaluate(outputs, **eval_kwargs)
print(metric)
metric_dict = dict(config=args.detector_cfg_path, metric=metric)
if args.work_dir is not None and rank == 0:
mmcv.dump(metric_dict, json_file)
if __name__ == '__main__':
main()
# InternImage for Semantic Segmentation
This folder contains the implementation of InternImage for semantic segmentation.
Our segmentation code is developed on top of [MMSegmentation v0.27.0](https://github.com/open-mmlab/mmsegmentation/tree/v0.27.0).
......@@ -27,6 +27,7 @@ conda activate internimage
- Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`:
For example, to install torch==1.11 with CUDA==11.3 and nvcc:
```bash
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch -y
conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc
......@@ -34,14 +35,14 @@ conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc
- Install other requirements:
Note: the conda opencv package breaks torchvision's GPU support, so install opencv with pip.
```bash
conda install -c conda-forge termcolor yacs pyyaml scipy pip -y
pip install opencv-python
```
- Install `timm`, `mmcv-full`, and `mmsegmentation`:
```bash
pip install -U openmim
......@@ -51,20 +52,21 @@ pip install timm==0.6.11 mmdet==2.28.1
```
- Compile CUDA operators
```bash
cd ./ops_dcnv3
sh ./make.sh
# unit test (should see all checking is True)
python test.py
```
- You can also install the operator using .whl files
[DCNv3-1.0-whl](https://github.com/OpenGVLab/InternImage/releases/tag/whl_files)
### Data Preparation
Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets) in MMSegmentation.
### Evaluation
To evaluate our `InternImage` on ADE20K val, run:
......@@ -72,6 +74,7 @@ To evaluate our `InternImage` on ADE20K val, run:
```bash
sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval mIoU
```
You can download checkpoint files from [here](https://huggingface.co/OpenGVLab/InternImage/tree/fc1e4e7e01c3e7a39a3875bdebb6577a7256ff91), then place them in `segmentation/checkpoint_dir/seg`.
For example, to evaluate the `InternImage-T` with a single GPU:
......@@ -109,19 +112,22 @@ GPUS=8 sh slurm_train.sh <partition> <job-name> configs/ade20k/upernet_internima
```
### Image Demo
To run inference on a single image or a directory of images, use the command below. If you pass a directory instead of a single image, every image in that directory is processed:
```bash
CUDA_VISIBLE_DEVICES=0 python image_demo.py \
data/ade/ADEChallengeData2016/images/validation/ADE_val_00000591.jpg \
configs/ade20k/upernet_internimage_t_512_160k_ade20k.py \
checkpoint_dir/seg/upernet_internimage_t_512_160k_ade20k.pth \
--palette ade20k
```
### Export
To export a segmentation model from PyTorch to TensorRT, run:
```shell
MODEL="model_name"
CKPT_PATH="/path/to/model/ckpt.pth"
......@@ -137,6 +143,7 @@ python deploy.py \
```
For example, to export `upernet_internimage_t_512_160k_ade20k` from PyTorch to TensorRT, run:
```shell
MODEL="upernet_internimage_t_512_160k_ade20k"
CKPT_PATH="/path/to/model/ckpt/upernet_internimage_t_512_160k_ade20k.pth"
......
......@@ -135,4 +135,4 @@ model = dict(
filter_low_score=True),
init_cfg=None)
# find_unused_parameters = True
\ No newline at end of file
# find_unused_parameters = True
......@@ -31,4 +31,4 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
\ No newline at end of file
test_cfg=dict(mode='whole'))
......@@ -4,28 +4,25 @@ Introduced by Zhou et al. in [Scene Parsing Through ADE20K Dataset](https://pape
The ADE20K semantic segmentation dataset contains more than 20K scene-centric images exhaustively annotated with pixel-level object and object-part labels. There are 150 semantic categories in total, covering stuff classes such as sky, road, and grass as well as discrete objects such as person, car, and bed.
## Model Zoo
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-T | 512x512 | 47.9 / 48.1 | 0.23s / iter | 10.5h | 59M | 944G | [config](./upernet_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512_160k_ade20k.log.json) |
| InternImage-S | 512x512 | 50.1 / 50.9 | 0.25s / iter | 11.5h | 80M | 1017G | [config](./upernet_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512_160k_ade20k.log.json) |
| InternImage-B | 512x512 | 50.8 / 51.3 | 0.26s / iter | 12h | 128M | 1185G | [config](./upernet_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512_160k_ade20k.log.json) |
| InternImage-L | 640x640 | 53.9 / 54.1 | 0.42s / iter | 19h | 256M | 2526G | [config](./upernet_internimage_l_640_160k_ade20k.py)| [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_640_160k_ade20k.log.json) |
| InternImage-XL | 640x640 | 55.0 / 55.3 | 0.47s / iter | 22h | 368M | 3142G | [config](./upernet_internimage_xl_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_640_160k_ade20k.log.json) |
| InternImage-H | 896x896 | 59.9 / 60.3 | 0.94s / iter | 2d (2n) | 1.12B | 3566G | [config](./upernet_internimage_h_896_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_h_896_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_h_896_160k_ade20k.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
| :------------: | :--------: | :----------: | :----------: | :--------: | :----: | :---: | :---------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-T | 512x512 | 47.9 / 48.1 | 0.23s / iter | 10.5h | 59M | 944G | [config](./upernet_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512_160k_ade20k.log.json) |
| InternImage-S | 512x512 | 50.1 / 50.9 | 0.25s / iter | 11.5h | 80M | 1017G | [config](./upernet_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512_160k_ade20k.log.json) |
| InternImage-B | 512x512 | 50.8 / 51.3 | 0.26s / iter | 12h | 128M | 1185G | [config](./upernet_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512_160k_ade20k.log.json) |
| InternImage-L | 640x640 | 53.9 / 54.1 | 0.42s / iter | 19h | 256M | 2526G | [config](./upernet_internimage_l_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_640_160k_ade20k.log.json) |
| InternImage-XL | 640x640 | 55.0 / 55.3 | 0.47s / iter | 22h | 368M | 3142G | [config](./upernet_internimage_xl_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_640_160k_ade20k.log.json) |
| InternImage-H | 896x896 | 59.9 / 60.3 | 0.94s / iter | 2d (2n) | 1.12B | 3566G | [config](./upernet_internimage_h_896_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_h_896_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_h_896_160k_ade20k.log.json) |
- Training speed is measured on A100 GPUs.
- Please set `with_cp=True` to save memory if you encounter out-of-memory issues (see the config sketch after this list).
- The logs are from recently retrained models, so their results may differ slightly from those in the paper.
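A sketch of what that `with_cp` override could look like in a config, assuming the InternImage backbone accepts a `with_cp` argument (hypothetical fragment, not a config from this repo):

```python
# Enable activation checkpointing in the backbone to trade compute for memory.
model = dict(
    backbone=dict(
        type='InternImage',
        with_cp=True,  # checkpoint backbone blocks to reduce peak memory
    ),
)
```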
### Mask2Former + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-H | 896x896 | 62.6 / 62.9 | 1.21s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff2ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
| :-----------: | :--------: | :----------: | :----------: | :--------: | :----: | :---: | :------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-H | 896x896 | 62.6 / 62.9 | 1.21s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff2ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.log.json) |
......@@ -161,4 +161,3 @@ optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2)
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
......@@ -8,31 +8,31 @@ Cityscapes is a large-scale database which focuses on semantic understanding of
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:----------:|:-------:|:-----:|:----:|:----:|
| InternImage-T | 512x1024 | 82.58 / 83.40 | 0.32s / iter | 14.5h | 59M | 1889G | [config](./upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| InternImage-S | 512x1024 | 82.74 / 83.45 | 0.36s / iter | 16.5h | 80M | 2035G | [config](./upernet_internimage_s_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| InternImage-B | 512x1024 | 83.18 / 83.97 | 0.39s / iter | 17h | 128M | 2369G | [config](./upernet_internimage_b_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
| InternImage-L | 512x1024 | 83.68 / 84.41 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_cityscapes.log.json) |
| InternImage-XL | 512x1024 | 83.62 / 84.28 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-T | 512x1024 | 82.58 / 83.40 | 0.32s / iter | 14.5h | 59M | 1889G | [config](./upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| InternImage-S | 512x1024 | 82.74 / 83.45 | 0.36s / iter | 16.5h | 80M | 2035G | [config](./upernet_internimage_s_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| InternImage-B | 512x1024 | 83.18 / 83.97 | 0.39s / iter | 17h | 128M | 2369G | [config](./upernet_internimage_b_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
| InternImage-L | 512x1024 | 83.68 / 84.41 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_cityscapes.log.json) |
| InternImage-XL | 512x1024 | 83.62 / 84.28 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_cityscapes.log.json) |
- Training speed is measured on A100 GPUs.
- Please set `with_cp=True` to save memory if you encounter out-of-memory issues.
### UperNet + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:------:|:------------:|
| InternImage-L | 512x1024 | 85.94 / 86.22 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 86.20 / 86.42 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :----------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 85.94 / 86.22 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 86.20 / 86.42 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
### SegFormerHead + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:-----:|:---------:|
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |