Commit 41b18fd8 authored by zhe chen's avatar zhe chen

Use pre-commit to reformat code
parent ff20ea39
......@@ -4,16 +4,14 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import DCNv3
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
import DCNv3
class DCNv3Function(Function):
......@@ -88,6 +86,7 @@ class DCNv3Function(Function):
im2col_step_i=int(im2col_step),
)
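An aside on the `custom_fwd`/`custom_bwd` imports used by this autograd `Function`: they keep a custom op numerically safe under `torch.cuda.amp` autocast. A minimal sketch of the pattern (a toy op for illustration, not the actual DCNv3 kernel):

```python
import torch
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd


class ScaleBy2(Function):
    """Toy op showing the AMP-safe custom autograd Function pattern."""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)  # run in fp32 inside autocast regions
    def forward(ctx, x):
        return x * 2

    @staticmethod
    @custom_bwd  # backward runs under the same autocast state as forward
    def backward(ctx, grad_output):
        return grad_output * 2
```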
def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
_, H_, W_, _ = spatial_shapes
H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
......
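The `H_out` line above is the standard dilated-convolution output-size formula (effective kernel size `dilation * (kernel - 1) + 1`); a quick numeric check with hypothetical values:

```python
# Hypothetical sizes: 16-pixel input, 3x3 kernel, dilation 2, stride 1, no padding.
H_, kernel_h, dilation_h, stride_h = 16, 3, 2, 1
effective_k = dilation_h * (kernel_h - 1) + 1      # 5
H_out = (H_ - effective_k) // stride_h + 1         # (16 - 5) // 1 + 1 = 12
assert H_out == 12
```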
......@@ -4,4 +4,4 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3 import DCNv3, DCNv3_pytorch
\ No newline at end of file
from .dcnv3 import DCNv3, DCNv3_pytorch
......@@ -4,22 +4,24 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import warnings
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from torch import nn
from torch.nn.init import constant_, xavier_uniform_
from ..functions import DCNv3Function, dcnv3_core_pytorch
try:
from DCNv4.functions import DCNv4Function
except ImportError:
    warnings.warn('DCNv4 is not available; falling back to the DCNv3 operator.')
import math
class to_channels_first(nn.Module):
def __init__(self):
......@@ -76,7 +78,7 @@ def build_act_layer(act_layer):
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
"invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
'invalid input for _is_power_of_2: {} (type: {})'.format(n, type(n)))
return (n & (n - 1) == 0) and n != 0
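The `n & (n - 1)` test works because a positive power of two has exactly one set bit; subtracting one clears that bit and sets all lower bits, so the AND is zero only for powers of two (the `n != 0` guard excludes zero). For example:

```python
# 8 -> 0b1000, 7 -> 0b0111, 8 & 7 == 0           -> power of two
# 6 -> 0b0110, 5 -> 0b0101, 6 & 5 == 0b0100 != 0 -> not a power of two
for n in (1, 2, 6, 8, 12, 16):
    print(n, (n & (n - 1) == 0) and n != 0)
# 1 True, 2 True, 6 False, 8 True, 12 False, 16 True
```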
......@@ -128,7 +130,7 @@ class DCNv3_pytorch(nn.Module):
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
......@@ -165,7 +167,7 @@ class DCNv3_pytorch(nn.Module):
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
......@@ -234,7 +236,7 @@ class DCNv3(nn.Module):
norm_layer='LN',
center_feature_scale=False,
use_dcn_v4_op=False,
):
):
"""
DCNv3 Module
:param channels
......@@ -257,7 +259,7 @@ class DCNv3(nn.Module):
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
......@@ -270,7 +272,7 @@ class DCNv3(nn.Module):
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.use_dcn_v4_op = use_dcn_v4_op
self.dw_conv = nn.Sequential(
......@@ -296,7 +298,7 @@ class DCNv3(nn.Module):
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
......@@ -329,7 +331,7 @@ class DCNv3(nn.Module):
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
if not self.use_dcn_v4_op:
mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)
x = DCNv3Function.apply(
......@@ -349,12 +351,12 @@ class DCNv3(nn.Module):
mask = mask.view(N, H, W, self.group, -1)
offset_mask = torch.cat([offset, mask], -1).view(N, H, W, -1).contiguous()
# For efficiency, the last dimension of the offset_mask tensor in dcnv4 is a multiple of 8.
K3 = offset_mask.size(-1)
K3_pad = int(math.ceil(K3/8)*8)
K3_pad = int(math.ceil(K3 / 8) * 8)
pad_dim = K3_pad - K3
offset_mask = torch.cat([offset_mask, offset_mask.new_zeros([*offset_mask.size()[:3], pad_dim])], -1)
x = DCNv4Function.apply(
x, offset_mask,
self.kernel_size, self.kernel_size,
......
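The padding logic above rounds the last dimension of `offset_mask` up to the next multiple of 8, as the preceding comment requires for the DCNv4 kernel. A standalone check of the arithmetic with hypothetical shapes:

```python
import math

import torch

# Hypothetical: group=4, 3x3 kernel -> (2 offsets + 1 mask) per sampling point.
K3 = 4 * 3 * 3 * 3                      # 108
K3_pad = int(math.ceil(K3 / 8) * 8)     # 112
pad_dim = K3_pad - K3                   # 4
offset_mask = torch.randn(2, 8, 8, K3)
offset_mask = torch.cat(
    [offset_mask, offset_mask.new_zeros(2, 8, 8, pad_dim)], -1)
assert offset_mask.size(-1) % 8 == 0    # padded to 112
```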
......@@ -4,39 +4,34 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import glob
import os
import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
requirements = ['torch', 'torchvision']
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
extensions_dir = os.path.join(this_dir, 'src')
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp'))
source_cpu = glob.glob(os.path.join(extensions_dir, 'cpu', '*.cpp'))
source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
extra_compile_args = {'cxx': []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
define_macros += [('WITH_CUDA', None)]
extra_compile_args['nvcc'] = [
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
......@@ -49,7 +44,7 @@ def get_extensions():
include_dirs = [extensions_dir]
ext_modules = [
extension(
"DCNv3",
'DCNv3',
sources,
include_dirs=include_dirs,
define_macros=define_macros,
......@@ -60,16 +55,16 @@ def get_extensions():
setup(
name="DCNv3",
version="1.0",
author="InternImage",
url="https://github.com/OpenGVLab/InternImage",
name='DCNv3',
version='1.0',
author='InternImage',
url='https://github.com/OpenGVLab/InternImage',
description=
"PyTorch Wrapper for CUDA Functions of DCNv3",
'PyTorch Wrapper for CUDA Functions of DCNv3',
packages=find_packages(exclude=(
"configs",
"tests",
'configs',
'tests',
)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
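Once the extension is built (the repo's `make.sh` runs this `setup.py`), importing it is a quick sanity check; a minimal sketch, assuming the build succeeded:

```python
# Hypothetical smoke test for the compiled extension.
import torch  # import torch first so the extension can find libtorch symbols

import DCNv3  # the CppExtension/CUDAExtension defined above

# The module should expose the forward/backward bindings compiled from src/.
print([name for name in dir(DCNv3) if not name.startswith('_')])
```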
......@@ -171,4 +171,4 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
} else {
return {grad_input, grad_offset, grad_mask};
}
}
\ No newline at end of file
}
......@@ -1042,4 +1042,4 @@ void dcnv3_col2im_cuda(
if (err != cudaSuccess) {
printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
\ No newline at end of file
}
......@@ -4,17 +4,15 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, division, print_function
import math
import time
import torch
import torch.nn as nn
import math
from torch.autograd import gradcheck
from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
from torch.autograd import gradcheck
H_in, W_in = 8, 8
N, M, D = 2, 4, 16
......
......@@ -22,4 +22,4 @@ srun -p ${PARTITION} \
--kill-on-bad-exit=1 \
--quotatype=spot \
${SRUN_ARGS} \
python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
\ No newline at end of file
python -u train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
......@@ -11,6 +11,8 @@ import time
import warnings
import mmcv
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
......@@ -21,8 +23,6 @@ from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
import mmdet_custom # noqa: F401,F403
import mmcv_custom # noqa: F401,F403
def parse_args():
......
import argparse
import concurrent.futures
import json
import os
import pickle as pkl
import numpy as np
import random
from PIL import Image
import concurrent.futures
import json
import mmcv
import numpy as np
from PIL import Image
def parse_args():
parser = argparse.ArgumentParser(description='Generate MMDetection Annotations for Crowdhuman-like dataset')
......@@ -16,6 +18,7 @@ def parse_args():
args = parser.parse_args()
return args.dataset, args.dataset_split
def load_func(fpath):
assert os.path.exists(fpath)
with open(fpath, 'r') as fid:
......@@ -23,6 +26,7 @@ def load_func(fpath):
records = [json.loads(line.strip('\n')) for line in lines]
return records
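For context, each line of a CrowdHuman-style `.odgt` file is a self-contained JSON object, which is why `load_func` can parse it line by line. A hypothetical record (field names follow the CrowdHuman convention; values are made up):

```python
import json

# One .odgt line: an image ID plus its ground-truth boxes.
line = json.dumps({
    'ID': 'some_image_id',
    'gtboxes': [
        {'tag': 'person',
         'fbox': [72, 202, 163, 503],   # full-body box (x, y, w, h)
         'vbox': [72, 202, 163, 398],   # visible-region box
         'hbox': [171, 208, 62, 83]},   # head box
    ],
})
record = json.loads(line.strip('\n'))
print(record['gtboxes'][0]['tag'])  # person
```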
def decode_annotations(records, dataset_path):
rec_ids = list(range(len(records)))
img_list = []
......@@ -80,16 +84,17 @@ def decode_annotations(records, dataset_path):
)
return json_dict
if __name__ == "__main__":
if __name__ == '__main__':
dataset_name, dataset_type = parse_args()
dataset_path = 'data/%s/' % dataset_name
ch_file_path = dataset_path + 'annotations/annotation_%s.odgt' % dataset_type
json_file_path = dataset_path + 'annotations/annotation_%s.json' % dataset_type
records = load_func(ch_file_path)
print("Loading Annotations Done")
print('Loading Annotations Done')
json_dict = decode_annotations(records, dataset_path)
print("Parsing Bbox Number: %d" % len(json_dict['annotations']))
print('Parsing Bbox Number: %d' % len(json_dict['annotations']))
mmcv.dump(json_dict, json_file_path)
from .compute_APMR import compute_APMR
from .compute_JI import compute_JI_with_ignore
\ No newline at end of file
from .compute_JI import compute_JI_with_ignore
......@@ -12,12 +12,13 @@ import time
import warnings
import mmcv
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcv.utils import get_git_hash
from mmdet import __version__
from mmdet.apis import init_random_seed, set_random_seed, train_detector
from mmdet.datasets import build_dataset
......@@ -25,8 +26,6 @@ from mmdet.models import build_detector
from mmdet.utils import (collect_env, get_device, get_root_logger,
replace_cfg_vals, setup_multi_processes,
update_data_root)
import mmcv_custom # noqa: F401,F403
import mmdet_custom # noqa: F401,F403
def parse_args():
......@@ -244,4 +243,4 @@ def main():
if __name__ == '__main__':
main()
\ No newline at end of file
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import pickle
import shutil
import tempfile
import time
import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
import mmcv
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
def prompt_sam_with_bboxes(sam_predictor, data, box_result):
# process detector prediction
# (x1, y1, x2, y2), rescaled in original image space
bboxes = np.concatenate(box_result, axis=0)[..., :4]
if len(bboxes) == 0:
return [[] for _ in range(len(box_result))]
labels = np.concatenate([[i] * len(boxes) for i, boxes in enumerate(box_result)])
# prepare shapes
img_metas = data['img_metas'][0].data[0][0]
original_size = img_metas['ori_shape'][:2]
# prepare input img of sam
sam_predictor.reset_image()
# the image has already been normalized (NOTE: mmdet 2.x normalizes images in the pipeline)
img = data['img'][0].to(sam_predictor.model.device)
# resize the longer side to 1024, keeping aspect ratio (ViT image encoder constraint)
target_size = sam_predictor.transform.get_preprocess_shape(
img.shape[2], img.shape[3],
sam_predictor.transform.target_length)
try:
# `antialias=True` is provided in official implementation of SAM,
# which may raise TypeError in PyTorch of previous versions.
transformed_img = F.interpolate(
img, target_size, mode="bilinear",
align_corners=False, antialias=True)
except TypeError:
transformed_img = F.interpolate(
img, target_size, mode="bilinear", align_corners=False)
# Pad to 1024 x 1024
h, w = transformed_img.shape[-2:]
pad_h = sam_predictor.model.image_encoder.img_size - h
pad_w = sam_predictor.model.image_encoder.img_size - w
transformed_img = F.pad(transformed_img, (0, pad_w, 0, pad_h))
# extract img feature
sam_predictor.features = sam_predictor.model.image_encoder(
transformed_img).to(sam_predictor.model.device)
# set attributes
sam_predictor.original_size = original_size
sam_predictor.input_size = tuple(transformed_img.shape[-2:])
sam_predictor.is_image_set = True
# prepare bboxes and rescale bboxes to relative coordinates
bboxes_tensor = torch.from_numpy(bboxes).to(sam_predictor.model.device)
transformed_boxes = sam_predictor.transform.apply_boxes_torch(bboxes_tensor, original_size)
# prompt with bboxes
batch_masks, _, _ = sam_predictor.predict_torch(
point_coords=None,
point_labels=None,
boxes=transformed_boxes,
multimask_output=False)
batch_masks = batch_masks.squeeze(1).cpu().numpy()
mask_results = [[*batch_masks[labels == i]] for i in range(len(box_result))]
return mask_results
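The `get_preprocess_shape` call above scales the longer image side to SAM's `target_length` (1024 for the ViT encoder) while keeping the aspect ratio; the remainder is then zero-padded to a square. A standalone re-derivation of that arithmetic, assuming the rounding used by SAM's resize transform:

```python
def preprocess_shape(oldh: int, oldw: int, long_side: int = 1024):
    """Scale so max(h, w) == long_side, preserving aspect ratio."""
    scale = long_side / max(oldh, oldw)
    return int(oldh * scale + 0.5), int(oldw * scale + 0.5)

h, w = preprocess_shape(800, 1333)  # a typical detector input size
print(h, w)                         # 615 1024
print(1024 - h, 1024 - w)           # pad_h = 409, pad_w = 0
```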
def single_gpu_test(model,
sam_predictor,
data_loader,
show=False,
out_dir=None,
show_score_thr=0.3):
model.eval()
results = []
dataset = data_loader.dataset
PALETTE = getattr(dataset, 'PALETTE', None)
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
# For an instance segmentor, only the box results are used in the
# second stage (prompting SAM with boxes). NOTE the mask head is still
# computed, so the reported FPS/FLOPs may be inaccurate.
result = model(return_loss=False, rescale=True, **data)
if getattr(model.module, 'with_mask', False):
box_result = result[0][0] # simple_test supported
mask_result = prompt_sam_with_bboxes(sam_predictor, data, box_result)
result = [(box_result, mask_result)]
else:
raise NotImplementedError('WIP!')
batch_size = len(result)
if show or out_dir:
if batch_size == 1 and isinstance(data['img'][0], torch.Tensor):
img_tensor = data['img'][0]
else:
img_tensor = data['img'][0].data[0]
img_metas = data['img_metas'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
assert len(imgs) == len(img_metas)
for i, (img, img_meta) in enumerate(zip(imgs, img_metas)):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
ori_h, ori_w = img_meta['ori_shape'][:-1]
img_show = mmcv.imresize(img_show, (ori_w, ori_h))
if out_dir:
out_file = osp.join(out_dir, img_meta['ori_filename'])
else:
out_file = None
model.module.show_result(
img_show,
result[i],
bbox_color=PALETTE,
text_color=PALETTE,
mask_color=PALETTE,
show=show,
out_file=out_file,
score_thr=show_score_thr)
# encode mask results
if isinstance(result[0], tuple):
result = [(bbox_results, encode_mask_results(mask_results))
for bbox_results, mask_results in result]
# This logic is only used in panoptic segmentation test.
elif isinstance(result[0], dict) and 'ins_results' in result[0]:
for j in range(len(result)):
bbox_results, mask_results = result[j]['ins_results']
result[j]['ins_results'] = (bbox_results,
encode_mask_results(mask_results))
results.extend(result)
for _ in range(batch_size):
prog_bar.update()
return results
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
def prompt_sam_with_bboxes(sam_predictor, data, box_result):
# process detector prediction
# (x1, y1, x2, y2), rescaled in original image space
bboxes = np.concatenate(box_result, axis=0)[..., :4]
if len(bboxes) == 0:
return [[] for _ in range(len(box_result))]
labels = np.concatenate([[i] * len(boxes) for i, boxes in enumerate(box_result)])
# prepare shapes
img_metas = data['img_metas'][0].data[0][0]
original_size = img_metas['ori_shape'][:2]
# prepare input img of sam
sam_predictor.reset_image()
# the image has already been normalized (NOTE: mmdet 2.x normalizes images in the pipeline)
img = data['img'][0].to(sam_predictor.model.device)
# resize the longer side to 1024, keeping aspect ratio (ViT image encoder constraint)
target_size = sam_predictor.transform.get_preprocess_shape(
img.shape[2], img.shape[3],
sam_predictor.transform.target_length)
try:
# `antialias=True` is provided in official implementation of SAM,
# which may raise TypeError in PyTorch of previous versions.
transformed_img = F.interpolate(
img, target_size, mode='bilinear',
align_corners=False, antialias=True)
except TypeError:
transformed_img = F.interpolate(
img, target_size, mode='bilinear', align_corners=False)
# Pad to 1024 x 1024
h, w = transformed_img.shape[-2:]
pad_h = sam_predictor.model.image_encoder.img_size - h
pad_w = sam_predictor.model.image_encoder.img_size - w
transformed_img = F.pad(transformed_img, (0, pad_w, 0, pad_h))
# extract img feature
sam_predictor.features = sam_predictor.model.image_encoder(
transformed_img).to(sam_predictor.model.device)
# set attributes
sam_predictor.original_size = original_size
sam_predictor.input_size = tuple(transformed_img.shape[-2:])
sam_predictor.is_image_set = True
# prepare bboxes and rescale bboxes to relative coordinates
bboxes_tensor = torch.from_numpy(bboxes).to(sam_predictor.model.device)
transformed_boxes = sam_predictor.transform.apply_boxes_torch(bboxes_tensor, original_size)
# prompt with bboxes
batch_masks, _, _ = sam_predictor.predict_torch(
point_coords=None,
point_labels=None,
boxes=transformed_boxes,
multimask_output=False)
batch_masks = batch_masks.squeeze(1).cpu().numpy()
mask_results = [[*batch_masks[labels == i]] for i in range(len(box_result))]
return mask_results
def single_gpu_test(model,
sam_predictor,
data_loader,
show=False,
out_dir=None,
show_score_thr=0.3):
model.eval()
results = []
dataset = data_loader.dataset
PALETTE = getattr(dataset, 'PALETTE', None)
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
# For an instance segmentor, only the box results are used in the
# second stage (prompting SAM with boxes). NOTE the mask head is still
# computed, so the reported FPS/FLOPs may be inaccurate.
result = model(return_loss=False, rescale=True, **data)
if getattr(model.module, 'with_mask', False):
box_result = result[0][0] # simple_test supported
mask_result = prompt_sam_with_bboxes(sam_predictor, data, box_result)
result = [(box_result, mask_result)]
else:
raise NotImplementedError('WIP!')
batch_size = len(result)
if show or out_dir:
if batch_size == 1 and isinstance(data['img'][0], torch.Tensor):
img_tensor = data['img'][0]
else:
img_tensor = data['img'][0].data[0]
img_metas = data['img_metas'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
assert len(imgs) == len(img_metas)
for i, (img, img_meta) in enumerate(zip(imgs, img_metas)):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
ori_h, ori_w = img_meta['ori_shape'][:-1]
img_show = mmcv.imresize(img_show, (ori_w, ori_h))
if out_dir:
out_file = osp.join(out_dir, img_meta['ori_filename'])
else:
out_file = None
model.module.show_result(
img_show,
result[i],
bbox_color=PALETTE,
text_color=PALETTE,
mask_color=PALETTE,
show=show,
out_file=out_file,
score_thr=show_score_thr)
# encode mask results
if isinstance(result[0], tuple):
result = [(bbox_results, encode_mask_results(mask_results))
for bbox_results, mask_results in result]
# This logic is only used in panoptic segmentation test.
elif isinstance(result[0], dict) and 'ins_results' in result[0]:
for j in range(len(result)):
bbox_results, mask_results = result[j]['ins_results']
result[j]['ins_results'] = (bbox_results,
encode_mask_results(mask_results))
results.extend(result)
for _ in range(batch_size):
prog_bar.update()
return results
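For reference, the `(box_result, mask_result)` pairs assembled above follow the MMDetection 2.x result convention: one `(n, 5)` array of `[x1, y1, x2, y2, score]` boxes per class, with a parallel list of binary masks. A small sketch of that structure (hypothetical values):

```python
import numpy as np

num_classes = 2
# box_result[c]: (n_c, 5) array of [x1, y1, x2, y2, score] for class c.
box_result = [np.array([[10., 20., 50., 80., 0.9]]),  # class 0: one detection
              np.zeros((0, 5))]                       # class 1: none
# mask_result[c]: list of per-instance binary masks for class c.
mask_result = [[np.zeros((480, 640), dtype=bool)], []]
result = [(box_result, mask_result)]  # one image in the batch
assert len(result[0][0]) == len(result[0][1]) == num_classes
```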
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
from mmdet.apis import multi_gpu_test
import detection.mmdet_custom # noqa: F401,F403
import detection.mmcv_custom # noqa: F401,F403
from segment_anything import sam_model_registry, SamPredictor
try:
from .engine import single_gpu_test
except ImportError:
from sam.engine import single_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description='Zero-shot instance segmentation evaluation for '
'SAM prompted by MMDet detector')
parser.add_argument('detector_cfg_path',
help='test config file path of MMDet detector')
parser.add_argument('detector_ckpt_path',
help='checkpoint file path of MMDet detector')
parser.add_argument('sam_ckpt_path',
help='checkpoint file path of SAM')
parser.add_argument('--sam_type', default='vit_b',
help='model type of SAM (e.g., vit_b, vit_l, vit_h)')
parser.add_argument('--data_type', default='test', choices=['val', 'test'],
help='run val set or test set')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument('--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--format-only',
action='store_true',
help='Format the output results without performing evaluation. It is '
'useful when you want to format the result to a specific format and '
'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depend on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument('--show-dir',
help='directory where painted images will be saved')
parser.add_argument('--show-score-thr',
type=float,
default=0.3,
help='score threshold (default: 0.3)')
parser.add_argument('--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for the dataset.evaluate() function (deprecated); '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both '
'specified, --options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
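The `--cfg-options` flag above uses mmcv's standard dotted-key override: `DictAction` parses each `a.b.c=value` pair into a flat dict, and `Config.merge_from_dict` folds it into the nested config. A minimal sketch:

```python
from mmcv import Config

# Hypothetical config plus a dotted-key override as produced by DictAction.
cfg = Config(dict(model=dict(backbone=dict(with_cp=False))))
cfg.merge_from_dict({'model.backbone.with_cp': True})
print(cfg.model.backbone.with_cp)  # True
```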
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save, eval, format, or show '
'the results) with the argument "--out", "--eval", '
'"--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.detector_cfg_path)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
if cfg.model.get('neck'):
if isinstance(cfg.model.neck, list):
for neck_cfg in cfg.model.neck:
if neck_cfg.get('rfp_backbone'):
if neck_cfg.rfp_backbone.get('pretrained'):
neck_cfg.rfp_backbone.pretrained = None
elif cfg.model.neck.get('rfp_backbone'):
if cfg.model.neck.rfp_backbone.get('pretrained'):
cfg.model.neck.rfp_backbone.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1)
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distributed testing.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
rank, _ = get_dist_info()
# the work_dir is optional; create it only when specified
if args.work_dir is not None and rank == 0:
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
# build the dataloader
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False)
# build the detector and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.detector_ckpt_path, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints; this workaround is
# for backward compatibility
if 'CLASSES' in checkpoint.get('meta', {}):
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = dataset.CLASSES
if not distributed:
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
# The SamPredictor becomes invalid if the model is wrapped with MMDataParallel.
# A better implementation would avoid the provided SamPredictor API.
sam = sam_model_registry[args.sam_type](
checkpoint=args.sam_ckpt_path).to(
list(model.module.parameters())[0].device)
sam_predictor = SamPredictor(sam)
outputs = single_gpu_test(model, sam_predictor, data_loader, args.show,
args.show_dir, args.show_score_thr)
else:
raise NotImplementedError
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = multi_gpu_test(model, data_loader, args.tmpdir,
args.gpu_collect)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.format_only:
dataset.format_results(outputs, **kwargs)
if args.eval:
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule', 'dynamic_intervals'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
metric = dataset.evaluate(outputs, **eval_kwargs)
print(metric)
metric_dict = dict(config=args.detector_cfg_path, metric=metric)
if args.work_dir is not None and rank == 0:
mmcv.dump(metric_dict, json_file)
if __name__ == '__main__':
main()
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet.apis import multi_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
from segment_anything import SamPredictor, sam_model_registry
import detection.mmcv_custom # noqa: F401,F403
import detection.mmdet_custom # noqa: F401,F403
try:
from .engine import single_gpu_test
except ImportError:
from sam.engine import single_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description='Zero-shot instance segmentation evaluation for '
'SAM prompted by MMDet detector')
parser.add_argument('detector_cfg_path',
help='test config file path of MMDet detector')
parser.add_argument('detector_ckpt_path',
help='checkpoint file path of MMDet detector')
parser.add_argument('sam_ckpt_path',
help='checkpoint file path of SAM')
parser.add_argument('--sam_type', default='vit_b',
help='model type of SAM (e.g., vit_b, vit_l, vit_h)')
parser.add_argument('--data_type', default='test', choices=['val', 'test'],
help='run val set or test set')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument('--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--format-only',
action='store_true',
help='Format the output results without performing evaluation. It is '
'useful when you want to format the result to a specific format and '
'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depend on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument('--show-dir',
help='directory where painted images will be saved')
parser.add_argument('--show-score-thr',
type=float,
default=0.3,
help='score threshold (default: 0.3)')
parser.add_argument('--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for the dataset.evaluate() function (deprecated); '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both '
'specified, --options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save, eval, format, or show '
'the results) with the argument "--out", "--eval", '
'"--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.detector_cfg_path)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
if cfg.model.get('neck'):
if isinstance(cfg.model.neck, list):
for neck_cfg in cfg.model.neck:
if neck_cfg.get('rfp_backbone'):
if neck_cfg.rfp_backbone.get('pretrained'):
neck_cfg.rfp_backbone.pretrained = None
elif cfg.model.neck.get('rfp_backbone'):
if cfg.model.neck.rfp_backbone.get('pretrained'):
cfg.model.neck.rfp_backbone.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1)
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distributed testing.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
rank, _ = get_dist_info()
# the work_dir is optional; create it only when specified
if args.work_dir is not None and rank == 0:
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
# build the dataloader
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False)
# build the detector and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.detector_ckpt_path, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints; this workaround is
# for backward compatibility
if 'CLASSES' in checkpoint.get('meta', {}):
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = dataset.CLASSES
if not distributed:
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
# The SamPredictor becomes invalid if the model is wrapped with MMDataParallel.
# A better implementation would avoid the provided SamPredictor API.
sam = sam_model_registry[args.sam_type](
checkpoint=args.sam_ckpt_path).to(
list(model.module.parameters())[0].device)
sam_predictor = SamPredictor(sam)
outputs = single_gpu_test(model, sam_predictor, data_loader, args.show,
args.show_dir, args.show_score_thr)
else:
raise NotImplementedError
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = multi_gpu_test(model, data_loader, args.tmpdir,
args.gpu_collect)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.format_only:
dataset.format_results(outputs, **kwargs)
if args.eval:
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule', 'dynamic_intervals'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
metric = dataset.evaluate(outputs, **eval_kwargs)
print(metric)
metric_dict = dict(config=args.detector_cfg_path, metric=metric)
if args.work_dir is not None and rank == 0:
mmcv.dump(metric_dict, json_file)
if __name__ == '__main__':
main()
# InternImage for Semantic Segmentation
This folder contains the implementation of InternImage for semantic segmentation.
Our segmentation code is developed on top of [MMSegmentation v0.27.0](https://github.com/open-mmlab/mmsegmentation/tree/v0.27.0).
......@@ -27,6 +27,7 @@ conda activate internimage
- Install `PyTorch>=1.10.0` and `torchvision>=0.9.0` with `CUDA>=10.2`:
For example, to install torch==1.11 with CUDA==11.3 and nvcc:
```bash
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch -y
conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc
......@@ -34,14 +35,14 @@ conda install -c conda-forge cudatoolkit-dev=11.3 -y # to install nvcc
- Install other requirements:
Note: the conda opencv package breaks torchvision's GPU support, so install opencv with pip.
```bash
conda install -c conda-forge termcolor yacs pyyaml scipy pip -y
pip install opencv-python
```
- Install `timm`, `mmcv-full`, and `mmsegmentation`:
```bash
pip install -U openmim
......@@ -51,20 +52,21 @@ pip install timm==0.6.11 mmdet==2.28.1
```
- Compile CUDA operators
```bash
cd ./ops_dcnv3
sh ./make.sh
# unit test (should see all checking is True)
python test.py
```
- You can also install the operator using .whl files
[DCNv3-1.0-whl](https://github.com/OpenGVLab/InternImage/releases/tag/whl_files)
### Data Preparation
Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets) in MMSegmentation.
### Evaluation
To evaluate our `InternImage` on ADE20K val, run:
......@@ -72,6 +74,7 @@ To evaluate our `InternImage` on ADE20K val, run:
```bash
sh dist_test.sh <config-file> <checkpoint> <gpu-num> --eval mIoU
```
You can download checkpoint files from [here](https://huggingface.co/OpenGVLab/InternImage/tree/fc1e4e7e01c3e7a39a3875bdebb6577a7256ff91), then place them in `segmentation/checkpoint_dir/seg`.
For example, to evaluate the `InternImage-T` with a single GPU:
......@@ -109,19 +112,22 @@ GPUS=8 sh slurm_train.sh <partition> <job-name> configs/ade20k/upernet_internima
```
### Image Demo
To run inference on a single image or a directory of images, use the command below. If you pass a directory instead of a single image, every image in that directory is processed:
```bash
CUDA_VISIBLE_DEVICES=0 python image_demo.py \
data/ade/ADEChallengeData2016/images/validation/ADE_val_00000591.jpg \
configs/ade20k/upernet_internimage_t_512_160k_ade20k.py \
checkpoint_dir/seg/upernet_internimage_t_512_160k_ade20k.pth \
--palette ade20k
```
### Export
To export a segmentation model from PyTorch to TensorRT, run:
```shell
MODEL="model_name"
CKPT_PATH="/path/to/model/ckpt.pth"
......@@ -137,6 +143,7 @@ python deploy.py \
```
For example, to export `upernet_internimage_t_512_160k_ade20k` from PyTorch to TensorRT, run:
```shell
MODEL="upernet_internimage_t_512_160k_ade20k"
CKPT_PATH="/path/to/model/ckpt/upernet_internimage_t_512_160k_ade20k.pth"
......
......@@ -135,4 +135,4 @@ model = dict(
filter_low_score=True),
init_cfg=None)
# find_unused_parameters = True
\ No newline at end of file
# find_unused_parameters = True
......@@ -31,4 +31,4 @@ model = dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
\ No newline at end of file
test_cfg=dict(mode='whole'))
......@@ -4,28 +4,25 @@ Introduced by Zhou et al. in [Scene Parsing Through ADE20K Dataset](https://pape
The ADE20K semantic segmentation dataset contains more than 20K scene-centric images exhaustively annotated with pixel-level object and object-part labels. There are 150 semantic categories in total, covering stuff classes such as sky, road, and grass as well as discrete objects such as person, car, and bed.
## Model Zoo
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-T | 512x512 | 47.9 / 48.1 | 0.23s / iter | 10.5h | 59M | 944G | [config](./upernet_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512_160k_ade20k.log.json) |
| InternImage-S | 512x512 | 50.1 / 50.9 | 0.25s / iter | 11.5h | 80M | 1017G | [config](./upernet_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512_160k_ade20k.log.json) |
| InternImage-B | 512x512 | 50.8 / 51.3 | 0.26s / iter | 12h | 128M | 1185G | [config](./upernet_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512_160k_ade20k.log.json) |
| InternImage-L | 640x640 | 53.9 / 54.1 | 0.42s / iter | 19h | 256M | 2526G | [config](./upernet_internimage_l_640_160k_ade20k.py)| [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_640_160k_ade20k.log.json) |
| InternImage-XL | 640x640 | 55.0 / 55.3 | 0.47s / iter | 22h | 368M | 3142G | [config](./upernet_internimage_xl_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_640_160k_ade20k.log.json) |
| InternImage-H | 896x896 | 59.9 / 60.3 | 0.94s / iter | 2d (2n) | 1.12B | 3566G | [config](./upernet_internimage_h_896_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_h_896_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_h_896_160k_ade20k.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
| :------------: | :--------: | :----------: | :----------: | :--------: | :----: | :---: | :---------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-T | 512x512 | 47.9 / 48.1 | 0.23s / iter | 10.5h | 59M | 944G | [config](./upernet_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512_160k_ade20k.log.json) |
| InternImage-S | 512x512 | 50.1 / 50.9 | 0.25s / iter | 11.5h | 80M | 1017G | [config](./upernet_internimage_s_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512_160k_ade20k.log.json) |
| InternImage-B | 512x512 | 50.8 / 51.3 | 0.26s / iter | 12h | 128M | 1185G | [config](./upernet_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512_160k_ade20k.log.json) |
| InternImage-L | 640x640 | 53.9 / 54.1 | 0.42s / iter | 19h | 256M | 2526G | [config](./upernet_internimage_l_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_640_160k_ade20k.log.json) |
| InternImage-XL | 640x640 | 55.0 / 55.3 | 0.47s / iter | 22h | 368M | 3142G | [config](./upernet_internimage_xl_640_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_640_160k_ade20k.log.json) |
| InternImage-H | 896x896 | 59.9 / 60.3 | 0.94s / iter | 2d (2n) | 1.12B | 3566G | [config](./upernet_internimage_h_896_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_h_896_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_h_896_160k_ade20k.log.json) |
- Training speed is measured on A100 GPUs.
- Please set `with_cp=True` to save memory if you encounter out-of-memory issues (see the config sketch after this list).
- The logs are from recently retrained models, so their results may differ slightly from those in the paper.
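A sketch of what that `with_cp` override could look like in a config, assuming the InternImage backbone accepts a `with_cp` argument (hypothetical fragment, not a config from this repo):

```python
# Enable activation checkpointing in the backbone to trade compute for memory.
model = dict(
    backbone=dict(
        type='InternImage',
        with_cp=True,  # checkpoint backbone blocks to reduce peak memory
    ),
)
```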
### Mask2Former + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-H | 896x896 | 62.6 / 62.9 | 1.21s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff2ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
| :-----------: | :--------: | :----------: | :----------: | :--------: | :----: | :---: | :------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-H | 896x896 | 62.6 / 62.9 | 1.21s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff2ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.log.json) |
......@@ -161,4 +161,3 @@ optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2)
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
......@@ -8,31 +8,31 @@ Cityscapes is a large-scale database which focuses on semantic understanding of
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:----------:|:-------:|:-----:|:----:|:----:|
| InternImage-T | 512x1024 | 82.58 / 83.40 | 0.32s / iter | 14.5h | 59M | 1889G | [config](./upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| InternImage-S | 512x1024 | 82.74 / 83.45 | 0.36s / iter | 16.5h | 80M | 2035G | [config](./upernet_internimage_s_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| InternImage-B | 512x1024 | 83.18 / 83.97 | 0.39s / iter | 17h | 128M | 2369G | [config](./upernet_internimage_b_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
| InternImage-L | 512x1024 | 83.68 / 84.41 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_cityscapes.log.json) |
| InternImage-XL | 512x1024 | 83.62 / 84.28 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_cityscapes.py) |[ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-T | 512x1024 | 82.58 / 83.40 | 0.32s / iter | 14.5h | 59M | 1889G | [config](./upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| InternImage-S | 512x1024 | 82.74 / 83.45 | 0.36s / iter | 16.5h | 80M | 2035G | [config](./upernet_internimage_s_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| InternImage-B | 512x1024 | 83.18 / 83.97 | 0.39s / iter | 17h | 128M | 2369G | [config](./upernet_internimage_b_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
| InternImage-L | 512x1024 | 83.68 / 84.41 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_cityscapes.log.json) |
| InternImage-XL | 512x1024 | 83.62 / 84.28 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_cityscapes.log.json) |
- Training speed is measured on A100 GPUs.
- Please set `with_cp=True` to save memory if you encounter out-of-memory issues.
### UperNet + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:------:|:------------:|
| InternImage-L | 512x1024 | 85.94 / 86.22 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 86.20 / 86.42 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :----------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 85.94 / 86.22 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 86.20 / 86.42 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
### SegFormerHead + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:-----:|:---------:|
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |