v1.0

3a6df602 · chenzk · 3a6df602 · 3a6df602 · 3a6df602 · 3a6df602
Commit 3a6df602 authored Jun 13, 2024 by chenzk
20 changed files
--- a/segmentation/tools/dist_train.sh
+++ b/segmentation/tools/dist_train.sh
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+NCCL_P2P_DISABLE=1 \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/train.py \
+    $CONFIG \
+    --launcher pytorch ${@:3}
--- a/segmentation/tools/get_flops.py
+++ b/segmentation/tools/get_flops.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+from mmcv import Config
+from mmcv.cnn import get_model_complexity_info
+
+from mmseg.models import build_segmentor
+import sys 
+sys.path.append("..") 
+import xformer
+import pvt
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a segmentor')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[2048, 1024],
+        help='input image size')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+
+    args = parse_args()
+
+    if len(args.shape) == 1:
+        input_shape = (3, args.shape[0], args.shape[0])
+    elif len(args.shape) == 2:
+        input_shape = (3, ) + tuple(args.shape)
+    else:
+        raise ValueError('invalid input shape')
+
+    cfg = Config.fromfile(args.config)
+    cfg.model.pretrained = None
+    model = build_segmentor(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg')).cuda()
+    model.eval()
+
+    if hasattr(model, 'forward_dummy'):
+        model.forward = model.forward_dummy
+    else:
+        raise NotImplementedError(
+            'FLOPs counter is currently not currently supported with {}'.
+            format(model.__class__.__name__))
+
+    flops, params = get_model_complexity_info(model, input_shape)
+    split_line = '=' * 30
+    print('{0}\nInput shape: {1}\nFlops: {2}\nParams: {3}\n{0}'.format(
+        split_line, input_shape, flops, params))
+    print('!!!Please be cautious if you use the results in papers. '
+          'You may need to check if all ops are supported and verify that the '
+          'flops computation is correct.')
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/model_converters/mit2mmseg.py
+++ b/segmentation/tools/model_converters/mit2mmseg.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from collections import OrderedDict
+
+import mmcv
+import torch
+from mmcv.runner import CheckpointLoader
+
+
+def convert_mit(ckpt):
+    new_ckpt = OrderedDict()
+    # Process the concat between q linear weights and kv linear weights
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        # patch embedding conversion
+        elif k.startswith('patch_embed'):
+            stage_i = int(k.split('.')[0].replace('patch_embed', ''))
+            new_k = k.replace(f'patch_embed{stage_i}', f'layers.{stage_i-1}.0')
+            new_v = v
+            if 'proj.' in new_k:
+                new_k = new_k.replace('proj.', 'projection.')
+        # transformer encoder layer conversion
+        elif k.startswith('block'):
+            stage_i = int(k.split('.')[0].replace('block', ''))
+            new_k = k.replace(f'block{stage_i}', f'layers.{stage_i-1}.1')
+            new_v = v
+            if 'attn.q.' in new_k:
+                sub_item_k = k.replace('q.', 'kv.')
+                new_k = new_k.replace('q.', 'attn.in_proj_')
+                new_v = torch.cat([v, ckpt[sub_item_k]], dim=0)
+            elif 'attn.kv.' in new_k:
+                continue
+            elif 'attn.proj.' in new_k:
+                new_k = new_k.replace('proj.', 'attn.out_proj.')
+            elif 'attn.sr.' in new_k:
+                new_k = new_k.replace('sr.', 'sr.')
+            elif 'mlp.' in new_k:
+                string = f'{new_k}-'
+                new_k = new_k.replace('mlp.', 'ffn.layers.')
+                if 'fc1.weight' in new_k or 'fc2.weight' in new_k:
+                    new_v = v.reshape((*v.shape, 1, 1))
+                new_k = new_k.replace('fc1.', '0.')
+                new_k = new_k.replace('dwconv.dwconv.', '1.')
+                new_k = new_k.replace('fc2.', '4.')
+                string += f'{new_k} {v.shape}-{new_v.shape}'
+        # norm layer conversion
+        elif k.startswith('norm'):
+            stage_i = int(k.split('.')[0].replace('norm', ''))
+            new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i-1}.2')
+            new_v = v
+        else:
+            new_k = k
+            new_v = v
+        new_ckpt[new_k] = new_v
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in official pretrained segformer to '
+        'MMSegmentation style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+    if 'state_dict' in checkpoint:
+        state_dict = checkpoint['state_dict']
+    elif 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+    weight = convert_mit(state_dict)
+    mmcv.mkdir_or_exist(osp.dirname(args.dst))
+    torch.save(weight, args.dst)
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/model_converters/swin2mmseg.py
+++ b/segmentation/tools/model_converters/swin2mmseg.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from collections import OrderedDict
+
+import mmcv
+import torch
+from mmcv.runner import CheckpointLoader
+
+
+def convert_swin(ckpt):
+    new_ckpt = OrderedDict()
+
+    def correct_unfold_reduction_order(x):
+        out_channel, in_channel = x.shape
+        x = x.reshape(out_channel, 4, in_channel // 4)
+        x = x[:, [0, 2, 1, 3], :].transpose(1,
+                                            2).reshape(out_channel, in_channel)
+        return x
+
+    def correct_unfold_norm_order(x):
+        in_channel = x.shape[0]
+        x = x.reshape(4, in_channel // 4)
+        x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+        return x
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        elif k.startswith('layers'):
+            new_v = v
+            if 'attn.' in k:
+                new_k = k.replace('attn.', 'attn.w_msa.')
+            elif 'mlp.' in k:
+                if 'mlp.fc1.' in k:
+                    new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
+                elif 'mlp.fc2.' in k:
+                    new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
+                else:
+                    new_k = k.replace('mlp.', 'ffn.')
+            elif 'downsample' in k:
+                new_k = k
+                if 'reduction.' in k:
+                    new_v = correct_unfold_reduction_order(v)
+                elif 'norm.' in k:
+                    new_v = correct_unfold_norm_order(v)
+            else:
+                new_k = k
+            new_k = new_k.replace('layers', 'stages', 1)
+        elif k.startswith('patch_embed'):
+            new_v = v
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        else:
+            new_v = v
+            new_k = k
+
+        new_ckpt[new_k] = new_v
+
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in official pretrained swin models to'
+        'MMSegmentation style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+    if 'state_dict' in checkpoint:
+        state_dict = checkpoint['state_dict']
+    elif 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+    weight = convert_swin(state_dict)
+    mmcv.mkdir_or_exist(osp.dirname(args.dst))
+    torch.save(weight, args.dst)
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/model_converters/vit2mmseg.py
+++ b/segmentation/tools/model_converters/vit2mmseg.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from collections import OrderedDict
+
+import mmcv
+import torch
+from mmcv.runner import CheckpointLoader
+
+
+def convert_vit(ckpt):
+
+    new_ckpt = OrderedDict()
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        if k.startswith('norm'):
+            new_k = k.replace('norm.', 'ln1.')
+        elif k.startswith('patch_embed'):
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        elif k.startswith('blocks'):
+            if 'norm' in k:
+                new_k = k.replace('norm', 'ln')
+            elif 'mlp.fc1' in k:
+                new_k = k.replace('mlp.fc1', 'ffn.layers.0.0')
+            elif 'mlp.fc2' in k:
+                new_k = k.replace('mlp.fc2', 'ffn.layers.1')
+            elif 'attn.qkv' in k:
+                new_k = k.replace('attn.qkv.', 'attn.attn.in_proj_')
+            elif 'attn.proj' in k:
+                new_k = k.replace('attn.proj', 'attn.attn.out_proj')
+            else:
+                new_k = k
+            new_k = new_k.replace('blocks.', 'layers.')
+        else:
+            new_k = k
+        new_ckpt[new_k] = v
+
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in timm pretrained vit models to '
+        'MMSegmentation style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+    if 'state_dict' in checkpoint:
+        # timm checkpoint
+        state_dict = checkpoint['state_dict']
+    elif 'model' in checkpoint:
+        # deit checkpoint
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+    weight = convert_vit(state_dict)
+    mmcv.mkdir_or_exist(osp.dirname(args.dst))
+    torch.save(weight, args.dst)
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/onnx2tensorrt.py
+++ b/segmentation/tools/onnx2tensorrt.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+from typing import Iterable, Optional, Union
+
+import matplotlib.pyplot as plt
+import mmcv
+import numpy as np
+import onnxruntime as ort
+import torch
+from mmcv.ops import get_onnxruntime_op_path
+from mmcv.tensorrt import (TRTWraper, is_tensorrt_plugin_loaded, onnx2trt,
+                           save_trt_engine)
+
+from mmseg.apis.inference import LoadImage
+from mmseg.datasets import DATASETS
+from mmseg.datasets.pipelines import Compose
+
+
+def get_GiB(x: int):
+    """return x GiB."""
+    return x * (1 << 30)
+
+
+def _prepare_input_img(img_path: str,
+                       test_pipeline: Iterable[dict],
+                       shape: Optional[Iterable] = None,
+                       rescale_shape: Optional[Iterable] = None) -> dict:
+    # build the data pipeline
+    if shape is not None:
+        test_pipeline[1]['img_scale'] = (shape[1], shape[0])
+    test_pipeline[1]['transforms'][0]['keep_ratio'] = False
+    test_pipeline = [LoadImage()] + test_pipeline[1:]
+    test_pipeline = Compose(test_pipeline)
+    # prepare data
+    data = dict(img=img_path)
+    data = test_pipeline(data)
+    imgs = data['img']
+    img_metas = [i.data for i in data['img_metas']]
+
+    if rescale_shape is not None:
+        for img_meta in img_metas:
+            img_meta['ori_shape'] = tuple(rescale_shape) + (3, )
+
+    mm_inputs = {'imgs': imgs, 'img_metas': img_metas}
+
+    return mm_inputs
+
+
+def _update_input_img(img_list: Iterable, img_meta_list: Iterable):
+    # update img and its meta list
+    N = img_list[0].size(0)
+    img_meta = img_meta_list[0][0]
+    img_shape = img_meta['img_shape']
+    ori_shape = img_meta['ori_shape']
+    pad_shape = img_meta['pad_shape']
+    new_img_meta_list = [[{
+        'img_shape':
+        img_shape,
+        'ori_shape':
+        ori_shape,
+        'pad_shape':
+        pad_shape,
+        'filename':
+        img_meta['filename'],
+        'scale_factor':
+        (img_shape[1] / ori_shape[1], img_shape[0] / ori_shape[0]) * 2,
+        'flip':
+        False,
+    } for _ in range(N)]]
+
+    return img_list, new_img_meta_list
+
+
+def show_result_pyplot(img: Union[str, np.ndarray],
+                       result: np.ndarray,
+                       palette: Optional[Iterable] = None,
+                       fig_size: Iterable[int] = (15, 10),
+                       opacity: float = 0.5,
+                       title: str = '',
+                       block: bool = True):
+    img = mmcv.imread(img)
+    img = img.copy()
+    seg = result[0]
+    seg = mmcv.imresize(seg, img.shape[:2][::-1])
+    palette = np.array(palette)
+    assert palette.shape[1] == 3
+    assert len(palette.shape) == 2
+    assert 0 < opacity <= 1.0
+    color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+    for label, color in enumerate(palette):
+        color_seg[seg == label, :] = color
+    # convert to BGR
+    color_seg = color_seg[..., ::-1]
+
+    img = img * (1 - opacity) + color_seg * opacity
+    img = img.astype(np.uint8)
+
+    plt.figure(figsize=fig_size)
+    plt.imshow(mmcv.bgr2rgb(img))
+    plt.title(title)
+    plt.tight_layout()
+    plt.show(block=block)
+
+
+def onnx2tensorrt(onnx_file: str,
+                  trt_file: str,
+                  config: dict,
+                  input_config: dict,
+                  fp16: bool = False,
+                  verify: bool = False,
+                  show: bool = False,
+                  dataset: str = 'CityscapesDataset',
+                  workspace_size: int = 1,
+                  verbose: bool = False):
+    """Build a TensorRT engine from an ONNX file and optionally verify it.
+
+    Args:
+        onnx_file: Path of the ONNX model to convert.
+        trt_file: Path the engine is saved to; its directory is created when
+            the path has a directory part.
+        config: Loaded config; ``config.data.test.pipeline`` is read when
+            ``verify`` is set.
+        input_config: Dict with ``min_shape``, ``max_shape`` and
+            ``input_path`` entries.
+        fp16: Passed to ``onnx2trt`` as ``fp16_mode``.
+        verify: Run the ONNX model (ONNXRuntime) and the engine on the image
+            at ``input_config['input_path']`` and compare the outputs.
+        show: With ``verify``, also display both results.
+        dataset: Name looked up in ``DATASETS`` to obtain the palette.
+        workspace_size: Workspace limit in GiB.
+        verbose: Use TensorRT's VERBOSE log level instead of ERROR.
+
+    Raises:
+        AssertionError: If the two outputs differ beyond the tolerances, or
+            the dataset name is unknown when ``show`` is set.
+    """
+    import tensorrt as trt
+    min_shape = input_config['min_shape']
+    max_shape = input_config['max_shape']
+    # create trt engine and wrapper
+    # NOTE(review): min_shape is listed twice; presumably the entries are
+    # (min, opt, max) and min_shape doubles as the optimum - confirm.
+    opt_shape_dict = {'input': [min_shape, min_shape, max_shape]}
+    max_workspace_size = get_GiB(workspace_size)
+    trt_engine = onnx2trt(
+        onnx_file,
+        opt_shape_dict,
+        log_level=trt.Logger.VERBOSE if verbose else trt.Logger.ERROR,
+        fp16_mode=fp16,
+        max_workspace_size=max_workspace_size)
+    save_dir, _ = osp.split(trt_file)
+    if save_dir:
+        os.makedirs(save_dir, exist_ok=True)
+    save_trt_engine(trt_engine, trt_file)
+    print(f'Successfully created TensorRT engine: {trt_file}')
+
+    if verify:
+        # The last two entries of min_shape are passed as the image shape.
+        inputs = _prepare_input_img(
+            input_config['input_path'],
+            config.data.test.pipeline,
+            shape=min_shape[2:])
+
+        imgs = inputs['imgs']
+        img_metas = inputs['img_metas']
+        # Prepend a leading dimension to every image.
+        img_list = [img[None, :] for img in imgs]
+        img_meta_list = [[img_meta] for img_meta in img_metas]
+        # update img_meta
+        img_list, img_meta_list = _update_input_img(img_list, img_meta_list)
+
+        if max_shape[0] > 1:
+            # concate flip image for batch test
+            flip_img_list = [_.flip(-1) for _ in img_list]
+            img_list = [
+                torch.cat((ori_img, flip_img), 0)
+                for ori_img, flip_img in zip(img_list, flip_img_list)
+            ]
+
+        # Get results from ONNXRuntime
+        ort_custom_op_path = get_onnxruntime_op_path()
+        session_options = ort.SessionOptions()
+        # Register the custom-op library only when the file exists.
+        if osp.exists(ort_custom_op_path):
+            session_options.register_custom_ops_library(ort_custom_op_path)
+        sess = ort.InferenceSession(onnx_file, session_options)
+        sess.set_providers(['CPUExecutionProvider'], [{}])  # use cpu mode
+        # Only the first image of img_list is fed; [0][0] keeps the first
+        # element of the first requested output.
+        onnx_output = sess.run(['output'],
+                               {'input': img_list[0].detach().numpy()})[0][0]
+
+        # Get results from TensorRT
+        trt_model = TRTWraper(trt_file, ['input'], ['output'])
+        with torch.no_grad():
+            trt_outputs = trt_model({'input': img_list[0].contiguous().cuda()})
+        trt_output = trt_outputs['output'][0].cpu().detach().numpy()
+
+        if show:
+            # From here on `dataset` is the registered object, not its name.
+            dataset = DATASETS.get(dataset)
+            assert dataset is not None
+            palette = dataset.PALETTE
+
+            # Shown with block=False so the second figure can follow.
+            show_result_pyplot(
+                input_config['input_path'],
+                (onnx_output[0].astype(np.uint8), ),
+                palette=palette,
+                title='ONNXRuntime',
+                block=False)
+            show_result_pyplot(
+                input_config['input_path'], (trt_output[0].astype(np.uint8), ),
+                palette=palette,
+                title='TensorRT')
+
+        # The numerical comparison runs after the optional display.
+        np.testing.assert_allclose(
+            onnx_output, trt_output, rtol=1e-03, atol=1e-05)
+        print('TensorRT and ONNXRuntime output all close.')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert MMSegmentation models from ONNX to TensorRT')
+    parser.add_argument('config', help='Config file of the model')
+    parser.add_argument('model', help='Path to the input ONNX model')
+    parser.add_argument(
+        '--trt-file', type=str, help='Path to the output TensorRT engine')
+    parser.add_argument(
+        '--max-shape',
+        type=int,
+        nargs=4,
+        default=[1, 3, 400, 600],
+        help='Maximum shape of model input.')
+    parser.add_argument(
+        '--min-shape',
+        type=int,
+        nargs=4,
+        default=[1, 3, 400, 600],
+        help='Minimum shape of model input.')
+    parser.add_argument('--fp16', action='store_true', help='Enable fp16 mode')
+    parser.add_argument(
+        '--workspace-size',
+        type=int,
+        default=1,
+        help='Max workspace size in GiB')
+    parser.add_argument(
+        '--input-img', type=str, default='', help='Image for test')
+    parser.add_argument(
+        '--show', action='store_true', help='Whether to show output results')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='CityscapesDataset',
+        help='Dataset name')
+    parser.add_argument(
+        '--verify',
+        action='store_true',
+        help='Verify the outputs of ONNXRuntime and TensorRT')
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Whether to verbose logging messages while creating \
+                TensorRT engine.')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+
+    assert is_tensorrt_plugin_loaded(), 'TensorRT plugin should be compiled.'
+    args = parse_args()
+
+    if not args.input_img:
+        args.input_img = osp.join(osp.dirname(__file__), '../demo/demo.png')
+
+    # check arguments
+    assert osp.exists(args.config), 'Config {} not found.'.format(args.config)
+    assert osp.exists(args.model), \
+        'ONNX model {} not found.'.format(args.model)
+    assert args.workspace_size >= 0, 'Workspace size less than 0.'
+    assert DATASETS.get(args.dataset) is not None, \
+        'Dataset {} does not found.'.format(args.dataset)
+    for max_value, min_value in zip(args.max_shape, args.min_shape):
+        assert max_value >= min_value, \
+            'max_shape should be larger than min shape'
+
+    input_config = {
+        'min_shape': args.min_shape,
+        'max_shape': args.max_shape,
+        'input_path': args.input_img
+    }
+
+    cfg = mmcv.Config.fromfile(args.config)
+    onnx2tensorrt(
+        args.model,
+        args.trt_file,
+        cfg,
+        input_config,
+        fp16=args.fp16,
+        verify=args.verify,
+        show=args.show,
+        dataset=args.dataset,
+        workspace_size=args.workspace_size,
+        verbose=args.verbose)
--- a/segmentation/tools/print_config.py
+++ b/segmentation/tools/print_config.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+from mmcv import Config, DictAction
+
+from mmseg.apis import init_segmentor
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Print the whole config')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument(
+        '--graph', action='store_true', help='print the models graph')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='arguments in dict')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+    print(f'Config:\n{cfg.pretty_text}')
+    # dump config
+    cfg.dump('example.py')
+    # dump models graph
+    if args.graph:
+        model = init_segmentor(args.config, device='cpu')
+        print(f'Model graph:\n{str(model)}')
+        with open('example-graph.txt', 'w') as f:
+            f.writelines(str(model))
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/publish_model.py
+++ b/segmentation/tools/publish_model.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/pytorch2onnx.py
+++ b/segmentation/tools/pytorch2onnx.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from functools import partial
+
+import mmcv
+import numpy as np
+import onnxruntime as rt
+import torch
+import torch._C
+import torch.serialization
+from mmcv import DictAction
+from mmcv.onnx import register_extra_symbolics
+from mmcv.runner import load_checkpoint
+from torch import nn
+
+from mmseg.apis import show_result_pyplot
+from mmseg.apis.inference import LoadImage
+from mmseg.datasets.pipelines import Compose
+from mmseg.models import build_segmentor
+from mmseg.ops import resize
+
+torch.manual_seed(3)
+
+
+def _convert_batchnorm(module):
+    module_output = module
+    if isinstance(module, torch.nn.SyncBatchNorm):
+        module_output = torch.nn.BatchNorm2d(module.num_features, module.eps,
+                                             module.momentum, module.affine,
+                                             module.track_running_stats)
+        if module.affine:
+            module_output.weight.data = module.weight.data.clone().detach()
+            module_output.bias.data = module.bias.data.clone().detach()
+            # keep requires_grad unchanged
+            module_output.weight.requires_grad = module.weight.requires_grad
+            module_output.bias.requires_grad = module.bias.requires_grad
+        module_output.running_mean = module.running_mean
+        module_output.running_var = module.running_var
+        module_output.num_batches_tracked = module.num_batches_tracked
+    for name, child in module.named_children():
+        module_output.add_module(name, _convert_batchnorm(child))
+    del module
+    return module_output
+
+
+def _demo_mm_inputs(input_shape, num_classes):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions
+        num_classes (int):
+            number of semantic classes
+    """
+    (N, C, H, W) = input_shape
+    rng = np.random.RandomState(0)
+    imgs = rng.rand(*input_shape)
+    segs = rng.randint(
+        low=0, high=num_classes - 1, size=(N, 1, H, W)).astype(np.uint8)
+    img_metas = [{
+        'img_shape': (H, W, C),
+        'ori_shape': (H, W, C),
+        'pad_shape': (H, W, C),
+        'filename': '<demo>.png',
+        'scale_factor': 1.0,
+        'flip': False,
+    } for _ in range(N)]
+    mm_inputs = {
+        'imgs': torch.FloatTensor(imgs).requires_grad_(True),
+        'img_metas': img_metas,
+        'gt_semantic_seg': torch.LongTensor(segs)
+    }
+    return mm_inputs
+
+
+def _prepare_input_img(img_path,
+                       test_pipeline,
+                       shape=None,
+                       rescale_shape=None):
+    # build the data pipeline
+    if shape is not None:
+        test_pipeline[1]['img_scale'] = (shape[1], shape[0])
+    test_pipeline[1]['transforms'][0]['keep_ratio'] = False
+    test_pipeline = [LoadImage()] + test_pipeline[1:]
+    test_pipeline = Compose(test_pipeline)
+    # prepare data
+    data = dict(img=img_path)
+    data = test_pipeline(data)
+    imgs = data['img']
+    img_metas = [i.data for i in data['img_metas']]
+
+    if rescale_shape is not None:
+        for img_meta in img_metas:
+            img_meta['ori_shape'] = tuple(rescale_shape) + (3, )
+
+    mm_inputs = {'imgs': imgs, 'img_metas': img_metas}
+
+    return mm_inputs
+
+
+def _update_input_img(img_list, img_meta_list, update_ori_shape=False):
+    # update img and its meta list
+    N, C, H, W = img_list[0].shape
+    img_meta = img_meta_list[0][0]
+    img_shape = (H, W, C)
+    if update_ori_shape:
+        ori_shape = img_shape
+    else:
+        ori_shape = img_meta['ori_shape']
+    pad_shape = img_shape
+    new_img_meta_list = [[{
+        'img_shape':
+        img_shape,
+        'ori_shape':
+        ori_shape,
+        'pad_shape':
+        pad_shape,
+        'filename':
+        img_meta['filename'],
+        'scale_factor':
+        (img_shape[1] / ori_shape[1], img_shape[0] / ori_shape[0]) * 2,
+        'flip':
+        False,
+    } for _ in range(N)]]
+
+    return img_list, new_img_meta_list
+
+
+def pytorch2onnx(model,
+                 mm_inputs,
+                 opset_version=11,
+                 show=False,
+                 output_file='tmp.onnx',
+                 verify=False,
+                 dynamic_export=False):
+    """Export Pytorch model to ONNX model and verify the outputs are same
+    between Pytorch and ONNX.
+
+    The model is moved to the CPU and put in eval mode. Its ``forward`` is
+    temporarily replaced by a partial with fixed ``img_metas`` for the export
+    and restored afterwards.
+
+    Args:
+        model (nn.Module): Pytorch model we want to export.
+        mm_inputs (dict): Contain the input tensors and img_metas information.
+            The ``imgs`` and ``img_metas`` entries are popped from it.
+        opset_version (int): The onnx op version. Default: 11.
+        show (bool): Passed to the exporter as ``verbose``; with ``verify``
+            it also displays the segmentation results. Default: False.
+        output_file (string): The path to where we store the output ONNX model.
+            Default: `tmp.onnx`.
+        verify (bool): Whether compare the outputs between Pytorch and ONNX.
+            Default: False.
+        dynamic_export (bool): Whether to export ONNX with dynamic axis.
+            Default: False.
+
+    Raises:
+        AssertionError: If, during verification, the ONNX graph does not have
+            exactly one non-initializer input or the outputs differ.
+    """
+    model.cpu().eval()
+    test_mode = model.test_cfg.mode
+
+    # With several decode heads the class count is read from the last one.
+    if isinstance(model.decode_head, nn.ModuleList):
+        num_classes = model.decode_head[-1].num_classes
+    else:
+        num_classes = model.decode_head.num_classes
+
+    imgs = mm_inputs.pop('imgs')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Prepend a leading dimension to every image.
+    img_list = [img[None, :] for img in imgs]
+    img_meta_list = [[img_meta] for img_meta in img_metas]
+    # update img_meta
+    img_list, img_meta_list = _update_input_img(img_list, img_meta_list)
+
+    # replace original forward function
+    origin_forward = model.forward
+    model.forward = partial(
+        model.forward,
+        img_metas=img_meta_list,
+        return_loss=False,
+        rescale=True)
+    dynamic_axes = None
+    if dynamic_export:
+        # In 'slide' mode only the batch axis is declared dynamic.
+        if test_mode == 'slide':
+            dynamic_axes = {'input': {0: 'batch'}, 'output': {1: 'batch'}}
+        else:
+            dynamic_axes = {
+                'input': {
+                    0: 'batch',
+                    2: 'height',
+                    3: 'width'
+                },
+                'output': {
+                    1: 'batch',
+                    2: 'height',
+                    3: 'width'
+                }
+            }
+
+    register_extra_symbolics(opset_version)
+    with torch.no_grad():
+        torch.onnx.export(
+            model, (img_list, ),
+            output_file,
+            input_names=['input'],
+            output_names=['output'],
+            export_params=True,
+            keep_initializers_as_inputs=False,
+            verbose=show,
+            opset_version=opset_version,
+            dynamic_axes=dynamic_axes)
+        print(f'Successfully exported ONNX model: {output_file}')
+    # NOTE(review): not inside try/finally, so forward stays patched if the
+    # export raises.
+    model.forward = origin_forward
+
+    if verify:
+        # check by onnx
+        import onnx
+        onnx_model = onnx.load(output_file)
+        onnx.checker.check_model(onnx_model)
+
+        if dynamic_export and test_mode == 'whole':
+            # scale image for dynamic shape test
+            img_list = [resize(_, scale_factor=1.5) for _ in img_list]
+            # concate flip image for batch test
+            flip_img_list = [_.flip(-1) for _ in img_list]
+            img_list = [
+                torch.cat((ori_img, flip_img), 0)
+                for ori_img, flip_img in zip(img_list, flip_img_list)
+            ]
+
+            # update img_meta
+            img_list, img_meta_list = _update_input_img(
+                img_list, img_meta_list, test_mode == 'whole')
+
+        # check the numerical value
+        # get pytorch output
+        with torch.no_grad():
+            pytorch_result = model(img_list, img_meta_list, return_loss=False)
+            pytorch_result = np.stack(pytorch_result, 0)
+
+        # get onnx output
+        # The graph inputs that are not initializers are the real feeds.
+        input_all = [node.name for node in onnx_model.graph.input]
+        input_initializer = [
+            node.name for node in onnx_model.graph.initializer
+        ]
+        net_feed_input = list(set(input_all) - set(input_initializer))
+        assert (len(net_feed_input) == 1)
+        sess = rt.InferenceSession(output_file)
+        # Only the first image of img_list is fed; [0][0] keeps the first
+        # element of the first output.
+        onnx_result = sess.run(
+            None, {net_feed_input[0]: img_list[0].detach().numpy()})[0][0]
+        # show segmentation results
+        if show:
+            import cv2
+            import os.path as osp
+            img = img_meta_list[0][0]['filename']
+            if not osp.exists(img):
+                # No file at that path: rebuild an image from the first three
+                # channels of the first input, scaled by 255.
+                img = imgs[0][:3, ...].permute(1, 2, 0) * 255
+                img = img.detach().numpy().astype(np.uint8)
+                ori_shape = img.shape[:2]
+            else:
+                ori_shape = LoadImage()({'img': img})['ori_shape']
+
+            # resize onnx_result to ori_shape
+            onnx_result_ = cv2.resize(onnx_result[0].astype(np.uint8),
+                                      (ori_shape[1], ori_shape[0]))
+            show_result_pyplot(
+                model,
+                img, (onnx_result_, ),
+                palette=model.PALETTE,
+                block=False,
+                title='ONNXRuntime',
+                opacity=0.5)
+
+            # resize pytorch_result to ori_shape
+            pytorch_result_ = cv2.resize(pytorch_result[0].astype(np.uint8),
+                                         (ori_shape[1], ori_shape[0]))
+            show_result_pyplot(
+                model,
+                img, (pytorch_result_, ),
+                title='PyTorch',
+                palette=model.PALETTE,
+                opacity=0.5)
+        # compare results
+        # Both results are divided by num_classes before the comparison.
+        np.testing.assert_allclose(
+            pytorch_result.astype(np.float32) / num_classes,
+            onnx_result.astype(np.float32) / num_classes,
+            rtol=1e-5,
+            atol=1e-5,
+            err_msg='The outputs are different between Pytorch and ONNX')
+        print('The outputs are same between Pytorch and ONNX')
+
+
+def parse_args():
+    """Build the command-line parser of the ONNX export tool and parse argv.
+
+    Returns:
+        argparse.Namespace: parsed options ``config``, ``checkpoint``,
+            ``input_img``, ``show``, ``verify``, ``output_file``,
+            ``opset_version``, ``shape``, ``rescale_shape``, ``cfg_options``
+            and ``dynamic_export``.
+    """
+    # --shape and --rescale_shape share the same "one or more ints" layout.
+    int_list = dict(type=int, nargs='+', default=None)
+
+    cli = argparse.ArgumentParser(description='Convert MMSeg to ONNX')
+    cli.add_argument('config', help='test config file path')
+    cli.add_argument('--checkpoint', default=None, help='checkpoint file')
+    cli.add_argument(
+        '--input-img', default=None, type=str, help='Images for input')
+    cli.add_argument(
+        '--show',
+        help='show onnx graph and segmentation results',
+        action='store_true')
+    cli.add_argument(
+        '--verify', help='verify the onnx model', action='store_true')
+    cli.add_argument('--output-file', default='tmp.onnx', type=str)
+    cli.add_argument('--opset-version', default=11, type=int)
+    cli.add_argument(
+        '--shape', help='input image height and width.', **int_list)
+    cli.add_argument(
+        '--rescale_shape',
+        help='output image rescale height and width, work for slide mode.',
+        **int_list)
+    cli.add_argument(
+        '--cfg-options',
+        action=DictAction,
+        nargs='+',
+        help='Override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    cli.add_argument(
+        '--dynamic-export',
+        help='Whether to export onnx with dynamic axis.',
+        action='store_true')
+    return cli.parse_args()
+
+
+# Script entry point: load the config, build the segmentor, prepare one input
+# (a real image or random demo data) and hand everything to pytorch2onnx().
+if __name__ == '__main__':
+    args = parse_args()
+
+    cfg = mmcv.Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # never load backbone pretrained weights for export
+    cfg.model.pretrained = None
+
+    # Resolve the (N, C, H, W) input shape: from the config when --shape is
+    # omitted, a square when one value is given, (H, W) when two are given.
+    if args.shape is None:
+        # the two img_scale entries are swapped here; presumably img_scale is
+        # stored as (width, height) -- TODO confirm against the config
+        img_scale = cfg.test_pipeline[1]['img_scale']
+        input_shape = (1, 3, img_scale[1], img_scale[0])
+    elif len(args.shape) == 1:
+        input_shape = (1, 3, args.shape[0], args.shape[0])
+    elif len(args.shape) == 2:
+        input_shape = (
+            1,
+            3,
+        ) + tuple(args.shape)
+    else:
+        raise ValueError('invalid input shape')
+
+    # NOTE(review): test_mode is not referenced again in this script block.
+    test_mode = cfg.model.test_cfg.mode
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    segmentor = build_segmentor(
+        cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
+    # convert SyncBN to BN
+    segmentor = _convert_batchnorm(segmentor)
+
+    if args.checkpoint:
+        checkpoint = load_checkpoint(
+            segmentor, args.checkpoint, map_location='cpu')
+        # class names and palette are taken from the checkpoint meta; a
+        # checkpoint without these keys raises KeyError here
+        segmentor.CLASSES = checkpoint['meta']['CLASSES']
+        segmentor.PALETTE = checkpoint['meta']['PALETTE']
+
+    # read input or create dummy input
+    if args.input_img is not None:
+        preprocess_shape = (input_shape[2], input_shape[3])
+        rescale_shape = None
+        if args.rescale_shape is not None:
+            # only the first two values of --rescale_shape are used
+            rescale_shape = [args.rescale_shape[0], args.rescale_shape[1]]
+        mm_inputs = _prepare_input_img(
+            args.input_img,
+            cfg.data.test.pipeline,
+            shape=preprocess_shape,
+            rescale_shape=rescale_shape)
+    else:
+        # with a list of decode heads, the last one supplies num_classes
+        if isinstance(segmentor.decode_head, nn.ModuleList):
+            num_classes = segmentor.decode_head[-1].num_classes
+        else:
+            num_classes = segmentor.decode_head.num_classes
+        mm_inputs = _demo_mm_inputs(input_shape, num_classes)
+
+    # convert model to onnx file
+    pytorch2onnx(
+        segmentor,
+        mm_inputs,
+        opset_version=args.opset_version,
+        show=args.show,
+        output_file=args.output_file,
+        verify=args.verify,
+        dynamic_export=args.dynamic_export)
--- a/segmentation/tools/pytorch2torchscript.py
+++ b/segmentation/tools/pytorch2torchscript.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmcv
+import numpy as np
+import torch
+import torch._C
+import torch.serialization
+from mmcv.runner import load_checkpoint
+from torch import nn
+
+from mmseg.models import build_segmentor
+
+torch.manual_seed(3)
+
+
+def digit_version(version_str):
+    """Convert a dotted version string into a list of ints for comparison.
+
+    Purely numeric components map to their integer value. A component
+    containing ``rc`` (e.g. ``0rc1``) becomes two entries, ``number - 1``
+    followed by the release-candidate index, so that a release candidate
+    sorts below the final release. Any other non-numeric component is
+    skipped.
+
+    A local version label introduced by ``+`` (e.g. the ``+cu111`` in
+    ``1.8.0+cu111``) is removed before parsing. Without that, the last
+    component would be non-numeric and dropped, making ``1.8.0+cu111``
+    compare lower than ``1.8.0``.
+
+    Args:
+        version_str (str): version string such as ``torch.__version__``.
+
+    Returns:
+        list[int]: comparable version components.
+    """
+    # strip the local build label so '0+cu111' is parsed as '0'
+    public_version = version_str.split('+', 1)[0]
+    parts = []
+    for x in public_version.split('.'):
+        if x.isdigit():
+            parts.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            parts.append(int(patch_version[0]) - 1)
+            parts.append(int(patch_version[1]))
+    return parts
+
+
+def check_torch_version():
+    """Assert that the installed torch is recent enough for this tool.
+
+    Both the installed version and the minimum (1.8.0) are parsed with
+    ``digit_version`` and compared as lists.
+
+    Raises:
+        AssertionError: if the installed version compares lower than the
+            minimum.
+    """
+    torch_minimum_version = '1.8.0'
+    installed = digit_version(torch.__version__)
+    required = digit_version(torch_minimum_version)
+    is_supported = installed >= required
+
+    assert is_supported, \
+        f'Torch=={torch.__version__} is not support for converting to ' \
+        f'torchscript. Please install pytorch>={torch_minimum_version}.'
+
+
+def _convert_batchnorm(module):
+    """Recursively replace ``SyncBatchNorm`` layers with ``BatchNorm2d``.
+
+    A ``SyncBatchNorm`` module is rebuilt as a ``BatchNorm2d`` with the same
+    constructor settings; its affine parameters are cloned (keeping their
+    ``requires_grad`` flags) and its running statistics are reused. Every
+    other module is kept and only has its children converted in place.
+
+    Args:
+        module (nn.Module): root of the module tree to convert.
+
+    Returns:
+        nn.Module: the converted module (``module`` itself unless it was a
+            ``SyncBatchNorm``).
+    """
+    if not isinstance(module, torch.nn.SyncBatchNorm):
+        converted = module
+    else:
+        converted = torch.nn.BatchNorm2d(
+            num_features=module.num_features,
+            eps=module.eps,
+            momentum=module.momentum,
+            affine=module.affine,
+            track_running_stats=module.track_running_stats)
+        if module.affine:
+            for param_name in ('weight', 'bias'):
+                source = getattr(module, param_name)
+                target = getattr(converted, param_name)
+                target.data = source.data.clone().detach()
+                # keep requires_grad unchanged
+                target.requires_grad = source.requires_grad
+        for stat_name in ('running_mean', 'running_var',
+                          'num_batches_tracked'):
+            setattr(converted, stat_name, getattr(module, stat_name))
+
+    for child_name, child in module.named_children():
+        converted.add_module(child_name, _convert_batchnorm(child))
+    return converted
+
+
+def _demo_mm_inputs(input_shape, num_classes):
+    """Create a superset of inputs needed to run test or train batches.
+
+    The data is generated from a fixed seed, so repeated calls with the same
+    arguments return identical values.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions, unpacked as (N, C, H, W)
+        num_classes (int):
+            number of semantic classes
+
+    Returns:
+        dict: ``imgs`` (float tensor of ``input_shape`` with gradients
+            enabled), ``img_metas`` (one meta dict per batch item) and
+            ``gt_semantic_seg`` (long tensor of shape (N, 1, H, W) with
+            labels in ``[0, num_classes - 1]``).
+    """
+    (N, C, H, W) = input_shape
+    rng = np.random.RandomState(0)
+    imgs = rng.rand(*input_shape)
+    # ``high`` is exclusive in randint: passing num_classes covers every
+    # label 0..num_classes-1 and also works when num_classes == 1.
+    segs = rng.randint(
+        low=0, high=num_classes, size=(N, 1, H, W)).astype(np.uint8)
+    img_metas = [{
+        'img_shape': (H, W, C),
+        'ori_shape': (H, W, C),
+        'pad_shape': (H, W, C),
+        'filename': '<demo>.png',
+        'scale_factor': 1.0,
+        'flip': False,
+    } for _ in range(N)]
+    mm_inputs = {
+        'imgs': torch.FloatTensor(imgs).requires_grad_(True),
+        'img_metas': img_metas,
+        'gt_semantic_seg': torch.LongTensor(segs)
+    }
+    return mm_inputs
+
+
+def pytorch2libtorch(model,
+                     input_shape,
+                     show=False,
+                     output_file='tmp.pt',
+                     verify=False):
+    """Trace a Pytorch model with ``torch.jit.trace`` and save it as a
+    TorchScript file.
+
+    The model is traced through its ``forward_dummy`` method on a random demo
+    batch. The model's original ``forward`` is put back once tracing has
+    finished (or failed), so the caller's model is not left patched.
+
+    Args:
+        model (nn.Module): Pytorch model we want to export. It must provide
+            ``decode_head`` and ``forward_dummy``.
+        input_shape (tuple): Use this input shape to construct
+            the corresponding dummy input and execute the model.
+        show (bool): Whether print the computation graph. Default: False.
+        output_file (string): The path to where we store the
+            output TorchScript model. Default: `tmp.pt`.
+        verify (bool): Passed to ``torch.jit.trace`` as ``check_trace``.
+            Default: False.
+    """
+    # with a list of decode heads, the last one supplies num_classes
+    if isinstance(model.decode_head, nn.ModuleList):
+        num_classes = model.decode_head[-1].num_classes
+    else:
+        num_classes = model.decode_head.num_classes
+
+    mm_inputs = _demo_mm_inputs(input_shape, num_classes)
+
+    imgs = mm_inputs.pop('imgs')
+
+    # replace the original forward with forward_dummy for tracing only
+    origin_forward = model.forward
+    model.forward = model.forward_dummy
+    try:
+        model.eval()
+        traced_model = torch.jit.trace(
+            model,
+            example_inputs=imgs,
+            check_trace=verify,
+        )
+    finally:
+        # same clean-up the ONNX exporter performs after its export
+        model.forward = origin_forward
+
+    if show:
+        print(traced_model.graph)
+
+    traced_model.save(output_file)
+    print('Successfully exported TorchScript model: {}'.format(output_file))
+
+
+def parse_args():
+    """Parse the command-line options of the TorchScript export tool.
+
+    Returns:
+        argparse.Namespace: parsed options ``config``, ``checkpoint``,
+            ``show``, ``verify``, ``output_file`` and ``shape``.
+    """
+    cli = argparse.ArgumentParser(description='Convert MMSeg to TorchScript')
+    cli.add_argument('config', help='test config file path')
+    cli.add_argument('--checkpoint', default=None, help='checkpoint file')
+    # the two boolean switches only differ in name and help text
+    for flag, text in (('--show', 'show TorchScript graph'),
+                       ('--verify', 'verify the TorchScript model')):
+        cli.add_argument(flag, action='store_true', help=text)
+    cli.add_argument('--output-file', default='tmp.pt', type=str)
+    cli.add_argument(
+        '--shape',
+        nargs='+',
+        type=int,
+        help='input image size (height, width)',
+        default=[512, 512])
+    return cli.parse_args()
+
+
+# Script entry point: validate the torch version, build the segmentor from
+# the config and export it with pytorch2libtorch().
+if __name__ == '__main__':
+    args = parse_args()
+    check_torch_version()
+
+    # one value -> square input, two values -> taken as given
+    if len(args.shape) == 1:
+        input_shape = (1, 3, args.shape[0], args.shape[0])
+    elif len(args.shape) == 2:
+        input_shape = (
+            1,
+            3,
+        ) + tuple(args.shape)
+    else:
+        raise ValueError('invalid input shape')
+
+    cfg = mmcv.Config.fromfile(args.config)
+    # never load backbone pretrained weights for export
+    cfg.model.pretrained = None
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    segmentor = build_segmentor(
+        cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg'))
+    # convert SyncBN to BN
+    segmentor = _convert_batchnorm(segmentor)
+
+    # the checkpoint is optional; without it the freshly built weights are
+    # exported
+    if args.checkpoint:
+        load_checkpoint(segmentor, args.checkpoint, map_location='cpu')
+
+    # convert the PyTorch model to LibTorch model
+    pytorch2libtorch(
+        segmentor,
+        input_shape,
+        show=args.show,
+        output_file=args.output_file,
+        verify=args.verify)
--- a/segmentation/tools/slurm_test.sh
+++ b/segmentation/tools/slurm_test.sh
+#!/usr/bin/env bash
+# Launch tools/test.py through srun.
+#
+# Positional arguments: PARTITION JOB_NAME CONFIG CHECKPOINT [PY_ARGS...]
+# Environment overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS
+
+# echo every command before it runs
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+CHECKPOINT=$4
+# each of these keeps a value from the environment and otherwise falls back
+# to the default after ":-"
+GPUS=${GPUS:-4}
+GPUS_PER_NODE=${GPUS_PER_NODE:-4}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+# everything from the fifth argument on is forwarded to tools/test.py
+PY_ARGS=${@:5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+# the parent of this script's directory is prepended to PYTHONPATH; the
+# tools/test.py path itself is relative to the current working directory
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
--- a/segmentation/tools/slurm_train.sh
+++ b/segmentation/tools/slurm_train.sh
+#!/usr/bin/env bash
+# Launch tools/train.py through srun.
+#
+# Positional arguments: PARTITION JOB_NAME CONFIG [PY_ARGS...]
+# Environment overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS,
+#                        MASTER_PORT
+
+# echo every command before it runs
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-4}
+CPUS_PER_TASK=${CPUS_PER_TASK:-12}
+SRUN_ARGS=${SRUN_ARGS:-""}
+# everything from the fourth argument on is forwarded to tools/train.py
+PY_ARGS=${@:4}
+
+export NCCL_P2P_DISABLE=1
+# Keep 13579 as the default MASTER_PORT but honour a value already set by the
+# caller, so concurrent jobs can be given different ports.
+export MASTER_PORT=${MASTER_PORT:-13579}
+
+# "$0" is quoted so a script path containing spaces still resolves
+PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    --mem 250G \
+    ${SRUN_ARGS} \
+    python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
--- a/segmentation/tools/test.py
+++ b/segmentation/tools/test.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+import shutil
+import time
+import warnings
+
+import mmcv
+import torch
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+from mmcv.utils import DictAction
+
+from mmseg.apis import multi_gpu_test, single_gpu_test
+from mmseg.datasets import build_dataloader, build_dataset
+from mmseg.models import build_segmentor
+
+import repvit
+from align_resize import AlignResize
+
+
+def parse_args():
+    """Parse the command-line arguments of the test script.
+
+    Besides parsing, this exports ``LOCAL_RANK`` into ``os.environ`` (taken
+    from the local-rank option) when the launcher has not set it already.
+
+    Returns:
+        argparse.Namespace: the parsed arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description='mmseg test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help=('if specified, the evaluation metric results will be dumped '
+              'into the directory as json'))
+    parser.add_argument(
+        '--aug-test', action='store_true', help='Use Flip and Multi scale aug')
+    parser.add_argument('--out', help='output result file in pickle format')
+    parser.add_argument(
+        '--format-only',
+        action='store_true',
+        help='Format the output results without perform evaluation. It is '
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        help='evaluation metrics, which depends on the dataset, e.g., "mIoU"'
+        ' for generic datasets, and "cityscapes" for Cityscapes')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where painted images will be saved')
+    parser.add_argument(
+        '--gpu-collect',
+        action='store_true',
+        help='whether to use gpu to collect results.')
+    parser.add_argument(
+        '--tmpdir',
+        help='tmp directory used for collecting results from multiple '
+        'workers, available when gpu_collect is not specified')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='custom options')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument(
+        '--opacity',
+        type=float,
+        default=0.5,
+        help='Opacity of painted segmentation map. In (0, 1] range.')
+    # torch.distributed.launch passes ``--local_rank`` before PyTorch 2.0 and
+    # ``--local-rank`` from 2.0 on; accept both spellings so the script works
+    # with either launcher version. Both map to ``args.local_rank``.
+    parser.add_argument('--local-rank', '--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    """Run inference with a segmentor and dump / evaluate the results.
+
+    Steps: validate the CLI arguments, load and patch the config, optionally
+    initialise the distributed environment, build the test dataset and
+    dataloader, build the model and load the checkpoint, run single- or
+    multi-GPU testing, then (on rank 0 only) write the pickled results and/or
+    evaluate them.
+
+    Raises:
+        AssertionError: if none of --out/--eval/--format-only/--show/
+            --show-dir is given, or if "cityscapes" is combined with other
+            metrics.
+        ValueError: if --eval and --format-only are both given, or --out is
+            not a .pkl/.pickle path.
+    """
+    args = parse_args()
+
+    assert args.out or args.eval or args.format_only or args.show \
+        or args.show_dir, \
+        ('Please specify at least one operation (save/eval/format/show the '
+         'results / save the results) with the argument "--out", "--eval"'
+         ', "--format-only", "--show" or "--show-dir"')
+
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format_only cannot be both specified')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = mmcv.Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    if args.aug_test:
+        # hard code index
+        # NOTE(review): assumes pipeline[1] is the step that accepts
+        # img_ratios / flip -- confirm against the config in use
+        cfg.data.test.pipeline[1].img_ratios = [
+            0.5, 0.75, 1.0, 1.25, 1.5, 1.75
+        ]
+        cfg.data.test.pipeline[1].flip = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    rank, _ = get_dist_info()
+    # allows not to create
+    # json_file is only defined on rank 0 when --work-dir is given; its single
+    # use further down is guarded by the same condition
+    if args.work_dir is not None and rank == 0:
+        mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
+        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+        json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
+
+    # build the dataloader
+    # TODO: support multiple images per gpu (only minor changes are needed)
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=distributed,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+    # prefer class names / palette stored in the checkpoint meta and fall
+    # back to the dataset's own definitions
+    if 'CLASSES' in checkpoint.get('meta', {}):
+        model.CLASSES = checkpoint['meta']['CLASSES']
+    else:
+        print('"CLASSES" not found in meta, use dataset.CLASSES instead')
+        model.CLASSES = dataset.CLASSES
+    if 'PALETTE' in checkpoint.get('meta', {}):
+        model.PALETTE = checkpoint['meta']['PALETTE']
+    else:
+        print('"PALETTE" not found in meta, use dataset.PALETTE instead')
+        model.PALETTE = dataset.PALETTE
+
+    # clean gpu memory when starting a new evaluation.
+    torch.cuda.empty_cache()
+    eval_kwargs = {} if args.eval_options is None else args.eval_options
+
+    # Deprecated
+    efficient_test = eval_kwargs.get('efficient_test', False)
+    if efficient_test:
+        warnings.warn(
+            '``efficient_test=True`` does not have effect in tools/test.py, '
+            'the evaluation and format results are CPU memory efficient by '
+            'default')
+
+    # the "cityscapes" metric is evaluated on formatted result files, so it
+    # follows the format-only path and cannot be mixed with other metrics
+    eval_on_format_results = (
+        args.eval is not None and 'cityscapes' in args.eval)
+    if eval_on_format_results:
+        assert len(args.eval) == 1, 'eval on format results is not ' \
+                                    'applicable for metrics other than ' \
+                                    'cityscapes'
+    if args.format_only or eval_on_format_results:
+        # directory receiving the formatted results: user supplied via
+        # --eval-options imgfile_prefix=..., otherwise a default folder
+        if 'imgfile_prefix' in eval_kwargs:
+            tmpdir = eval_kwargs['imgfile_prefix']
+        else:
+            tmpdir = '.format_cityscapes'
+            eval_kwargs.setdefault('imgfile_prefix', tmpdir)
+        mmcv.mkdir_or_exist(tmpdir)
+    else:
+        tmpdir = None
+
+    if not distributed:
+        model = MMDataParallel(model, device_ids=[0])
+        # NOTE(review): the positional ``False`` is presumably the deprecated
+        # efficient_test flag -- confirm against single_gpu_test's signature
+        results = single_gpu_test(
+            model,
+            data_loader,
+            args.show,
+            args.show_dir,
+            False,
+            args.opacity,
+            pre_eval=args.eval is not None and not eval_on_format_results,
+            format_only=args.format_only or eval_on_format_results,
+            format_args=eval_kwargs)
+    else:
+        model = MMDistributedDataParallel(
+            model.cuda(),
+            device_ids=[torch.cuda.current_device()],
+            broadcast_buffers=False)
+        # NOTE(review): the positional ``False`` is presumably the deprecated
+        # efficient_test flag -- confirm against multi_gpu_test's signature
+        results = multi_gpu_test(
+            model,
+            data_loader,
+            args.tmpdir,
+            args.gpu_collect,
+            False,
+            pre_eval=args.eval is not None and not eval_on_format_results,
+            format_only=args.format_only or eval_on_format_results,
+            format_args=eval_kwargs)
+
+    # only rank 0 writes outputs and runs the evaluation
+    rank, _ = get_dist_info()
+    if rank == 0:
+        if args.out:
+            warnings.warn(
+                'The behavior of ``args.out`` has been changed since MMSeg '
+                'v0.16, the pickled outputs could be seg map as type of '
+                'np.array, pre-eval results or file paths for '
+                '``dataset.format_results()``.')
+            print(f'\nwriting results to {args.out}')
+            mmcv.dump(results, args.out)
+        if args.eval:
+            eval_kwargs.update(metric=args.eval)
+            metric = dataset.evaluate(results, **eval_kwargs)
+            metric_dict = dict(config=args.config, metric=metric)
+            if args.work_dir is not None and rank == 0:
+                mmcv.dump(metric_dict, json_file, indent=4)
+            if tmpdir is not None and eval_on_format_results:
+                # remove tmp dir when cityscapes evaluation
+                shutil.rmtree(tmpdir)
+
+
+# Run the test pipeline only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/segmentation/tools/torchserve/mmseg2torchserve.py
+++ b/segmentation/tools/torchserve/mmseg2torchserve.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import mmcv
+
+try:
+    from model_archiver.model_packaging import package_model
+    from model_archiver.model_packaging_utils import ModelExportUtils
+except ImportError:
+    package_model = None
+
+
+def mmseg2torchserve(
+    config_file: str,
+    checkpoint_file: str,
+    output_folder: str,
+    model_name: str,
+    model_version: str = '1.0',
+    force: bool = False,
+):
+    """Converts mmsegmentation model (config + checkpoint) to TorchServe
+    `.mar`.
+
+    Args:
+        config_file:
+            In MMSegmentation config format.
+            The contents vary for each task repository.
+        checkpoint_file:
+            In MMSegmentation checkpoint format.
+            The contents vary for each task repository.
+        output_folder:
+            Folder where `{model_name}.mar` will be created.
+            The file created will be in TorchServe archive format.
+        model_name:
+            If not None, used for naming the `{model_name}.mar` file
+            that will be created under `output_folder`.
+            If None, `{Path(checkpoint_file).stem}` will be used.
+        model_version:
+            Model's version.
+        force:
+            If True, if there is an existing `{model_name}.mar`
+            file under `output_folder` it will be overwritten.
+
+    Raises:
+        ImportError: if `torch-model-archiver` is not installed.
+    """
+    # The optional import at the top of the file leaves package_model as None
+    # when torch-model-archiver is missing. Fail here with a clear message
+    # instead of a NameError further down, and before touching the disk.
+    if package_model is None:
+        raise ImportError('`torch-model-archiver` is required. '
+                          'Try: pip install torch-model-archiver')
+
+    mmcv.mkdir_or_exist(output_folder)
+
+    config = mmcv.Config.fromfile(config_file)
+
+    with TemporaryDirectory() as tmpdir:
+        # the archiver is given a dumped copy of the config as "model_file"
+        dumped_config = f'{tmpdir}/config.py'
+        config.dump(dumped_config)
+
+        args = Namespace(
+            model_file=dumped_config,
+            serialized_file=checkpoint_file,
+            handler=f'{Path(__file__).parent}/mmseg_handler.py',
+            model_name=model_name or Path(checkpoint_file).stem,
+            version=model_version,
+            export_path=output_folder,
+            force=force,
+            requirements_file=None,
+            extra_files=None,
+            runtime='python',
+            archive_format='default')
+        manifest = ModelExportUtils.generate_manifest_json(args)
+        package_model(args, manifest)
+
+
+def parse_args():
+    """Parse the command-line arguments of the TorchServe conversion tool.
+
+    Returns:
+        argparse.Namespace: parsed options ``config``, ``checkpoint``,
+            ``output_folder``, ``model_name``, ``model_version`` and
+            ``force``.
+    """
+    parser = ArgumentParser(
+        description='Convert mmseg models to TorchServe `.mar` format.')
+    parser.add_argument('config', type=str, help='config file path')
+    parser.add_argument('checkpoint', type=str, help='checkpoint file path')
+    parser.add_argument(
+        '--output-folder',
+        type=str,
+        required=True,
+        help='Folder where `{model_name}.mar` will be created.')
+    # adjacent string literals are concatenated verbatim, so each fragment
+    # needs its own trailing space
+    parser.add_argument(
+        '--model-name',
+        type=str,
+        default=None,
+        help='If not None, used for naming the `{model_name}.mar` '
+        'file that will be created under `output_folder`. '
+        'If None, `{Path(checkpoint_file).stem}` will be used.')
+    parser.add_argument(
+        '--model-version',
+        type=str,
+        default='1.0',
+        help='Number used for versioning.')
+    parser.add_argument(
+        '-f',
+        '--force',
+        action='store_true',
+        help='overwrite the existing `{model_name}.mar`')
+    args = parser.parse_args()
+
+    return args
+
+
+# Script entry point: parse the CLI and package the model as a `.mar` file.
+if __name__ == '__main__':
+    args = parse_args()
+
+    # package_model is None when the optional torch-model-archiver import at
+    # the top of the file failed
+    if package_model is None:
+        raise ImportError('`torch-model-archiver` is required. '
+                          'Try: pip install torch-model-archiver')
+
+    mmseg2torchserve(args.config, args.checkpoint, args.output_folder,
+                     args.model_name, args.model_version, args.force)
--- a/segmentation/tools/torchserve/mmseg_handler.py
+++ b/segmentation/tools/torchserve/mmseg_handler.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import base64
+import os
+
+import cv2
+import mmcv
+import torch
+from mmcv.cnn.utils.sync_bn import revert_sync_batchnorm
+from ts.torch_handler.base_handler import BaseHandler
+
+from mmseg.apis import inference_segmentor, init_segmentor
+
+
+class MMsegHandler(BaseHandler):
+    """TorchServe handler that serves an MMSegmentation model.
+
+    Requests are decoded into images, segmented one by one with
+    ``inference_segmentor`` and answered with PNG-encoded results.
+    """
+
+    def initialize(self, context):
+        """Load the segmentor described by the served archive.
+
+        Args:
+            context: TorchServe context; its ``system_properties`` and
+                ``manifest`` are read here.
+        """
+        system_props = context.system_properties
+        self.manifest = context.manifest
+
+        # use the GPU assigned by the server when CUDA is available
+        if torch.cuda.is_available():
+            self.map_location = 'cuda'
+            device_name = self.map_location + ':' + str(
+                system_props.get('gpu_id'))
+        else:
+            self.map_location = 'cpu'
+            device_name = self.map_location
+        self.device = torch.device(device_name)
+
+        archive_dir = system_props.get('model_dir')
+        weights_name = self.manifest['model']['serializedFile']
+        weights_path = os.path.join(archive_dir, weights_name)
+        self.config_file = os.path.join(archive_dir, 'config.py')
+
+        segmentor = init_segmentor(self.config_file, weights_path,
+                                   self.device)
+        self.model = revert_sync_batchnorm(segmentor)
+        self.initialized = True
+
+    def preprocess(self, data):
+        """Decode every request row into an image.
+
+        The payload is read from the ``data`` field, falling back to
+        ``body``; string payloads are base64-decoded first.
+        """
+        decoded = []
+        for request in data:
+            payload = request.get('data') or request.get('body')
+            if isinstance(payload, str):
+                payload = base64.b64decode(payload)
+            decoded.append(mmcv.imfrombytes(payload))
+        return decoded
+
+    def inference(self, data, *args, **kwargs):
+        """Run the segmentor on each image and return the list of results."""
+        return [inference_segmentor(self.model, image) for image in data]
+
+    def postprocess(self, data):
+        """Encode the first entry of each result as PNG bytes."""
+        encoded = []
+        for segmentation in data:
+            mask = segmentation[0].astype('uint8')
+            _, png = cv2.imencode('.png', mask)
+            encoded.append(png.tobytes())
+        return encoded
--- a/segmentation/tools/torchserve/test_torchserve.py
+++ b/segmentation/tools/torchserve/test_torchserve.py
+from argparse import ArgumentParser
+from io import BytesIO
+
+import matplotlib.pyplot as plt
+import mmcv
+import requests
+
+from mmseg.apis import inference_segmentor, init_segmentor
+
+
+def parse_args():
+    """Parse the command-line arguments of the TorchServe comparison script.
+
+    Returns:
+        argparse.Namespace: parsed options ``img``, ``config``,
+            ``checkpoint``, ``model_name``, ``inference_addr``,
+            ``result_image`` and ``device``.
+    """
+    # the two description fragments are concatenated verbatim, so the first
+    # one carries the separating space
+    parser = ArgumentParser(
+        description='Compare result of torchserve and pytorch, '
+        'and visualize them.')
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument('model_name', help='The model name in the server')
+    parser.add_argument(
+        '--inference-addr',
+        default='127.0.0.1:8080',
+        help='Address and port of the inference server')
+    parser.add_argument(
+        '--result-image',
+        type=str,
+        default=None,
+        help='save server output in result-image')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    """Show the server's prediction next to a local PyTorch prediction.
+
+    The image is posted to the TorchServe prediction endpoint and the
+    returned bytes are displayed (and optionally saved). The same image is
+    then segmented locally with ``inference_segmentor`` and displayed too.
+
+    Args:
+        args (argparse.Namespace): options produced by ``parse_args``.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    url = 'http://' + args.inference_addr + '/predictions/' + args.model_name
+    with open(args.img, 'rb') as image:
+        tmp_res = requests.post(url, image)
+    # surface a failed request directly instead of handing an error body to
+    # the image decoder below
+    tmp_res.raise_for_status()
+    content = tmp_res.content
+    if args.result_image:
+        with open(args.result_image, 'wb') as out_image:
+            out_image.write(content)
+        plt.imshow(mmcv.imread(args.result_image, 'grayscale'))
+        plt.show()
+    else:
+        plt.imshow(plt.imread(BytesIO(content)))
+        plt.show()
+    # local reference prediction with the same config and checkpoint
+    model = init_segmentor(args.config, args.checkpoint, args.device)
+    image = mmcv.imread(args.img)
+    result = inference_segmentor(model, image)
+    plt.imshow(result[0])
+    plt.show()
+
+
+# Parse the CLI and run the comparison only when executed as a script.
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
--- a/segmentation/tools/train.py
+++ b/segmentation/tools/train.py
+import argparse
+import copy
+import os
+import os.path as osp
+import time
+import warnings
+
+import mmcv
+import torch
+from mmcv.cnn.utils import revert_sync_batchnorm
+from mmcv.runner import get_dist_info, init_dist
+from mmcv.utils import Config, DictAction, get_git_hash
+
+from mmseg import __version__
+from mmseg.apis import set_random_seed, train_segmentor
+from mmseg.datasets import build_dataset
+from mmseg.models import build_segmentor
+from mmseg.utils import collect_env, get_root_logger, get_device
+
+
+import repvit
+from align_resize import AlignResize
+
+
+def parse_args():
+    """Parse the command-line arguments of the training script.
+
+    Besides parsing, this exports ``LOCAL_RANK`` into ``os.environ`` (taken
+    from the local-rank option) when the launcher has not set it already.
+
+    Returns:
+        argparse.Namespace: the parsed arguments.
+    """
+    parser = argparse.ArgumentParser(description='Train a segmentor')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--load-from', help='the checkpoint file to load weights from')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    parser.add_argument(
+        '--no-validate',
+        action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    # --gpus and --gpu-ids cannot be given together
+    group_gpus = parser.add_mutually_exclusive_group()
+    group_gpus.add_argument(
+        '--gpus',
+        type=int,
+        help='number of gpus to use '
+             '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='ids of gpus to use '
+             '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='custom options')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    # torch.distributed.launch passes ``--local_rank`` before PyTorch 2.0 and
+    # ``--local-rank`` from 2.0 on; accept both spellings so the script works
+    # with either launcher version. Both map to ``args.local_rank``.
+    parser.add_argument('--local-rank', '--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+    if args.load_from is not None:
+        cfg.load_from = args.load_from
+    if args.resume_from is not None:
+        cfg.resume_from = args.resume_from
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids
+    else:
+        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # gpu_ids is used to calculate iter when resuming checkpoint
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+    # init the meta dict to record some important information such as
+    # environment info and seed, which will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    cfg.device = get_device()
+    # set random seeds
+    if args.seed is not None:
+        logger.info(f'Set random seed to {args.seed}, deterministic: '
+                    f'{args.deterministic}')
+        set_random_seed(args.seed, deterministic=args.deterministic)
+    cfg.seed = args.seed
+    meta['seed'] = args.seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_segmentor(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+
+    # SyncBN is not support for DP
+    if not distributed:
+        warnings.warn(
+            'SyncBN is only supported with DDP. To be compatible with DP, '
+            'we convert SyncBN to BN. Please use dist_train.sh which can '
+            'avoid this error.')
+        model = revert_sync_batchnorm(model)
+
+    logger.info(model)
+
+    datasets = [build_dataset(cfg.data.train)]
+    if len(cfg.workflow) == 2:
+        val_dataset = copy.deepcopy(cfg.data.val)
+        val_dataset.pipeline = cfg.data.train.pipeline
+        datasets.append(build_dataset(val_dataset))
+    if cfg.checkpoint_config is not None:
+        # save mmseg version, config file content and class names in
+        # checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmseg_version=f'{__version__}+{get_git_hash()[:7]}',
+            config=cfg.pretty_text,
+            CLASSES=datasets[0].CLASSES,
+            PALETTE=datasets[0].PALETTE)
+    # add an attribute for visualization convenience
+    model.CLASSES = datasets[0].CLASSES
+    # passing checkpoint meta for saving best checkpoint
+    meta.update(cfg.checkpoint_config.meta)
+    train_segmentor(
+        model,
+        datasets,
+        cfg,
+        distributed=distributed,
+        validate=(not args.no_validate),
+        timestamp=timestamp,
+        meta=meta)
+
+
+if __name__ == '__main__':
+    main()
--- a/segmentation/train.sh
+++ b/segmentation/train.sh
+./tools/dist_train.sh configs/sem_fpn/fpn_repvit_m1_1_ade20k_40k.py 8  # args: <config file> <processes per node (GPUS)>
--- a/speed_gpu.py
+++ b/speed_gpu.py
+import torch
+import time
+from timm import create_model
+import model
+import utils
+torch.autograd.set_grad_enabled(False)  # forward passes only: turn gradient tracking off for the whole script
+
+T0 = 5  # minimum wall-clock seconds spent in the warm-up loop of throughput()
+T1 = 10  # seconds of measured forward time to accumulate in throughput()
+
+def throughput(name, model, device, batch_size, resolution=224):
+    inputs = torch.randn(batch_size, 3, resolution, resolution, device=device)
+    torch.cuda.empty_cache()
+    torch.cuda.synchronize()
+    start = time.time()
+    while time.time() - start < T0:
+        model(inputs)
+    timing = []
+    torch.cuda.synchronize()
+    while sum(timing) < T1:
+        start = time.time()
+        model(inputs)
+        torch.cuda.synchronize()
+        timing.append(time.time() - start)
+    timing = torch.as_tensor(timing, dtype=torch.float32)
+    print(name, device, batch_size / timing.mean().item(),
+          'images/s @ batch size', batch_size)
+
+device = "cuda:0"
+
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+
+parser.add_argument('--model', default='repvit_m0_9', type=str)
+parser.add_argument('--resolution', default=224, type=int)
+parser.add_argument('--batch-size', default=2048, type=int)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    model_name = args.model
+    batch_size = args.batch_size
+    resolution = args.resolution
+    torch.cuda.empty_cache()
+    inputs = torch.randn(batch_size, 3, resolution,
+                            resolution, device=device)
+    model = create_model(model_name, num_classes=1000)
+    utils.replace_batchnorm(model)
+    model.to(device)
+    model.eval()
+    throughput(model_name, model, device, batch_size, resolution=resolution)
--- a/train.sh
+++ b/train.sh
+python -m torch.distributed.launch --nproc_per_node=4 --master_port 12312 --use_env main.py --model repvit_m0_9 --data-path ./cifar100 --data-set CIFAR --epochs 300 --dist-eval --distillation-type none  # 4 processes, CIFAR data set, distillation disabled
+# python -m torch.distributed.launch --nproc_per_node=4 --master_port 12346 --use_env main.py --model repvit_m0_9 --data-path ./imagenet --teacher-path regnety_160-a5fe301d.pth --epochs 1 --dist-eval
+# NCCL_P2P_DISABLE=1 python -m torch.distributed.launch --nproc_per_node=8 --master_port 12346 --use_env main.py --model repvit_m0_9 --data-path ~/imagenet --dist-eval