Commit 26b83c4a authored by dengjb

update codes

parent 2f6baaee
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import mmcv
import numpy as np
from mmengine.utils import scandir
try:
import imageio
except ImportError:
imageio = None
# TODO verify after refactoring analyze_results.py
def parse_args():
parser = argparse.ArgumentParser(description='Create GIF for demo')
parser.add_argument(
'image_dir',
        help='directory of result images generated by analyze_results.py')
parser.add_argument(
'--out',
type=str,
default='result.gif',
        help='path where the output GIF will be saved')
args = parser.parse_args()
return args
def _generate_batch_data(sampler, batch_size):
batch = []
for idx in sampler:
batch.append(idx)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
def create_gif(frames, gif_name, duration=2):
"""Create gif through imageio.
Args:
frames (list[ndarray]): Image frames
gif_name (str): Saved gif name
duration (int): Display interval (s),
Default: 2
"""
if imageio is None:
        raise RuntimeError('imageio is not installed, '
                           'please use "pip install imageio" to install it.')
imageio.mimsave(gif_name, frames, 'GIF', duration=duration)
def create_frame_by_matplotlib(image_dir,
nrows=1,
fig_size=(300, 300),
font_size=15):
"""Create gif frame image through matplotlib.
Args:
image_dir (str): Root directory of result images
nrows (int): Number of rows displayed, Default: 1
fig_size (tuple): Figure size of the pyplot figure.
Default: (300, 300)
font_size (int): Font size of texts. Default: 15
Returns:
list[ndarray]: image frames
"""
result_dir_names = os.listdir(image_dir)
assert len(result_dir_names) == 2
# Longer length has higher priority
result_dir_names.reverse()
images_list = []
for dir_names in result_dir_names:
images_list.append(scandir(osp.join(image_dir, dir_names)))
frames = []
for paths in _generate_batch_data(zip(*images_list), nrows):
fig, axes = plt.subplots(nrows=nrows, ncols=2)
fig.suptitle('Good/bad case selected according '
'to the COCO mAP of the single image')
det_patch = mpatches.Patch(color='salmon', label='prediction')
gt_patch = mpatches.Patch(color='royalblue', label='ground truth')
# bbox_to_anchor may need to be finetuned
plt.legend(
handles=[det_patch, gt_patch],
bbox_to_anchor=(1, -0.18),
loc='lower right',
borderaxespad=0.)
if nrows == 1:
axes = [axes]
dpi = fig.get_dpi()
# set fig size and margin
fig.set_size_inches(
(fig_size[0] * 2 + fig_size[0] // 20) / dpi,
(fig_size[1] * nrows + fig_size[1] // 3) / dpi,
)
fig.tight_layout()
# set subplot margin
plt.subplots_adjust(
hspace=.05,
wspace=0.05,
left=0.02,
right=0.98,
bottom=0.02,
top=0.98)
for i, (path_tuple, ax_tuple) in enumerate(zip(paths, axes)):
image_path_left = osp.join(
osp.join(image_dir, result_dir_names[0], path_tuple[0]))
image_path_right = osp.join(
osp.join(image_dir, result_dir_names[1], path_tuple[1]))
image_left = mmcv.imread(image_path_left)
image_left = mmcv.rgb2bgr(image_left)
image_right = mmcv.imread(image_path_right)
image_right = mmcv.rgb2bgr(image_right)
if i == 0:
ax_tuple[0].set_title(
result_dir_names[0], fontdict={'size': font_size})
ax_tuple[1].set_title(
result_dir_names[1], fontdict={'size': font_size})
ax_tuple[0].imshow(
image_left, extent=(0, *fig_size, 0), interpolation='bilinear')
ax_tuple[0].axis('off')
ax_tuple[1].imshow(
image_right,
extent=(0, *fig_size, 0),
interpolation='bilinear')
ax_tuple[1].axis('off')
canvas = fig.canvas
s, (width, height) = canvas.print_to_buffer()
buffer = np.frombuffer(s, dtype='uint8')
img_rgba = buffer.reshape(height, width, 4)
rgb, alpha = np.split(img_rgba, [3], axis=2)
img = rgb.astype('uint8')
frames.append(img)
return frames
def main():
args = parse_args()
frames = create_frame_by_matplotlib(args.image_dir)
create_gif(frames, args.out)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
"""Support for multi-model fusion, and currently only the Weighted Box Fusion
(WBF) fusion method is supported.
References: https://github.com/ZFTurbo/Weighted-Boxes-Fusion
Example:
python demo/demo_multi_model.py demo/demo.jpg \
./configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py \
./configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py \
--checkpoints \
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth \ # noqa
https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth \
--weights 1 2
"""
import argparse
import os.path as osp
import mmcv
import mmengine
from mmengine.fileio import isdir, join_path, list_dir_or_file
from mmengine.logging import print_log
from mmengine.structures import InstanceData
from mmdet.apis import DetInferencer
from mmdet.models.utils import weighted_boxes_fusion
from mmdet.registry import VISUALIZERS
from mmdet.structures import DetDataSample
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
'.tiff', '.webp')
def parse_args():
parser = argparse.ArgumentParser(
description='MMDetection multi-model inference demo')
parser.add_argument(
'inputs', type=str, help='Input image file or folder path.')
parser.add_argument(
'config',
type=str,
nargs='*',
        help='Config file(s); multiple files are supported')
parser.add_argument(
'--checkpoints',
type=str,
nargs='*',
        help='Checkpoint file(s); multiple files are supported and '
        'must correspond one-to-one with the configs above',
)
parser.add_argument(
'--weights',
type=float,
nargs='*',
default=None,
        help='Weights for each model; must correspond '
        'one-to-one with the configs above')
parser.add_argument(
'--fusion-iou-thr',
type=float,
default=0.55,
help='IoU value for boxes to be a match in wbf')
parser.add_argument(
'--skip-box-thr',
type=float,
default=0.0,
        help='exclude boxes with a score lower than this threshold in WBF')
parser.add_argument(
'--conf-type',
type=str,
default='avg', # avg, max, box_and_model_avg, absent_model_aware_avg
help='how to calculate confidence in weighted boxes in wbf')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of images or prediction results.')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--batch-size', type=int, default=1, help='Inference batch size.')
parser.add_argument(
'--show',
action='store_true',
help='Display the image in a popup window.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection vis results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection json results')
parser.add_argument(
'--palette',
default='none',
choices=['coco', 'voc', 'citys', 'random', 'none'],
help='Color palette used for visualization')
args = parser.parse_args()
if args.no_save_vis and args.no_save_pred:
args.out_dir = ''
return args
def main():
args = parse_args()
results = []
cfg_visualizer = None
dataset_meta = None
inputs = []
filename_list = []
if isdir(args.inputs):
dir = list_dir_or_file(
args.inputs, list_dir=False, suffix=IMG_EXTENSIONS)
for filename in dir:
img = mmcv.imread(join_path(args.inputs, filename))
inputs.append(img)
filename_list.append(filename)
else:
img = mmcv.imread(args.inputs)
inputs.append(img)
img_name = osp.basename(args.inputs)
filename_list.append(img_name)
for i, (config,
checkpoint) in enumerate(zip(args.config, args.checkpoints)):
inferencer = DetInferencer(
config, checkpoint, device=args.device, palette=args.palette)
result_raw = inferencer(
inputs=inputs,
batch_size=args.batch_size,
no_save_vis=True,
pred_score_thr=args.pred_score_thr)
if i == 0:
cfg_visualizer = inferencer.cfg.visualizer
dataset_meta = inferencer.model.dataset_meta
results = [{
'bboxes_list': [],
'scores_list': [],
'labels_list': []
} for _ in range(len(result_raw['predictions']))]
for res, raw in zip(results, result_raw['predictions']):
res['bboxes_list'].append(raw['bboxes'])
res['scores_list'].append(raw['scores'])
res['labels_list'].append(raw['labels'])
visualizer = VISUALIZERS.build(cfg_visualizer)
visualizer.dataset_meta = dataset_meta
for i in range(len(results)):
bboxes, scores, labels = weighted_boxes_fusion(
results[i]['bboxes_list'],
results[i]['scores_list'],
results[i]['labels_list'],
weights=args.weights,
iou_thr=args.fusion_iou_thr,
skip_box_thr=args.skip_box_thr,
conf_type=args.conf_type)
pred_instances = InstanceData()
pred_instances.bboxes = bboxes
pred_instances.scores = scores
pred_instances.labels = labels
fusion_result = DetDataSample(pred_instances=pred_instances)
img_name = filename_list[i]
if not args.no_save_pred:
out_json_path = (
args.out_dir + '/preds/' + img_name.split('.')[0] + '.json')
mmengine.dump(
{
'labels': labels.tolist(),
'scores': scores.tolist(),
'bboxes': bboxes.tolist()
}, out_json_path)
out_file = osp.join(args.out_dir, 'vis',
img_name) if not args.no_save_vis else None
visualizer.add_datasample(
img_name,
inputs[i][..., ::-1],
data_sample=fusion_result,
show=args.show,
draw_gt=False,
wait_time=0,
pred_score_thr=args.pred_score_thr,
out_file=out_file)
if not args.no_save_vis:
print_log(f'results have been saved at {args.out_dir}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
"""Image Demo.
This script adopts a new inference class, currently supports image path,
np.array and folder input formats, and will support video and webcam
in the future.
Example:
Save visualizations and predictions results::
python demo/image_demo.py demo/demo.jpg rtmdet-s
python demo/image_demo.py demo/demo.jpg \
configs/rtmdet/rtmdet_s_8xb32-300e_coco.py \
--weights rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 --texts bench
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 --texts 'bench . car .'
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365
--texts 'bench . car .' -c
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 \
--texts 'There are a lot of cars here.'
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 \
--texts '$: coco'
python demo/image_demo.py demo/demo.jpg \
glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 \
--texts '$: lvis' --pred-score-thr 0.7 \
--palette random --chunked-size 80
python demo/image_demo.py demo/demo.jpg \
grounding_dino_swin-t_pretrain_obj365_goldg_cap4m \
--texts '$: lvis' --pred-score-thr 0.4 \
--palette random --chunked-size 80
python demo/image_demo.py demo/demo.jpg \
grounding_dino_swin-t_pretrain_obj365_goldg_cap4m \
--texts "a red car in the upper right corner" \
--tokens-positive -1
Visualize prediction results::
python demo/image_demo.py demo/demo.jpg rtmdet-ins-s --show
python demo/image_demo.py demo/demo.jpg rtmdet-ins_s_8xb32-300e_coco \
--show
"""
import ast
from argparse import ArgumentParser
from mmengine.logging import print_log
from mmdet.apis import DetInferencer
from mmdet.evaluation import get_classes
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'inputs', type=str, help='Input image file or folder path.')
parser.add_argument(
'model',
type=str,
        help='Config or checkpoint .pth file, or the model name '
        'and alias defined in metafile. If the parameter is '
        'a .pth weights file, the model configuration will be '
        'read from it.')
parser.add_argument('--weights', default=None, help='Checkpoint file')
parser.add_argument(
'--out-dir',
type=str,
default='outputs',
help='Output directory of images or prediction results.')
# Once you input a format similar to $: xxx, it indicates that
# the prompt is based on the dataset class name.
# support $: coco, $: voc, $: cityscapes, $: lvis, $: imagenet_det.
# detail to `mmdet/evaluation/functional/class_names.py`
parser.add_argument(
'--texts', help='text prompt, such as "bench . car .", "$: coco"')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--pred-score-thr',
type=float,
default=0.3,
help='bbox score threshold')
parser.add_argument(
'--batch-size', type=int, default=1, help='Inference batch size.')
parser.add_argument(
'--show',
action='store_true',
help='Display the image in a popup window.')
parser.add_argument(
'--no-save-vis',
action='store_true',
help='Do not save detection vis results')
parser.add_argument(
'--no-save-pred',
action='store_true',
help='Do not save detection json results')
parser.add_argument(
'--print-result',
action='store_true',
help='Whether to print the results.')
parser.add_argument(
'--palette',
default='none',
choices=['coco', 'voc', 'citys', 'random', 'none'],
help='Color palette used for visualization')
# only for GLIP and Grounding DINO
parser.add_argument(
'--custom-entities',
'-c',
action='store_true',
        help='Whether to customize entity names. '
        'If enabled, the input text should be in the '
        '"cls_name1 . cls_name2 . cls_name3 ." format')
parser.add_argument(
'--chunked-size',
'-s',
type=int,
default=-1,
help='If the number of categories is very large, '
'you can specify this parameter to truncate multiple predictions.')
# only for Grounding DINO
parser.add_argument(
'--tokens-positive',
'-p',
type=str,
help='Used to specify which locations in the input text are of '
'interest to the user. -1 indicates that no area is of interest, '
'None indicates ignoring this parameter. '
'The two-dimensional array represents the start and end positions.')
call_args = vars(parser.parse_args())
if call_args['no_save_vis'] and call_args['no_save_pred']:
call_args['out_dir'] = ''
if call_args['model'].endswith('.pth'):
        print_log('The model is a weights file; automatically '
                  'assigning it to --weights')
call_args['weights'] = call_args['model']
call_args['model'] = None
if call_args['texts'] is not None:
if call_args['texts'].startswith('$:'):
dataset_name = call_args['texts'][3:].strip()
class_names = get_classes(dataset_name)
call_args['texts'] = [tuple(class_names)]
if call_args['tokens_positive'] is not None:
call_args['tokens_positive'] = ast.literal_eval(
call_args['tokens_positive'])
init_kws = ['model', 'weights', 'device', 'palette']
init_args = {}
for init_kw in init_kws:
init_args[init_kw] = call_args.pop(init_kw)
return init_args, call_args
def main():
init_args, call_args = parse_args()
# TODO: Video and Webcam are currently not supported and
# may consume too much memory if your input folder has a lot of images.
    # This will be optimized later.
inferencer = DetInferencer(**init_args)
chunked_size = call_args.pop('chunked_size')
inferencer.model.test_cfg.chunked_size = chunked_size
inferencer(**call_args)
if call_args['out_dir'] != '' and not (call_args['no_save_vis']
and call_args['no_save_pred']):
print_log(f'results have been saved at {call_args["out_dir"]}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
"""Perform MMDET inference on large images (as satellite imagery) as:
```shell
wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth # noqa: E501, E261.
python demo/large_image_demo.py \
demo/large_image.jpg \
configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py \
checkpoint/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth
```
"""
import os
import random
from argparse import ArgumentParser
from pathlib import Path
import mmcv
import numpy as np
from mmengine.config import Config, ConfigDict
from mmengine.logging import print_log
from mmengine.utils import ProgressBar
from mmdet.apis import inference_detector, init_detector
try:
from sahi.slicing import slice_image
except ImportError:
raise ImportError('Please run "pip install -U sahi" '
'to install sahi first for large image inference.')
from mmdet.registry import VISUALIZERS
from mmdet.utils.large_image import merge_results_by_nms, shift_predictions
from mmdet.utils.misc import get_file_list
def parse_args():
parser = ArgumentParser(
description='Perform MMDET inference on large images.')
parser.add_argument(
'img', help='Image path, include image file, dir and URL.')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--out-dir', default='./output', help='Path to output file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--show', action='store_true', help='Show the detection results')
parser.add_argument(
'--tta',
action='store_true',
help='Whether to use test time augmentation')
parser.add_argument(
'--score-thr', type=float, default=0.3, help='Bbox score threshold')
parser.add_argument(
'--patch-size', type=int, default=640, help='The size of patches')
parser.add_argument(
'--patch-overlap-ratio',
type=float,
default=0.25,
help='Ratio of overlap between two patches')
parser.add_argument(
'--merge-iou-thr',
type=float,
default=0.25,
        help='IoU threshold for merging results')
parser.add_argument(
'--merge-nms-type',
type=str,
default='nms',
help='NMS type for merging results')
parser.add_argument(
'--batch-size',
type=int,
default=1,
        help='Batch size, must be greater than or equal to 1')
parser.add_argument(
'--debug',
action='store_true',
help='Export debug results before merging')
parser.add_argument(
'--save-patch',
action='store_true',
help='Save the results of each patch. '
'The `--debug` must be enabled.')
args = parser.parse_args()
return args
def main():
args = parse_args()
config = args.config
if isinstance(config, (str, Path)):
config = Config.fromfile(config)
elif not isinstance(config, Config):
raise TypeError('config must be a filename or Config object, '
f'but got {type(config)}')
if 'init_cfg' in config.model.backbone:
config.model.backbone.init_cfg = None
if args.tta:
assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' \
" Can't use tta !"
assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \
"in config. Can't use tta !"
config.model = ConfigDict(**config.tta_model, module=config.model)
test_data_cfg = config.test_dataloader.dataset
while 'dataset' in test_data_cfg:
test_data_cfg = test_data_cfg['dataset']
test_data_cfg.pipeline = config.tta_pipeline
# TODO: TTA mode will error if cfg_options is not set.
# This is an mmdet issue and needs to be fixed later.
# build the model from a config file and a checkpoint file
model = init_detector(
config, args.checkpoint, device=args.device, cfg_options={})
if not os.path.exists(args.out_dir) and not args.show:
os.mkdir(args.out_dir)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
# get file list
files, source_type = get_file_list(args.img)
# start detector inference
print(f'Performing inference on {len(files)} images.... '
'This may take a while.')
progress_bar = ProgressBar(len(files))
for file in files:
# read image
img = mmcv.imread(file)
# arrange slices
height, width = img.shape[:2]
sliced_image_object = slice_image(
img,
slice_height=args.patch_size,
slice_width=args.patch_size,
auto_slice_resolution=False,
overlap_height_ratio=args.patch_overlap_ratio,
overlap_width_ratio=args.patch_overlap_ratio,
)
# perform sliced inference
slice_results = []
start = 0
while True:
# prepare batch slices
end = min(start + args.batch_size, len(sliced_image_object))
images = []
for sliced_image in sliced_image_object.images[start:end]:
images.append(sliced_image)
# forward the model
slice_results.extend(inference_detector(model, images))
if end >= len(sliced_image_object):
break
start += args.batch_size
if source_type['is_dir']:
filename = os.path.relpath(file, args.img).replace('/', '_')
else:
filename = os.path.basename(file)
img = mmcv.imconvert(img, 'bgr', 'rgb')
out_file = None if args.show else os.path.join(args.out_dir, filename)
# export debug images
if args.debug:
# export sliced image results
name, suffix = os.path.splitext(filename)
shifted_instances = shift_predictions(
slice_results,
sliced_image_object.starting_pixels,
src_image_shape=(height, width))
merged_result = slice_results[0].clone()
merged_result.pred_instances = shifted_instances
debug_file_name = name + '_debug' + suffix
debug_out_file = None if args.show else os.path.join(
args.out_dir, debug_file_name)
visualizer.set_image(img.copy())
debug_grids = []
for starting_point in sliced_image_object.starting_pixels:
start_point_x = starting_point[0]
start_point_y = starting_point[1]
end_point_x = start_point_x + args.patch_size
end_point_y = start_point_y + args.patch_size
debug_grids.append(
[start_point_x, start_point_y, end_point_x, end_point_y])
debug_grids = np.array(debug_grids)
debug_grids[:, 0::2] = np.clip(debug_grids[:, 0::2], 1,
img.shape[1] - 1)
debug_grids[:, 1::2] = np.clip(debug_grids[:, 1::2], 1,
img.shape[0] - 1)
palette = np.random.randint(0, 256, size=(len(debug_grids), 3))
palette = [tuple(c) for c in palette]
line_styles = random.choices(['-', '-.', ':'], k=len(debug_grids))
visualizer.draw_bboxes(
debug_grids,
edge_colors=palette,
alpha=1,
line_styles=line_styles)
visualizer.draw_bboxes(
debug_grids, face_colors=palette, alpha=0.15)
visualizer.draw_texts(
list(range(len(debug_grids))),
debug_grids[:, :2] + 5,
colors='w')
visualizer.add_datasample(
debug_file_name,
visualizer.get_image(),
data_sample=merged_result,
draw_gt=False,
show=args.show,
wait_time=0,
out_file=debug_out_file,
pred_score_thr=args.score_thr,
)
if args.save_patch:
debug_patch_out_dir = os.path.join(args.out_dir,
f'{name}_patch')
for i, slice_result in enumerate(slice_results):
patch_out_file = os.path.join(
debug_patch_out_dir,
f'{filename}_slice_{i}_result.jpg')
image = mmcv.imconvert(sliced_image_object.images[i],
'bgr', 'rgb')
visualizer.add_datasample(
'patch_result',
image,
data_sample=slice_result,
draw_gt=False,
show=False,
wait_time=0,
out_file=patch_out_file,
pred_score_thr=args.score_thr,
)
image_result = merge_results_by_nms(
slice_results,
sliced_image_object.starting_pixels,
src_image_shape=(height, width),
nms_cfg={
'type': args.merge_nms_type,
'iou_threshold': args.merge_iou_thr
})
visualizer.add_datasample(
filename,
img,
data_sample=image_result,
draw_gt=False,
show=args.show,
wait_time=0,
out_file=out_file,
pred_score_thr=args.score_thr,
)
progress_bar.update()
if not args.show or (args.debug and args.save_patch):
print_log(
f'\nResults have been saved at {os.path.abspath(args.out_dir)}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import tempfile
from argparse import ArgumentParser
import mmcv
import mmengine
from mmengine.registry import init_default_scope
from mmdet.apis import inference_mot, init_track_model
from mmdet.registry import VISUALIZERS
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png')
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'inputs', type=str, help='Input image file or folder path.')
parser.add_argument('config', help='config file')
parser.add_argument('--checkpoint', help='checkpoint file')
parser.add_argument('--detector', help='det checkpoint file')
parser.add_argument('--reid', help='reid checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='device used for inference')
parser.add_argument(
'--score-thr',
type=float,
default=0.0,
help='The threshold of score to filter bboxes.')
parser.add_argument(
'--out', help='output video file (mp4 format) or folder')
parser.add_argument(
'--show',
action='store_true',
        help='whether to show the results on the fly')
parser.add_argument('--fps', help='FPS of the output video')
args = parser.parse_args()
return args
def main(args):
assert args.out or args.show
# load images
if osp.isdir(args.inputs):
imgs = sorted(
filter(lambda x: x.endswith(IMG_EXTENSIONS),
os.listdir(args.inputs)),
key=lambda x: int(x.split('.')[0]))
in_video = False
else:
imgs = mmcv.VideoReader(args.inputs)
in_video = True
# define output
out_video = False
if args.out is not None:
if args.out.endswith('.mp4'):
out_video = True
out_dir = tempfile.TemporaryDirectory()
out_path = out_dir.name
_out = args.out.rsplit(os.sep, 1)
if len(_out) > 1:
os.makedirs(_out[0], exist_ok=True)
else:
out_path = args.out
os.makedirs(out_path, exist_ok=True)
fps = args.fps
if args.show or out_video:
if fps is None and in_video:
fps = imgs.fps
if not fps:
raise ValueError('Please set the FPS for the output video.')
fps = int(fps)
init_default_scope('mmdet')
# build the model from a config file and a checkpoint file
model = init_track_model(
args.config,
args.checkpoint,
args.detector,
args.reid,
device=args.device)
# build the visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
visualizer.dataset_meta = model.dataset_meta
prog_bar = mmengine.ProgressBar(len(imgs))
# test and show/save the images
for i, img in enumerate(imgs):
if isinstance(img, str):
img_path = osp.join(args.inputs, img)
img = mmcv.imread(img_path)
# result [TrackDataSample]
result = inference_mot(model, img, frame_id=i, video_len=len(imgs))
if args.out is not None:
if in_video or out_video:
out_file = osp.join(out_path, f'{i:06d}.jpg')
else:
out_file = osp.join(out_path, img.rsplit(os.sep, 1)[-1])
else:
out_file = None
# show the results
visualizer.add_datasample(
'mot',
img[..., ::-1],
data_sample=result[0],
show=args.show,
draw_gt=False,
out_file=out_file,
wait_time=float(1 / int(fps)) if fps else 0,
pred_score_thr=args.score_thr,
step=i)
prog_bar.update()
if args.out and out_video:
print(f'making the output video at {args.out} with a FPS of {fps}')
mmcv.frames2video(out_path, args.out, fps=fps, fourcc='mp4v')
out_dir.cleanup()
if __name__ == '__main__':
args = parse_args()
main(args)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
import mmcv
from mmcv.transforms import Compose
from mmengine.utils import track_iter_progress
from mmdet.apis import inference_detector, init_detector
from mmdet.registry import VISUALIZERS
def parse_args():
parser = argparse.ArgumentParser(description='MMDetection video demo')
parser.add_argument('video', help='Video file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.3, help='Bbox score threshold')
parser.add_argument('--out', type=str, help='Output video file')
parser.add_argument('--show', action='store_true', help='Show video')
parser.add_argument(
'--wait-time',
type=float,
default=1,
        help='Display interval (s); 0 means blocking')
args = parser.parse_args()
return args
def main():
args = parse_args()
assert args.out or args.show, \
('Please specify at least one operation (save/show the '
'video) with the argument "--out" or "--show"')
# build the model from a config file and a checkpoint file
model = init_detector(args.config, args.checkpoint, device=args.device)
# build test pipeline
model.cfg.test_dataloader.dataset.pipeline[
0].type = 'mmdet.LoadImageFromNDArray'
test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
# the dataset_meta is loaded from the checkpoint and
    # then passed to the model in init_detector
visualizer.dataset_meta = model.dataset_meta
video_reader = mmcv.VideoReader(args.video)
video_writer = None
if args.out:
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(
args.out, fourcc, video_reader.fps,
(video_reader.width, video_reader.height))
for frame in track_iter_progress((video_reader, len(video_reader))):
result = inference_detector(model, frame, test_pipeline=test_pipeline)
visualizer.add_datasample(
name='video',
image=frame,
data_sample=result,
draw_gt=False,
show=False,
pred_score_thr=args.score_thr)
frame = visualizer.get_image()
if args.show:
cv2.namedWindow('video', 0)
mmcv.imshow(frame, 'video', args.wait_time)
if args.out:
video_writer.write(frame)
if video_writer:
video_writer.release()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from typing import Tuple
import cv2
import mmcv
import numpy as np
import torch
import torch.nn as nn
from mmcv.transforms import Compose
from mmengine.utils import track_iter_progress
from mmdet.apis import init_detector
from mmdet.registry import VISUALIZERS
from mmdet.structures import DetDataSample
try:
import ffmpegcv
except ImportError:
raise ImportError(
'Please install ffmpegcv with:\n\n pip install ffmpegcv')
def parse_args():
parser = argparse.ArgumentParser(
description='MMDetection video demo with GPU acceleration')
parser.add_argument('video', help='Video file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.3, help='Bbox score threshold')
parser.add_argument('--out', type=str, help='Output video file')
parser.add_argument('--show', action='store_true', help='Show video')
parser.add_argument(
'--nvdecode', action='store_true', help='Use NVIDIA decoder')
parser.add_argument(
'--wait-time',
type=float,
default=1,
        help='Display interval (s); 0 means blocking')
args = parser.parse_args()
return args
def prefetch_batch_input_shape(model: nn.Module, ori_wh: Tuple[int,
int]) -> dict:
cfg = model.cfg
w, h = ori_wh
cfg.test_dataloader.dataset.pipeline[0].type = 'LoadImageFromNDArray'
test_pipeline = Compose(cfg.test_dataloader.dataset.pipeline)
data = {'img': np.zeros((h, w, 3), dtype=np.uint8), 'img_id': 0}
data = test_pipeline(data)
data['inputs'] = [data['inputs']]
data['data_samples'] = [data['data_samples']]
data_sample = model.data_preprocessor(data, False)['data_samples']
batch_input_shape = data_sample[0].batch_input_shape
return batch_input_shape
def pack_data(frame_resize: np.ndarray, batch_input_shape: Tuple[int, int],
ori_shape: Tuple[int, int]) -> dict:
assert frame_resize.shape[:2] == batch_input_shape
data_sample = DetDataSample()
data_sample.set_metainfo({
'img_shape':
batch_input_shape,
'ori_shape':
ori_shape,
'scale_factor': (batch_input_shape[0] / ori_shape[0],
batch_input_shape[1] / ori_shape[1])
})
frame_resize = torch.from_numpy(frame_resize).permute((2, 0, 1)).cuda()
data = {'inputs': [frame_resize], 'data_samples': [data_sample]}
return data
def main():
args = parse_args()
assert args.out or args.show, \
('Please specify at least one operation (save/show the '
'video) with the argument "--out" or "--show"')
model = init_detector(args.config, args.checkpoint, device=args.device)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
# the dataset_meta is loaded from the checkpoint and
    # then passed to the model in init_detector
visualizer.dataset_meta = model.dataset_meta
if args.nvdecode:
VideoCapture = ffmpegcv.VideoCaptureNV
else:
VideoCapture = ffmpegcv.VideoCapture
video_origin = VideoCapture(args.video)
batch_input_shape = prefetch_batch_input_shape(
model, (video_origin.width, video_origin.height))
ori_shape = (video_origin.height, video_origin.width)
resize_wh = batch_input_shape[::-1]
video_resize = VideoCapture(
args.video,
resize=resize_wh,
resize_keepratio=True,
resize_keepratioalign='topleft')
video_writer = None
if args.out:
video_writer = ffmpegcv.VideoWriter(args.out, fps=video_origin.fps)
with torch.no_grad():
for i, (frame_resize, frame_origin) in enumerate(
zip(track_iter_progress(video_resize), video_origin)):
data = pack_data(frame_resize, batch_input_shape, ori_shape)
result = model.test_step(data)[0]
visualizer.add_datasample(
name='video',
image=frame_origin,
data_sample=result,
draw_gt=False,
show=False,
pred_score_thr=args.score_thr)
frame_mask = visualizer.get_image()
if args.show:
cv2.namedWindow('video', 0)
mmcv.imshow(frame_mask, 'video', args.wait_time)
if args.out:
video_writer.write(frame_mask)
if video_writer:
video_writer.release()
video_origin.release()
video_resize.release()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
import mmcv
import torch
from mmdet.apis import inference_detector, init_detector
from mmdet.registry import VISUALIZERS
def parse_args():
parser = argparse.ArgumentParser(description='MMDetection webcam demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
parser.add_argument(
'--camera-id', type=int, default=0, help='camera device id')
parser.add_argument(
'--score-thr', type=float, default=0.5, help='bbox score threshold')
args = parser.parse_args()
return args
def main():
args = parse_args()
# build the model from a config file and a checkpoint file
device = torch.device(args.device)
model = init_detector(args.config, args.checkpoint, device=device)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
# the dataset_meta is loaded from the checkpoint and
    # then passed to the model in init_detector
visualizer.dataset_meta = model.dataset_meta
camera = cv2.VideoCapture(args.camera_id)
print('Press "Esc", "q" or "Q" to exit.')
while True:
ret_val, img = camera.read()
result = inference_detector(model, img)
img = mmcv.imconvert(img, 'bgr', 'rgb')
visualizer.add_datasample(
name='result',
image=img,
data_sample=result,
draw_gt=False,
pred_score_thr=args.score_thr,
show=False)
img = visualizer.get_image()
img = mmcv.imconvert(img, 'bgr', 'rgb')
cv2.imshow('result', img)
ch = cv2.waitKey(1)
if ch == 27 or ch == ord('q') or ch == ord('Q'):
break
if __name__ == '__main__':
main()
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.header-logo {
background-image: url("../image/mmdet-logo.png");
background-size: 156px 40px;
height: 40px;
width: 156px;
}
# Conventions
Please check the following conventions if you would like to modify MMDetection as your own project.
## About the order of image shape
In OpenMMLab 2.0, to be consistent with the input arguments of OpenCV, image-shape arguments in the data transformation pipeline are always in the `(width, height)` order. In contrast, for computational convenience, the fields that go through the data pipeline and the model use the `(height, width)` order. Specifically, in the results processed by each data transform, the fields and the meaning of their values are as below:
- img_shape: (height, width)
- ori_shape: (height, width)
- pad_shape: (height, width)
- batch_input_shape: (height, width)
As an example, the initialization arguments of `Mosaic` are as below:
```python
@TRANSFORMS.register_module()
class Mosaic(BaseTransform):
def __init__(self,
img_scale: Tuple[int, int] = (640, 640),
center_ratio_range: Tuple[float, float] = (0.5, 1.5),
bbox_clip_border: bool = True,
pad_val: float = 114.0,
prob: float = 1.0) -> None:
...
# img_scale order should be (width, height)
self.img_scale = img_scale
def transform(self, results: dict) -> dict:
...
results['img'] = mosaic_img
# (height, width)
results['img_shape'] = mosaic_img.shape[:2]
```
## Loss
In MMDetection, a `dict` containing losses and metrics will be returned by `model(**data)`.
For example, in bbox head,
```python
class BBoxHead(nn.Module):
...
def loss(self, ...):
losses = dict()
# classification loss
losses['loss_cls'] = self.loss_cls(...)
# classification accuracy
losses['acc'] = accuracy(...)
# bbox regression loss
losses['loss_bbox'] = self.loss_bbox(...)
return losses
```
`bbox_head.loss()` will be called during model forward.
The returned dict contains `'loss_bbox'`, `'loss_cls'`, `'acc'` .
Only `'loss_bbox'`, `'loss_cls'` will be used during back propagation,
`'acc'` will only be used as a metric to monitor training process.
By default, only values whose keys contain `'loss'` will be back propagated.
This behavior could be changed by modifying `BaseDetector.train_step()`.
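The snippet below is a minimal sketch (not the actual MMEngine/MMDetection implementation) of this convention: only values whose keys contain `'loss'` are summed into the total loss used for back propagation, while the rest are kept as logging metrics.
```python
import torch

def parse_losses_sketch(losses: dict):
    """Sum the values whose key contains 'loss'; keep the rest as metrics."""
    log_vars = {key: value.mean() for key, value in losses.items()}
    total_loss = sum(value for key, value in log_vars.items() if 'loss' in key)
    return total_loss, log_vars

losses = dict(
    loss_cls=torch.tensor(0.8),
    loss_bbox=torch.tensor(0.4),
    acc=torch.tensor(0.9))  # metric only, never back propagated
total_loss, log_vars = parse_losses_sketch(losses)  # total_loss -> tensor(1.2000)
```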
## Empty Proposals
In MMDetection, we have added special handling and unit tests for empty proposals in two-stage detectors. We need to deal with empty proposals for both the entire batch and a single image at the same time. For example, in CascadeRoIHead,
```python
# simple_test method
...
# There is no proposal in the whole batch
if rois.shape[0] == 0:
bbox_results = [[
np.zeros((0, 5), dtype=np.float32)
for _ in range(self.bbox_head[-1].num_classes)
]] * num_imgs
if self.with_mask:
mask_classes = self.mask_head[-1].num_classes
segm_results = [[[] for _ in range(mask_classes)]
for _ in range(num_imgs)]
results = list(zip(bbox_results, segm_results))
else:
results = bbox_results
return results
...
# There is no proposal in the single image
for i in range(self.num_stages):
...
if i < self.num_stages - 1:
for j in range(num_imgs):
# Handle empty proposal
if rois[j].shape[0] > 0:
bbox_label = cls_score[j][:, :-1].argmax(dim=1)
refine_roi = self.bbox_head[i].regress_by_class(
rois[j], bbox_label, bbox_pred[j], img_metas[j])
refine_roi_list.append(refine_roi)
```
If you have customized `RoIHead`, you can refer to the above method to deal with empty proposals.
## Coco Panoptic Dataset
In MMDetection, we have supported COCO Panoptic dataset. We clarify a few conventions about the implementation of `CocoPanopticDataset` here.
1. For mmdet\<=2.16.0, the range of foreground and background labels in semantic segmentation is different from the default setting of MMDetection. The label `0` stands for the `VOID` label and the category labels start from `1`.
   Since mmdet 2.17.0, the category labels of semantic segmentation start from `0` and the label `255` stands for `VOID`, for consistency with the labels of bounding boxes.
To achieve that, the `Pad` pipeline supports setting the padding value for `seg`.
2. In the evaluation, the panoptic result is a map with the same shape as the original image. Each value in the result map has the format of `instance_id * INSTANCE_OFFSET + category_id`.
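As an illustration of this encoding, the sketch below splits a panoptic result map back into per-pixel category and instance ids; the `INSTANCE_OFFSET` value here is a placeholder, so use the constant provided by your MMDetection version in real code.
```python
import numpy as np

# Placeholder value for illustration only; MMDetection ships its own
# INSTANCE_OFFSET constant that should be used in practice.
INSTANCE_OFFSET = 1000

def decode_panoptic(pan_map: np.ndarray):
    """Split a panoptic result map into per-pixel category and instance ids."""
    category_ids = pan_map % INSTANCE_OFFSET
    instance_ids = pan_map // INSTANCE_OFFSET
    return category_ids, instance_ids
```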
# Customize Datasets
## Support new data format
To support a new data format, you can either convert it to an existing format (COCO format or PASCAL format) or directly convert it to the middle format. You could also choose to convert it offline (before training, by a script) or online (implement a new dataset and do the conversion at training time). In MMDetection, we recommend converting the data into COCO format and doing the conversion offline; thus you only need to modify the config's data annotation paths and classes after converting your data.
### Reorganize new data formats to existing format
The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC).
The annotation JSON files in COCO format have the following necessary keys:
```python
'images': [
{
'file_name': 'COCO_val2014_000000001268.jpg',
'height': 427,
'width': 640,
'id': 1268
},
...
],
'annotations': [
{
'segmentation': [[192.81,
247.09,
...
219.03,
249.06]], # If you have mask labels, and it is in polygon XY point coordinate format, you need to ensure that at least 3 point coordinates are included. Otherwise, it is an invalid polygon.
'area': 1035.749,
'iscrowd': 0,
'image_id': 1268,
'bbox': [192.81, 224.8, 74.73, 33.43],
'category_id': 16,
'id': 42986
},
...
],
'categories': [
{'id': 0, 'name': 'car'},
]
```
There are three necessary keys in the JSON file:
- `images`: contains a list of images with their information like `file_name`, `height`, `width`, and `id`.
- `annotations`: contains the list of instance annotations.
- `categories`: contains the list of category names and their IDs.
After the data pre-processing, there are two steps for users to train the customized new dataset with an existing format (e.g. COCO format):
1. Modify the config file for using the customized dataset.
2. Check the annotations of the customized dataset.
Here we give an example to show the above two steps, which uses a customized dataset of 5 classes with COCO format to train an existing Cascade Mask R-CNN R50-FPN detector.
#### 1. Modify the config file for using the customized dataset
There are two aspects involved in the modification of config file:
1. The `data` field. Specifically, you need to explicitly add the `metainfo=dict(classes=classes)` field in `train_dataloader.dataset`, `val_dataloader.dataset` and `test_dataloader.dataset`, and `classes` must be a tuple type.
2. The `num_classes` field in the `model` part. Explicitly over-write all the `num_classes` from the default value (e.g. 80 in COCO) to the number of your classes.
In `configs/my_custom_config.py`:
```python
# the new config inherits the base configs to highlight the necessary modification
_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py'
# 1. dataset settings
dataset_type = 'CocoDataset'
classes = ('a', 'b', 'c', 'd', 'e')
data_root='path/to/your/'
train_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(
type=dataset_type,
# explicitly add your class names to the field `metainfo`
metainfo=dict(classes=classes),
data_root=data_root,
ann_file='train/annotation_data',
data_prefix=dict(img='train/image_data')
)
)
val_dataloader = dict(
batch_size=1,
num_workers=2,
dataset=dict(
type=dataset_type,
test_mode=True,
# explicitly add your class names to the field `metainfo`
metainfo=dict(classes=classes),
data_root=data_root,
ann_file='val/annotation_data',
data_prefix=dict(img='val/image_data')
)
)
test_dataloader = dict(
batch_size=1,
num_workers=2,
dataset=dict(
type=dataset_type,
test_mode=True,
# explicitly add your class names to the field `metainfo`
metainfo=dict(classes=classes),
data_root=data_root,
ann_file='test/annotation_data',
data_prefix=dict(img='test/image_data')
)
)
# 2. model settings
# explicitly over-write all the `num_classes` field from default 80 to 5.
model = dict(
roi_head=dict(
bbox_head=[
dict(
type='Shared2FCBBoxHead',
# explicitly over-write all the `num_classes` field from default 80 to 5.
num_classes=5),
dict(
type='Shared2FCBBoxHead',
# explicitly over-write all the `num_classes` field from default 80 to 5.
num_classes=5),
dict(
type='Shared2FCBBoxHead',
# explicitly over-write all the `num_classes` field from default 80 to 5.
num_classes=5)],
# explicitly over-write all the `num_classes` field from default 80 to 5.
mask_head=dict(num_classes=5)))
```
#### 2. Check the annotations of the customized dataset
Assuming your customized dataset is COCO format, make sure you have the correct annotations in the customized dataset:
1. The length of the `categories` field in the annotations should exactly equal the tuple length of the `classes` field in your config, i.e. the number of classes (e.g. 5 in this example).
2. The `classes` field in your config file should have exactly the same elements, in the same order, as the `name` entries in `categories` of the annotations. MMDetection automatically maps the discontinuous `id` values in `categories` to continuous label indices, so the order of `name` in the `categories` field affects the order of label indices. Meanwhile, the order of `classes` in the config affects the label text during visualization of predicted bounding boxes.
3. The `category_id` in `annotations` field should be valid, i.e., all values in `category_id` should belong to `id` in `categories`.
Here is a valid example of annotations:
```python
'annotations': [
{
'segmentation': [[192.81,
247.09,
...
219.03,
249.06]], # if you have mask labels
'area': 1035.749,
'iscrowd': 0,
'image_id': 1268,
'bbox': [192.81, 224.8, 74.73, 33.43],
'category_id': 16,
'id': 42986
},
...
],
# MMDetection automatically maps the discontinuous `id` values to continuous label indices.
'categories': [
{'id': 1, 'name': 'a'}, {'id': 3, 'name': 'b'}, {'id': 4, 'name': 'c'}, {'id': 16, 'name': 'd'}, {'id': 17, 'name': 'e'},
]
```
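The following is a minimal sketch that checks the three rules above against a COCO-style annotation file; the path and the `classes` tuple are illustrative and should match your own config.
```python
import json

classes = ('a', 'b', 'c', 'd', 'e')  # must match the config

with open('path/to/your/train/annotation_data.json') as f:  # illustrative path
    coco = json.load(f)

cat_names = [cat['name'] for cat in coco['categories']]
cat_ids = {cat['id'] for cat in coco['categories']}

# 1. the number of categories equals the number of classes in the config
assert len(cat_names) == len(classes)
# 2. the names and their order match the config
assert tuple(cat_names) == classes
# 3. every category_id used in the annotations is defined in categories
assert all(ann['category_id'] in cat_ids for ann in coco['annotations'])
```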
We use this way to support CityScapes dataset. The script is in [cityscapes.py](../../../tools/dataset_converters/cityscapes.py) and we also provide the finetuning [configs](../../../configs/cityscapes).
**Note**
1. For instance segmentation datasets, **MMDetection only supports evaluating mask AP of dataset in COCO format for now**.
2. It is recommended to convert the data offline before training, thus you can still use `CocoDataset` and only need to modify the path of annotations and the training classes.
### Reorganize new data format to middle format
It is also fine if you do not want to convert the annotation format to COCO or PASCAL format.
Actually, we define a simple annotation format in MMEngine's [BaseDataset](https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/base_dataset.py#L116) and all existing datasets are
processed to be compatible with it, either online or offline.
The annotation of the dataset must be in `json`, `yaml`/`yml`, or `pickle`/`pkl` format; the dictionary stored in the annotation file must contain two fields, `metainfo` and `data_list`. `metainfo` is a dictionary that contains the metadata of the dataset, such as class information; `data_list` is a list in which each element is a dictionary that defines the raw data of one image, and each raw data entry contains one or several training/testing samples.
Here is an example.
```python
{
'metainfo':
{
'classes': ('person', 'bicycle', 'car', 'motorcycle'),
...
},
'data_list':
[
{
"img_path": "xxx/xxx_1.jpg",
"height": 604,
"width": 640,
"instances":
[
{
"bbox": [0, 0, 10, 20],
"bbox_label": 1,
"ignore_flag": 0
},
{
"bbox": [10, 10, 110, 120],
"bbox_label": 2,
"ignore_flag": 0
}
]
},
{
"img_path": "xxx/xxx_2.jpg",
"height": 320,
"width": 460,
"instances":
[
{
"bbox": [10, 0, 20, 20],
"bbox_label": 3,
"ignore_flag": 1,
}
]
},
...
]
}
```
Some datasets may provide annotations such as crowd/difficult/ignored bboxes; we use `ignore_flag` to cover them.
After obtaining the above standard data annotation format, you can directly use [BaseDetDataset](../../../mmdet/datasets/base_det_dataset.py#L13) of MMDetection in the configuration, without conversion.
### An example of customized dataset
Assume the annotation is in a new format in text files.
The bounding box annotations are stored in the text file `annotation.txt` as follows
```
#
000001.jpg
1280 720
2
10 20 40 60 1
20 40 50 60 2
#
000002.jpg
1280 720
3
50 20 40 60 2
20 40 30 45 2
30 40 50 60 3
```
We can create a new dataset in `mmdet/datasets/my_dataset.py` to load the data.
```python
import mmengine
from mmdet.datasets.base_det_dataset import BaseDetDataset
from mmdet.registry import DATASETS
@DATASETS.register_module()
class MyDataset(BaseDetDataset):
METAINFO = {
'classes': ('person', 'bicycle', 'car', 'motorcycle'),
'palette': [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230)]
}
def load_data_list(self, ann_file):
ann_list = mmengine.list_from_file(ann_file)
data_infos = []
for i, ann_line in enumerate(ann_list):
if ann_line != '#':
continue
img_shape = ann_list[i + 2].split(' ')
width = int(img_shape[0])
height = int(img_shape[1])
bbox_number = int(ann_list[i + 3])
instances = []
for anns in ann_list[i + 4:i + 4 + bbox_number]:
instance = {}
instance['bbox'] = [float(ann) for ann in anns.split(' ')[:4]]
                instance['bbox_label'] = int(anns.split(' ')[4])
instances.append(instance)
data_infos.append(
dict(
img_path=ann_list[i + 1],
img_id=i,
width=width,
height=height,
instances=instances
))
return data_infos
```
Then, to use `MyDataset`, you can modify the config as follows
```python
dataset_A_train = dict(
type='MyDataset',
ann_file = 'image_list.txt',
pipeline=train_pipeline
)
```
## Customize datasets by dataset wrappers
MMEngine also supports many dataset wrappers to mix the dataset or modify the dataset distribution for training.
Currently it supports three dataset wrappers, as below (a config sketch follows the list):
- `RepeatDataset`: simply repeat the whole dataset.
- `ClassBalancedDataset`: repeat dataset in a class balanced manner.
- `ConcatDataset`: concat datasets.
For detailed usage, see [MMEngine Dataset Wrapper](#TODO).
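As a quick sketch of how a wrapper is configured (paths are placeholders reused from the earlier examples), `RepeatDataset` simply nests the original dataset config:
```python
classes = ('a', 'b', 'c', 'd', 'e')
train_dataloader = dict(
    dataset=dict(
        type='RepeatDataset',
        times=3,  # repeat the wrapped dataset 3 times per epoch
        dataset=dict(
            type='CocoDataset',
            metainfo=dict(classes=classes),
            data_root='path/to/your/',
            ann_file='train/annotation_data',
            data_prefix=dict(img='train/image_data'))))
```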
## Modify Dataset Classes
With existing dataset types, we can modify their metainfo to train on a subset of the annotations.
For example, if you want to train only three classes of the current dataset,
you can modify the classes of the dataset.
The dataset will filter out the ground truth boxes of other classes automatically.
```python
classes = ('person', 'bicycle', 'car')
train_dataloader = dict(
dataset=dict(
metainfo=dict(classes=classes))
)
val_dataloader = dict(
dataset=dict(
metainfo=dict(classes=classes))
)
test_dataloader = dict(
dataset=dict(
metainfo=dict(classes=classes))
)
```
**Note**:
- Before MMDetection v2.5.0, the dataset would filter out the empty GT images automatically if the classes were set, and there was no way to disable that through the config. This was an undesirable behavior and introduced confusion, because if the classes were not set, the dataset only filtered the empty GT images when `filter_empty_gt=True` and `test_mode=False`. After MMDetection v2.5.0, we decoupled the image filtering process and the classes modification, i.e., the dataset will only filter empty GT images when `filter_cfg=dict(filter_empty_gt=True)` and `test_mode=False`, no matter whether the classes are set. Thus, setting the classes only influences the annotations of the classes used for training, and users can decide whether to filter empty GT images by themselves (see the config sketch after these notes).
- When directly using `BaseDataset` in MMEngine or `BaseDetDataset` in MMDetection, users cannot filter images without GT by modifying the configuration, but it can be solved in an offline way.
- Please remember to modify the `num_classes` in the head when specifying `classes` in the dataset. Since v2.9.0 (after PR#4508), we have implemented [NumClassCheckHook](../../../mmdet/engine/hooks/num_class_check_hook.py) to check whether the numbers are consistent.
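For example, to train on a class subset while still filtering out images without GT boxes, the two settings can be combined explicitly; the snippet below is a sketch and the `filter_cfg` values should be adapted to your dataset.
```python
classes = ('person', 'bicycle', 'car')
train_dataloader = dict(
    dataset=dict(
        metainfo=dict(classes=classes),
        # image filtering is controlled separately from the classes setting
        filter_cfg=dict(filter_empty_gt=True, min_size=32)))
```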
## COCO Panoptic Dataset
Now we support the COCO Panoptic Dataset; the format of panoptic annotations is different from the COCO format.
Both the foreground and the background will exist in the annotation file.
The annotation JSON files in COCO Panoptic format have the following necessary keys:
```python
'images': [
{
'file_name': '000000001268.jpg',
'height': 427,
'width': 640,
'id': 1268
},
...
]
'annotations': [
{
'filename': '000000001268.jpg',
'image_id': 1268,
'segments_info': [
{
'id':8345037, # One-to-one correspondence with the id in the annotation map.
'category_id': 51,
'iscrowd': 0,
'bbox': (x1, y1, w, h), # The bbox of the background is the outer rectangle of its mask.
'area': 24315
},
...
]
},
...
]
'categories': [ # including both foreground categories and background categories
{'id': 0, 'name': 'person'},
...
]
```
Moreover, the `seg` must be set to the path of the panoptic annotation images.
```python
dataset_type = 'CocoPanopticDataset'
data_root='path/to/your/'
train_dataloader = dict(
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
img='train/image_data/', seg='train/panoptic/image_annotation_data/')
)
)
val_dataloader = dict(
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
img='val/image_data/', seg='val/panoptic/image_annotation_data/')
)
)
test_dataloader = dict(
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
img='test/image_data/', seg='test/panoptic/image_annotation_data/')
)
)
```
# Customize Losses
MMDetection provides users with different loss functions. But the default configuration may not be applicable to different datasets or models, so users may want to modify a specific loss to adapt it to the new situation.
This tutorial first elaborates on the computation pipeline of losses, then gives some instructions on how to modify each step. The modifications can be categorized as tweaking and weighting.
## Computation pipeline of a loss
Given the input prediction and target, as well as the weights, a loss function maps the input tensor to the final loss scalar. The mapping can be divided into five steps (a toy sketch follows this list):
1. Set the sampling method to sample positive and negative samples.
2. Get **element-wise** or **sample-wise** loss by the loss kernel function.
3. Weight the loss with a weight tensor **element-wise**.
4. Reduce the loss tensor to a **scalar**.
5. Weight the loss with a **scalar**.
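The toy function below strings steps 2-5 together for an element-wise L1 loss, to make the roles of the weight tensor, the reduction, and the scalar `loss_weight` concrete (an illustrative sketch, not MMDetection's implementation):
```python
import torch

def weighted_l1_loss(pred, target, weight=None, reduction='mean', loss_weight=1.0):
    loss = (pred - target).abs()      # step 2: element-wise loss
    if weight is not None:
        loss = loss * weight          # step 3: element-wise weighting
    if reduction == 'mean':
        loss = loss.mean()            # step 4: reduce to a scalar
    elif reduction == 'sum':
        loss = loss.sum()
    return loss_weight * loss         # step 5: scalar weighting

pred, target = torch.rand(4, 4), torch.rand(4, 4)
print(weighted_l1_loss(pred, target, weight=torch.ones(4, 4), loss_weight=0.5))
```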
## Set sampling method (step 1)
For some loss functions, sampling strategies are needed to avoid imbalance between positive and negative samples.
For example, when using `CrossEntropyLoss` in RPN head, we need to set `RandomSampler` in `train_cfg`
```python
train_cfg=dict(
rpn=dict(
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False))
```
For some other losses that have a positive and negative sample balancing mechanism, such as Focal Loss, GHMC, and QualityFocalLoss, the sampler is no longer necessary.
## Tweaking loss
Tweaking a loss is more related to steps 2, 4, and 5, and most modifications can be specified in the config.
Here we take [Focal Loss (FL)](../../../mmdet/models/losses/focal_loss.py) as an example.
The following code snippets are the constructor and the config of FL respectively; they are in one-to-one correspondence.
```python
@LOSSES.register_module()
class FocalLoss(nn.Module):
def __init__(self,
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
reduction='mean',
loss_weight=1.0):
```
```python
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0)
```
### Tweaking hyper-parameters (step 2)
`gamma` and `alpha` are two hyper-parameters of the Focal Loss. Say we want to change the value of `gamma` to 1.5 and `alpha` to 0.5; then we can specify them in the config as follows:
```python
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=1.5,
alpha=0.5,
loss_weight=1.0)
```
### Tweaking the way of reduction (step 4)
The default way of reduction is `mean` for FL. Say if we want to change the reduction from `mean` to `sum`, we can specify it in the config as follows:
```python
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0,
reduction='sum')
```
### Tweaking loss weight (step 5)
The loss weight here is a scalar which controls the weight of different losses in multi-task learning, e.g. classification loss and regression loss. Say we want to change the loss weight of the classification loss to 0.5; we can specify it in the config as follows:
```python
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=0.5)
```
## Weighting loss (step 3)
Weighting a loss means we re-weight the loss element-wise. To be more specific, we multiply the loss tensor by a weight tensor of the same shape. As a result, different entries of the loss can be scaled differently, hence the name element-wise weighting.
The loss weight varies across different models and is highly context dependent, but overall there are two kinds of loss weights: `label_weights` for classification losses and `bbox_weights` for bbox regression losses. You can find them in the `get_targets` method of the corresponding head. Here we take [ATSSHead](../../../mmdet/models/dense_heads/atss_head.py#L322) as an example, which inherits [AnchorHead](../../../mmdet/models/dense_heads/anchor_head.py) but overwrites its `get_targets` method, yielding different `label_weights` and `bbox_weights`.
```python
class ATSSHead(AnchorHead):
...
def get_targets(self,
anchor_list,
valid_flag_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
label_channels=1,
unmap_outputs=True):
```
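To make the effect of such weights concrete, the toy example below (not tied to any particular head) uses `label_weights` to zero out an ignored sample in a per-sample cross-entropy loss:
```python
import torch
import torch.nn.functional as F

cls_score = torch.randn(4, 3)                    # 4 samples, 3 classes
labels = torch.tensor([0, 2, 1, 1])
label_weights = torch.tensor([1., 1., 0., 1.])   # the third sample is ignored

loss = F.cross_entropy(cls_score, labels, reduction='none')    # per-sample loss
loss_cls = (loss * label_weights).sum() / label_weights.sum()  # weighted mean
```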