Commit 106580f9 authored by chenych

First commit
# Copyright (c) OpenMMLab. All rights reserved.
import os
import warnings
import mmcv
import numpy as np
from PIL import Image
import torch
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow
from mmpose.core import imshow_bboxes, imshow_keypoints
from mmpose.models import builder
from mmpose.models.builder import POSENETS
# from .base import BasePose
from mmpose.models.detectors import TopDown
try:
from mmcv.runner import auto_fp16
except ImportError:
    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
                  'Please install mmcv>=1.1.4')
from mmpose.core import auto_fp16
from mmpose.core.post_processing import flip_back
from data.pipelines.custom_transform import define_colors_gb_mean_sep
color_dict = define_colors_gb_mean_sep()
color_list = [v for k, v in color_dict.items()]
color_list.append((0, 0))
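# Note: judging from the decoding code below, define_colors_gb_mean_sep maps each
# of the 17 keypoints to a distinct (G, B) color; the appended (0, 0) entry acts
# as the background class, giving 18 colors to match against a label image's GB
# channels.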
@POSENETS.register_module()
class TopDownCustom(TopDown):
"""Top-down pose detectors.
Args:
backbone (dict): Backbone modules to extract feature.
keypoint_head (dict): Keypoint head to process feature.
train_cfg (dict): Config for training. Default: None.
test_cfg (dict): Config for testing. Default: None.
pretrained (str): Path to the pretrained models.
loss_pose (None): Deprecated arguments. Please use
`loss_keypoint` for heads instead.
"""
colors = torch.tensor(color_list, dtype=torch.float32, device="cuda")
def __init__(self,
backbone,
neck=None,
keypoint_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
loss_pose=None):
super().__init__(
backbone=backbone,
neck=neck,
keypoint_head=keypoint_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
loss_pose=loss_pose)
@auto_fp16(apply_to=('img', ))
def forward(self,
img,
target=None,
target_weight=None,
img_metas=None,
return_loss=True,
return_heatmap=False,
pseudo_test=False,
**kwargs):
"""Calls either forward_train or forward_test depending on whether
return_loss=True. Note this setting will change the expected inputs.
When `return_loss=True`, img and img_meta are single-nested (i.e.
        Tensor and List[dict]), and when `return_loss=False`, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
Note:
- batch_size: N
- num_keypoints: K
- num_img_channel: C (Default: 3)
- img height: imgH
- img width: imgW
- heatmaps height: H
            - heatmaps width: W
Args:
img (torch.Tensor[NxCximgHximgW]): Input images.
target (torch.Tensor[NxKxHxW]): Target heatmaps.
target_weight (torch.Tensor[NxKx1]): Weights across
different joint types.
            img_metas (list(dict)): Information about data augmentation.
                By default this includes:
                - "image_file": path to the image file
- "center": center of the bbox
- "scale": scale of the bbox
- "rotation": rotation of the bbox
- "bbox_score": score of bbox
            return_loss (bool): Option to return losses. `return_loss=True`
                for training, `return_loss=False` for validation & test.
return_heatmap (bool) : Option to return heatmap.
Returns:
            dict|tuple: if `return_loss` is true, then return losses. \
Otherwise, return predicted poses, boxes, image paths \
and heatmaps.
"""
if pseudo_test:
return self.forward_pseudo_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
if return_loss:
return self.forward_train(img, target, target_weight, img_metas,
**kwargs)
return self.forward_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
features = self.backbone(img)
if self.with_neck:
features = self.neck(features)
if self.with_keypoint:
output_heatmap = self.keypoint_head.inference_model(
features, flip_pairs=None)
if self.test_cfg.get('flip_test', True):
img_flipped = img.flip(3) # (b, c, h, w)
features_flipped = self.backbone(img_flipped)
if self.with_neck:
features_flipped = self.neck(features_flipped)
if self.with_keypoint:
output_flipped_heatmap = self.keypoint_head.inference_model(
features_flipped, img_metas[0]['flip_pairs'])
output_heatmap = (output_heatmap + output_flipped_heatmap)
if self.test_cfg.get('regression_flip_shift', False):
output_heatmap[..., 0] -= 1.0 / img_width
output_heatmap = output_heatmap / 2
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def forward_pseudo_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
output_heatmap = self.decode_images_to_heatmaps_minmax(
images=img, resize=False,
)
# add support for flip test
if self.test_cfg.get('flip_test', True):
image_flip_list = []
for batch_idx in range(img.shape[0]):
flip_image_dir = os.path.dirname(img_metas[batch_idx]['image_file']) + "_flip"
flip_image_name = os.path.basename(img_metas[batch_idx]['image_file'])
flip_image_path = os.path.join(flip_image_dir, flip_image_name)
image = np.array(Image.open(flip_image_path))
image_tensor = torch.from_numpy(image).to(img.device)
image_flip_list.append(image_tensor)
img_flipped = torch.stack(image_flip_list) # (b, h, w, 3)
if self.with_keypoint:
# output_flipped_heatmap = self.keypoint_head.inference_model(
# features_flipped, img_metas[0]['flip_pairs'])
output = self.decode_images_to_heatmaps_minmax(
images=img_flipped, resize=False,
)
flip_pairs = img_metas[0]['flip_pairs']
assert flip_pairs is not None
output_flipped_heatmap = flip_back(
output,
flip_pairs,
target_type=self.keypoint_head.target_type)
# feature is not aligned, shift flipped heatmap for higher accuracy
if self.test_cfg.get('shift_heatmap', False):
output_flipped_heatmap[:, :, :, 1:] = output_flipped_heatmap[:, :, :, :-1]
output_heatmap = (output_heatmap + output_flipped_heatmap)
if self.test_cfg.get('regression_flip_shift', False):
output_heatmap[..., 0] -= 1.0 / img_width
output_heatmap = output_heatmap / 2
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def decode_images_to_heatmaps_minmax(self, images, resize=False):
"""
Args:
images: (bs, 256, 192, 3)
resize: whether to resize to (64, 48)
Returns:
heatmaps: (bs, 17, h, w)
"""
assert images.shape[-1] == 3
batch_size, image_height, image_width, _ = images.shape
images = images.float()
# classify each pixel using GB
GB = images[..., 1:].view(batch_size, 1, image_height, image_width, 2) # (bs, 1, 256, 192, 2)
        # `colors` is the class attribute defined on TopDownCustom above
        colors = self.colors
num_classes = colors.shape[0]
colors = colors.view(1, -1, 1, 1, 2)
dist = torch.abs(GB - colors).sum(-1) # (bs, 18, 256, 192)
dist, indices = torch.min(dist, dim=1) # (bs, 256, 192)
keypoint_mask_list = []
for idx in range(num_classes):
mask = indices == idx # (bs, 256, 192)
keypoint_mask_list.append(mask)
R = images[..., 0] # (bs, 256, 192)
heatmap_list = []
        for idx in range(num_classes):
            # index 17 is the appended (0, 0) background color; it has no
            # keypoint heatmap
            if idx == 17:
                continue
mask = keypoint_mask_list[idx]
heatmap = mask * R
heatmap_list.append(heatmap.unsqueeze(1))
heatmaps = torch.cat(heatmap_list, dim=1)
if resize:
raise NotImplementedError
return heatmaps.cpu().numpy() / 255.
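
# A minimal, self-contained sketch of the decoding convention implemented by
# `decode_images_to_heatmaps_minmax` above: the R channel carries heatmap
# intensity while the nearest (G, B) color decides which keypoint a pixel
# belongs to. Illustrative only (random data, CPU); real inputs are rendered
# pose label images.
def _demo_decode_shapes():
    colors = torch.tensor(color_list, dtype=torch.float32)      # (18, 2), incl. background
    images = torch.randint(0, 256, (2, 256, 192, 3)).float()    # (bs, H, W, RGB)
    gb = images[..., 1:].view(2, 1, 256, 192, 2)
    dist = torch.abs(gb - colors.view(1, -1, 1, 1, 2)).sum(-1)  # (bs, 18, H, W)
    indices = dist.argmin(dim=1)                                # per-pixel class index
    heatmaps = torch.stack(
        [(indices == k).float() * images[..., 0] for k in range(17)], dim=1) / 255.
    assert heatmaps.shape == (2, 17, 256, 192)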
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import sys
import os
import warnings
import requests
import argparse
import torch
import torch.nn.functional as F
import numpy as np
import glob
import tqdm
import matplotlib.pyplot as plt
from PIL import Image
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
sys.path.append('.')
import models_painter
from util.ddp_utils import DatasetTest
from util import ddp_utils
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])
def get_args_parser():
parser = argparse.ArgumentParser('COCO Pose Estimation', add_help=False)
parser.add_argument('--ckpt_path', type=str, help='path to ckpt', default='')
    parser.add_argument('--model', type=str, help='model architecture name',
                        default='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1')
parser.add_argument('--prompt', type=str, help='prompt image in train set',
default='000000000165_box0')
parser.add_argument('--input_size', type=int, default=448)
    parser.add_argument('--flip_test', action='store_true', help='test on horizontally flipped inputs')
# distributed training parameters
parser.add_argument('--world_size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
return parser.parse_args()
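
# Example launch (a sketch; the script filename and checkpoint path are
# placeholders, and the script assumes a distributed launch even on one GPU):
#   python -m torch.distributed.launch --nproc_per_node=1 <this_script>.py \
#       --ckpt_path <path/to/checkpoint.pth> --prompt 000000000165_box0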
def prepare_model(chkpt_dir, arch, args=None):
# build model
model = getattr(models_painter, arch)()
model.to("cuda")
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
# load model
checkpoint = torch.load(chkpt_dir, map_location='cpu')
msg = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
print(msg)
return model
def run_one_image(img, tgt, size, model, out_path, device):
x = torch.tensor(img)
x = x.unsqueeze(dim=0)
x = torch.einsum('nhwc->nchw', x)
tgt = torch.tensor(tgt)
tgt = tgt.unsqueeze(dim=0)
tgt = torch.einsum('nhwc->nchw', tgt)
bool_masked_pos = torch.zeros(model.module.patch_embed.num_patches)
bool_masked_pos[model.module.patch_embed.num_patches//2:] = 1
bool_masked_pos = bool_masked_pos.unsqueeze(dim=0)
valid = torch.ones_like(tgt)
loss, y, mask = model(x.float().to(device), tgt.float().to(device), bool_masked_pos.to(device), valid.float().to(device))
y = model.module.unpatchify(y)
y = torch.einsum('nchw->nhwc', y).detach().cpu()
output = y[0, y.shape[1]//2:, :, :]
output = torch.clip((output * imagenet_std + imagenet_mean) * 255, 0, 255)
output = F.interpolate(output[None, ...].permute(0, 3, 1, 2), size=[size[1], size[0]], mode='nearest').permute(0, 2, 3, 1)[0]
output = output.int()
output = Image.fromarray(output.numpy().astype(np.uint8))
output.save(out_path)
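
# Masking convention (a sketch of the arithmetic, assuming the default 448x448
# input and the 16-pixel patches implied by the model name): the prompt/query
# pair forms an 896x448 canvas with (896/16) * (448/16) = 1568 patches, and
# setting bool_masked_pos to 1 for the last 784 patches masks the bottom half,
# i.e. the query's label image, which the model then reconstructs.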
if __name__ == '__main__':
dataset_dir = "datasets/"
args = get_args_parser()
args = ddp_utils.init_distributed_mode(args)
device = torch.device("cuda")
ckpt_path = args.ckpt_path
model = args.model
prompt = args.prompt
input_size = args.input_size
    path_splits = ckpt_path.split('/')
    ckpt_dir, ckpt_file = path_splits[-2], path_splits[-1]
    dst_dir = os.path.join('models_inference', ckpt_dir,
                           "coco_pose_inference_{}_{}".format(ckpt_file, os.path.basename(prompt).split(".")[0]))
if args.flip_test:
dst_dir = dst_dir + "_flip"
if ddp_utils.get_rank() == 0:
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
print("output_dir: {}".format(dst_dir))
model_painter = prepare_model(ckpt_path, model, args)
print('Model loaded.')
img_src_dir = dataset_dir + "coco_pose/data_pair/test_256x192"
if args.flip_test:
img_src_dir += "_flip"
dataset_val = DatasetTest(img_src_dir, input_size, ext_list=('*.png',))
sampler_val = DistributedSampler(dataset_val, shuffle=False)
data_loader_val = DataLoader(dataset_val, batch_size=1, sampler=sampler_val,
drop_last=False, collate_fn=ddp_utils.collate_fn, num_workers=2)
img2_path = dataset_dir + "coco_pose/data_pair/train_256x192_aug0/{}_image.png".format(prompt)
tgt2_path = dataset_dir + "coco_pose/data_pair/train_256x192_aug0/{}_label.png".format(prompt)
# load the shared prompt image pair
img2 = Image.open(img2_path).convert("RGB")
img2 = img2.resize((input_size, input_size))
img2 = np.array(img2) / 255.
tgt2 = Image.open(tgt2_path)
tgt2 = tgt2.resize((input_size, input_size))
tgt2 = np.array(tgt2) / 255.
model_painter.eval()
for data in tqdm.tqdm(data_loader_val):
""" Load an image """
assert len(data) == 1
img, img_path, size = data[0]
img_name = os.path.basename(img_path)
out_path = os.path.join(dst_dir, img_name.replace('.jpg', '.png'))
img = np.concatenate((img2, img), axis=0)
assert img.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
img = img - imagenet_mean
img = img / imagenet_std
tgt = tgt2 # tgt is not available
tgt = np.concatenate((tgt2, tgt), axis=0)
assert tgt.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
tgt = tgt - imagenet_mean
tgt = tgt / imagenet_std
# make random mask reproducible (comment out to make it change)
torch.manual_seed(2)
run_one_image(img, tgt, size, model_painter, out_path, device)
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/test.py \
$CONFIG \
$CHECKPOINT \
--launcher pytorch \
${@:4}
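# Example usage (config/checkpoint paths are placeholders):
#   bash <path/to/this_script>.sh configs/<config>.py <checkpoint>.pth 8 --eval mAP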
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/train.py \
$CONFIG \
--launcher pytorch ${@:3}
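# Example usage (config path is a placeholder):
#   bash <path/to/this_script>.sh configs/<config>.py 8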
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import sys
sys.path.insert(0, "./")
import tqdm
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
from mmpose.apis import multi_gpu_test
from apis.test import single_gpu_test
from mmpose.datasets import build_dataloader, build_dataset
from mmpose.models import build_posenet
from mmpose.utils import setup_multi_processes
try:
from mmcv.runner import wrap_fp16_model
except ImportError:
    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
                  'Please install mmcv>=1.1.4')
from mmpose.core import wrap_fp16_model
def parse_args():
parser = argparse.ArgumentParser(description='mmpose test model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('--out', help='output result file')
parser.add_argument(
'--work-dir', help='the dir to save evaluation results')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--eval',
default=None,
nargs='+',
help='evaluation metric, which depends on the dataset,'
' e.g., "mAP" for MSCOCO')
parser.add_argument(
'--gpu-collect',
action='store_true',
help='whether to use gpu to collect results')
parser.add_argument('--tmpdir', help='tmp dir for writing some results')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def merge_configs(cfg1, cfg2):
# Merge cfg2 into cfg1
# Overwrite cfg1 if repeated, ignore if value is None.
cfg1 = {} if cfg1 is None else cfg1.copy()
cfg2 = {} if cfg2 is None else cfg2
for k, v in cfg2.items():
if v:
cfg1[k] = v
return cfg1
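
# For example, merge_configs({'metric': 'PCK'}, {'metric': 'mAP', 'interval': None})
# returns {'metric': 'mAP'}: repeated keys are overwritten, None values are ignored.
# (The key names here are illustrative.)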
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.data.test.test_mode = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# build the dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))
# step 1: give default values and override (if exist) from cfg.data
loader_cfg = {
**dict(seed=cfg.get('seed'), drop_last=False, dist=distributed),
**({} if torch.__version__ != 'parrots' else dict(
prefetch_num=2,
pin_memory=False,
)),
**dict((k, cfg.data[k]) for k in [
'seed',
'prefetch_num',
'pin_memory',
'persistent_workers',
] if k in cfg.data)
}
# step2: cfg.data.test_dataloader has higher priority
test_loader_cfg = {
**loader_cfg,
**dict(shuffle=False, drop_last=False),
**dict(workers_per_gpu=cfg.data.get('workers_per_gpu', 1)),
**dict(samples_per_gpu=cfg.data.get('samples_per_gpu', 1)),
**cfg.data.get('test_dataloader', {})
}
data_loader = build_dataloader(dataset, **test_loader_cfg)
load_data_only = cfg.data.get('load_data_only', False)
if load_data_only:
for _ in tqdm.tqdm(data_loader):
pass
print("dataset enumerated, exit!")
sys.exit()
# build the model and load checkpoint
model = build_posenet(cfg.model)
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
# load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
pseudo_test = cfg.data.get('pseudo_test', False)
assert pseudo_test
# only support single gpu test
model = MMDataParallel(model, device_ids=[args.gpu_id])
outputs = single_gpu_test(model, data_loader, pseudo_test=True)
rank, _ = get_dist_info()
eval_config = cfg.get('evaluation', {})
eval_config = merge_configs(eval_config, dict(metric=args.eval))
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
results = dataset.evaluate(outputs, cfg.work_dir, **eval_config)
for k, v in sorted(results.items()):
print(f'{k}: {v}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist, set_random_seed
from mmcv.utils import get_git_hash
from mmpose import __version__
from mmpose.apis import init_random_seed
from apis.train import train_model
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet
from mmpose.utils import collect_env, get_root_logger, setup_multi_processes
def parse_args():
parser = argparse.ArgumentParser(description='Train a pose model')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument(
'--resume-from', help='the checkpoint file to resume from')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='(Deprecated, please use --gpu-id) number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--diff_seed',
action='store_true',
help='Whether or not set different seeds for different ranks')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--autoscale-lr',
action='store_true',
help='automatically scale lr with the number of gpus')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distribute training time.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line)
meta['env_info'] = env_info
# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config:\n{cfg.pretty_text}')
# set random seeds
seed = init_random_seed(args.seed)
seed = seed + dist.get_rank() if args.diff_seed else seed
logger.info(f'Set random seed to {seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(seed, deterministic=args.deterministic)
cfg.seed = seed
meta['seed'] = seed
# model = build_posenet(cfg.model)
model = None
datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val)
val_dataset.pipeline = cfg.data.train.pipeline
datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmpose version, config file content
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmpose_version=__version__ + get_git_hash(digits=7),
config=cfg.pretty_text,
)
train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import math
import numbers
import random
import warnings
from collections.abc import Sequence
from typing import List, Optional, Tuple
import torch
from torch import Tensor
import torchvision.transforms as transforms
try:
import accimage
except ImportError:
accimage = None
import torchvision.transforms.functional as F
from torchvision.transforms.functional import _interpolation_modes_from_int, InterpolationMode
from PIL import Image, ImageFilter, ImageOps
__all__ = [
"Compose",
"ToTensor",
"Normalize",
"RandomHorizontalFlip",
"RandomResizedCrop",
]
class Compose(transforms.Compose):
"""Composes several transforms together. This transform does not support torchscript.
Please, see the note below.
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
"""
def __init__(self, transforms):
super().__init__(transforms)
def __call__(self, img, tgt, interpolation1=None, interpolation2=None):
for t in self.transforms:
img, tgt = t(img, tgt, interpolation1=interpolation1, interpolation2=interpolation2)
return img, tgt
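
# A minimal usage sketch (names are illustrative): every pair transform below
# takes an (img, tgt) pair plus optional per-image interpolation hints, so a
# pipeline can be applied to both images with one call, e.g.
#   pair_tf = Compose([RandomResizedCrop(448), ToTensor(), Normalize(mean, std)])
#   img, tgt = pair_tf(img, tgt, interpolation1='bicubic', interpolation2='nearest')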
class ToTensor(transforms.ToTensor):
"""Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. This transform does not support torchscript.
Converts a PIL Image or numpy.ndarray (H x W x C) in the range
[0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]
if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
or if the numpy.ndarray has dtype = np.uint8
In the other cases, tensors are returned without scaling.
.. note::
Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
transforming target image masks. See the `references`_ for implementing the transforms for image masks.
.. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
"""
def __init__(self) -> None:
super().__init__()
def __call__(self, pic1, pic2, interpolation1=None, interpolation2=None):
"""
Args:
pic (PIL Image or numpy.ndarray): Image to be converted to tensor.
Returns:
Tensor: Converted image.
"""
return F.to_tensor(pic1), F.to_tensor(pic2)
class Normalize(transforms.Normalize):
"""Normalize a tensor image with mean and standard deviation.
This transform does not support PIL Image.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace(bool,optional): Bool to make this operation in-place.
"""
def __init__(self, mean, std, inplace=False):
super().__init__(mean, std, inplace)
def forward(self, tensor1: Tensor, tensor2: Tensor, interpolation1=None, interpolation2=None):
"""
Args:
tensor (Tensor): Tensor image to be normalized.
Returns:
Tensor: Normalized Tensor image.
"""
return F.normalize(tensor1, self.mean, self.std, self.inplace), F.normalize(tensor2, self.mean, self.std, self.inplace)
class RandomResizedCrop(transforms.RandomResizedCrop):
"""Crop a random portion of image and resize it to a given size.
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
A crop of the original image is made: the crop has a random area (H * W)
and a random aspect ratio. This crop is finally resized to the given
size. This is popularly used to train the Inception networks.
Args:
size (int or sequence): expected output size of the crop, for each edge. If size is an
int instead of sequence like (h, w), a square output size ``(size, size)`` is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
.. note::
In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop,
before resizing. The scale is defined with respect to the area of the original image.
ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before
resizing.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` and
``InterpolationMode.BICUBIC`` are supported.
For backward compatibility integer values (e.g. ``PIL.Image[.Resampling].NEAREST``) are still accepted,
but deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.
"""
def __init__(
self,
size,
scale=(0.08, 1.0),
ratio=(3.0 / 4.0, 4.0 / 3.0),
interpolation=InterpolationMode.BILINEAR,
):
super().__init__(size, scale=scale, ratio=ratio, interpolation=interpolation)
def forward(self, img, tgt, interpolation1=None, interpolation2=None):
"""
Args:
img (PIL Image or Tensor): Image to be cropped and resized.
Returns:
PIL Image or Tensor: Randomly cropped and resized image.
"""
i, j, h, w = self.get_params(img, self.scale, self.ratio)
if interpolation1 == 'nearest':
interpolation1 = InterpolationMode.NEAREST
else:
interpolation1 = InterpolationMode.BICUBIC
if interpolation2 == 'nearest':
interpolation2 = InterpolationMode.NEAREST
else:
interpolation2 = InterpolationMode.BICUBIC
return F.resized_crop(img, i, j, h, w, self.size, interpolation1), \
F.resized_crop(tgt, i, j, h, w, self.size, interpolation2)
class RandomHorizontalFlip(transforms.RandomHorizontalFlip):
"""Horizontally flip the given image randomly with a given probability.
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
def __init__(self, p=0.5):
super().__init__(p=p)
def forward(self, img, tgt, interpolation1=None, interpolation2=None):
"""
Args:
img (PIL Image or Tensor): Image to be flipped.
Returns:
PIL Image or Tensor: Randomly flipped image.
"""
if torch.rand(1) < self.p:
return F.hflip(img), F.hflip(tgt)
return img, tgt
class RandomApply(transforms.RandomApply):
"""Apply randomly a list of transformations with a given probability.
.. note::
In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of
transforms as shown below:
>>> transforms = transforms.RandomApply(torch.nn.ModuleList([
>>> transforms.ColorJitter(),
>>> ]), p=0.3)
>>> scripted_transforms = torch.jit.script(transforms)
Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
`lambda` functions or ``PIL.Image``.
Args:
transforms (sequence or torch.nn.Module): list of transformations
p (float): probability
"""
def __init__(self, transforms, p=0.5):
super().__init__(transforms, p=p)
def forward(self, img, tgt, interpolation1=None, interpolation2=None):
if self.p < torch.rand(1):
return img, tgt
for t in self.transforms:
img, tgt = t(img, tgt)
return img, tgt
class ColorJitter(transforms.ColorJitter):
"""Randomly change the brightness, contrast, saturation and hue of an image.
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
Args:
brightness (float or tuple of float (min, max)): How much to jitter brightness.
brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non negative numbers.
contrast (float or tuple of float (min, max)): How much to jitter contrast.
contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non negative numbers.
saturation (float or tuple of float (min, max)): How much to jitter saturation.
saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non negative numbers.
hue (float or tuple of float (min, max)): How much to jitter hue.
hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space;
thus it does not work if you normalize your image to an interval with negative values,
or use an interpolation that generates negative values before using this function.
"""
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)
def forward(self, img, tgt, interpolation1=None, interpolation2=None):
"""
Args:
img (PIL Image or Tensor): Input image.
Returns:
PIL Image or Tensor: Color jittered image.
"""
fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
self.brightness, self.contrast, self.saturation, self.hue
)
for fn_id in fn_idx:
if fn_id == 0 and brightness_factor is not None:
img = F.adjust_brightness(img, brightness_factor)
elif fn_id == 1 and contrast_factor is not None:
img = F.adjust_contrast(img, contrast_factor)
elif fn_id == 2 and saturation_factor is not None:
img = F.adjust_saturation(img, saturation_factor)
elif fn_id == 3 and hue_factor is not None:
img = F.adjust_hue(img, hue_factor)
return img, tgt
class RandomErasing(transforms.RandomErasing):
"""Randomly selects a rectangle region in a torch.Tensor image and erases its pixels.
This transform does not support PIL Image.
'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896
Args:
p: probability that the random erasing operation will be performed.
scale: range of proportion of erased area against input image.
ratio: range of aspect ratio of erased area.
value: erasing value. Default is 0. If a single int, it is used to
erase all pixels. If a tuple of length 3, it is used to erase
R, G, B channels respectively.
If a str of 'random', erasing each pixel with random values.
inplace: boolean to make this transform inplace. Default set to False.
Returns:
Erased Image.
Example:
>>> transform = transforms.Compose([
>>> transforms.RandomHorizontalFlip(),
>>> transforms.PILToTensor(),
>>> transforms.ConvertImageDtype(torch.float),
>>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
>>> transforms.RandomErasing(),
>>> ])
"""
def __init__(self, p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False):
super().__init__(p=p, scale=scale, ratio=ratio, value=value, inplace=inplace)
def forward(self, img, tgt, interpolation1=None, interpolation2=None):
"""
Args:
img (Tensor): Tensor image to be erased.
Returns:
img (Tensor): Erased Tensor image.
"""
if torch.rand(1) < self.p:
# cast self.value to script acceptable type
if isinstance(self.value, (int, float)):
value = [self.value]
elif isinstance(self.value, str):
value = None
elif isinstance(self.value, tuple):
value = list(self.value)
else:
value = self.value
if value is not None and not (len(value) in (1, img.shape[-3])):
raise ValueError(
"If value is a sequence, it should have either a single value or "
f"{img.shape[-3]} (number of input channels)"
)
x, y, h, w, v = self.get_params(img, scale=self.scale, ratio=self.ratio, value=value)
return F.erase(img, x, y, h, w, v, self.inplace), tgt
return img, tgt
class GaussianBlur(object):
"""Gaussian blur augmentation from SimCLR: https://arxiv.org/abs/2002.05709"""
    def __init__(self, sigma=(0.1, 2.0)):
self.sigma = sigma
def __call__(self, img, tgt, interpolation1=None, interpolation2=None):
sigma = random.uniform(self.sigma[0], self.sigma[1])
img = img.filter(ImageFilter.GaussianBlur(radius=sigma))
return img, tgt
def __repr__(self) -> str:
s = f"{self.__class__.__name__}( sigma={self.sigma})"
return s
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import os.path
import json
from typing import Any, Callable, List, Optional, Tuple
import random
from PIL import Image
import numpy as np
import torch
from torchvision.datasets.vision import VisionDataset, StandardTransform
class PairDataset(VisionDataset):
"""`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
Args:
root (string): Root directory where images are downloaded to.
annFile (string): Path to json annotation file.
transform (callable, optional): A function/transform that takes in an PIL image
and returns a transformed version. E.g, ``transforms.PILToTensor``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
transforms (callable, optional): A function/transform that takes input sample and its target as entry
and returns a transformed version.
"""
def __init__(
self,
root: str,
json_path_list: list,
transform: Optional[Callable] = None,
transform2: Optional[Callable] = None,
transform3: Optional[Callable] = None,
transform_seccrop: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
transforms: Optional[Callable] = None,
masked_position_generator: Optional[Callable] = None,
use_two_pairs: bool = True,
half_mask_ratio:float = 0.,
) -> None:
super().__init__(root, transforms, transform, target_transform)
self.pairs = []
self.weights = []
type_weight_list = [0.1, 0.2, 0.15, 0.25, 0.2, 0.15, 0.05, 0.05]
for idx, json_path in enumerate(json_path_list):
cur_pairs = json.load(open(json_path))
self.pairs.extend(cur_pairs)
cur_num = len(cur_pairs)
self.weights.extend([type_weight_list[idx] * 1./cur_num]*cur_num)
print(json_path, type_weight_list[idx])
self.use_two_pairs = use_two_pairs
if self.use_two_pairs:
self.pair_type_dict = {}
for idx, pair in enumerate(self.pairs):
if "type" in pair:
if pair["type"] not in self.pair_type_dict:
self.pair_type_dict[pair["type"]] = [idx]
else:
self.pair_type_dict[pair["type"]].append(idx)
for t in self.pair_type_dict:
print(t, len(self.pair_type_dict[t]))
self.transforms = PairStandardTransform(transform, target_transform) if transform is not None else None
self.transforms2 = PairStandardTransform(transform2, target_transform) if transform2 is not None else None
self.transforms3 = PairStandardTransform(transform3, target_transform) if transform3 is not None else None
self.transforms_seccrop = PairStandardTransform(transform_seccrop, target_transform) if transform_seccrop is not None else None
self.masked_position_generator = masked_position_generator
self.half_mask_ratio = half_mask_ratio
def _load_image(self, path: str) -> Image.Image:
while True:
try:
img = Image.open(os.path.join(self.root, path))
except OSError as e:
                print(f"Caught exception: {str(e)}. Re-trying...")
import time
time.sleep(1)
else:
break
# process for nyuv2 depth: scale to 0~255
if "sync_depth" in path:
# nyuv2's depth range is 0~10m
img = np.array(img) / 10000.
img = img * 255
img = Image.fromarray(img)
img = img.convert("RGB")
return img
def _combine_images(self, image, image2, interpolation='bicubic'):
        # stack the two images vertically: `image` on top, `image2` below
h, w = image.shape[1], image.shape[2]
dst = torch.cat([image, image2], dim=1)
return dst
def __getitem__(self, index: int) -> Tuple[Any, Any]:
pair = self.pairs[index]
image = self._load_image(pair['image_path'])
target = self._load_image(pair['target_path'])
# decide mode for interpolation
pair_type = pair['type']
if "depth" in pair_type or "pose" in pair_type:
interpolation1 = 'bicubic'
interpolation2 = 'bicubic'
elif "image2" in pair_type:
interpolation1 = 'bicubic'
interpolation2 = 'nearest'
elif "2image" in pair_type:
interpolation1 = 'nearest'
interpolation2 = 'bicubic'
else:
interpolation1 = 'bicubic'
interpolation2 = 'bicubic'
# no aug for instance segmentation
if "inst" in pair['type'] and self.transforms2 is not None:
cur_transforms = self.transforms2
elif "pose" in pair['type'] and self.transforms3 is not None:
cur_transforms = self.transforms3
else:
cur_transforms = self.transforms
image, target = cur_transforms(image, target, interpolation1, interpolation2)
if self.use_two_pairs:
pair_type = pair['type']
# sample the second pair belonging to the same type
pair2_index = random.choice(self.pair_type_dict[pair_type])
pair2 = self.pairs[pair2_index]
image2 = self._load_image(pair2['image_path'])
target2 = self._load_image(pair2['target_path'])
assert pair2['type'] == pair_type
image2, target2 = cur_transforms(image2, target2, interpolation1, interpolation2)
image = self._combine_images(image, image2, interpolation1)
target = self._combine_images(target, target2, interpolation2)
use_half_mask = torch.rand(1)[0] < self.half_mask_ratio
if (self.transforms_seccrop is None) or ("inst" in pair['type']) or ("pose" in pair['type']) or use_half_mask:
pass
else:
image, target = self.transforms_seccrop(image, target, interpolation1, interpolation2)
valid = torch.ones_like(target)
imagenet_mean=torch.tensor([0.485, 0.456, 0.406])
imagenet_std=torch.tensor([0.229, 0.224, 0.225])
if "nyuv2_image2depth" in pair_type:
thres = torch.ones(3) * (1e-3 * 0.1)
thres = (thres - imagenet_mean) / imagenet_std
valid[target < thres[:, None, None]] = 0
elif "ade20k_image2semantic" in pair_type:
thres = torch.ones(3) * (1e-5) # ignore black
thres = (thres - imagenet_mean) / imagenet_std
valid[target < thres[:, None, None]] = 0
elif "coco_image2panoptic_sem_seg" in pair_type:
thres = torch.ones(3) * (1e-5) # ignore black
thres = (thres - imagenet_mean) / imagenet_std
valid[target < thres[:, None, None]] = 0
elif "image2pose" in pair_type:
thres = torch.ones(3) * (1e-5) # ignore black
thres = (thres - imagenet_mean) / imagenet_std
valid[target > thres[:, None, None]] = 10.0
fg = target > thres[:, None, None]
if fg.sum() < 100*3:
valid = valid * 0.
elif "image2panoptic_inst" in pair_type:
thres = torch.ones(3) * (1e-5) # ignore black
thres = (thres - imagenet_mean) / imagenet_std
fg = target > thres[:, None, None]
if fg.sum() < 100*3:
valid = valid * 0.
if use_half_mask:
num_patches = self.masked_position_generator.num_patches
mask = np.zeros(self.masked_position_generator.get_shape(), dtype=np.int32)
mask[mask.shape[0]//2:, :] = 1
else:
mask = self.masked_position_generator()
return image, target, mask, valid
def __len__(self) -> int:
return len(self.pairs)
class PairStandardTransform(StandardTransform):
def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
super().__init__(transform=transform, target_transform=target_transform)
def __call__(self, input: Any, target: Any, interpolation1: Any, interpolation2: Any) -> Tuple[Any, Any]:
if self.transform is not None:
input, target = self.transform(input, target, interpolation1, interpolation2)
return input, target
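
# A minimal construction sketch (paths, transform and mask generator are
# placeholders; the real wiring lives in the training entry point):
#   dataset = PairDataset(root='datasets/', json_path_list=['<pairs>.json'],
#                         transform=<pair_transform>,
#                         masked_position_generator=<mask_generator>)
#   image, target, mask, valid = dataset[0]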
[{"supercategory": "person", "color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"supercategory": "vehicle", "color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, {"supercategory": "vehicle", "color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, {"supercategory": "vehicle", "color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, {"supercategory": "vehicle", "color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, {"supercategory": "vehicle", "color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, {"supercategory": "vehicle", "color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, {"supercategory": "vehicle", "color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, {"supercategory": "vehicle", "color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, {"supercategory": "outdoor", "color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, {"supercategory": "outdoor", "color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, {"supercategory": "outdoor", "color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, {"supercategory": "outdoor", "color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, {"supercategory": "outdoor", "color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, {"supercategory": "animal", "color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, {"supercategory": "animal", "color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, {"supercategory": "animal", "color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, {"supercategory": "animal", "color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"supercategory": "animal", "color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, {"supercategory": "animal", "color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, {"supercategory": "animal", "color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, {"supercategory": "animal", "color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, {"supercategory": "animal", "color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, {"supercategory": "animal", "color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, {"supercategory": "accessory", "color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, {"supercategory": "accessory", "color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, {"supercategory": "accessory", "color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, {"supercategory": "accessory", "color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, {"supercategory": "accessory", "color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, {"supercategory": "sports", "color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, {"supercategory": "sports", "color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, {"supercategory": "sports", "color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, {"supercategory": "sports", "color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, {"supercategory": "sports", "color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, {"supercategory": "sports", "color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, {"supercategory": "sports", "color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, {"supercategory": "sports", "color": [134, 134, 103], "isthing": 1, "id": 41, "name": 
"skateboard"}, {"supercategory": "sports", "color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, {"supercategory": "sports", "color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, {"supercategory": "kitchen", "color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, {"supercategory": "kitchen", "color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, {"supercategory": "kitchen", "color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, {"supercategory": "kitchen", "color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, {"supercategory": "kitchen", "color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, {"supercategory": "kitchen", "color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, {"supercategory": "kitchen", "color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, {"supercategory": "food", "color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, {"supercategory": "food", "color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, {"supercategory": "food", "color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, {"supercategory": "food", "color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, {"supercategory": "food", "color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, {"supercategory": "food", "color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, {"supercategory": "food", "color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, {"supercategory": "food", "color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, {"supercategory": "food", "color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, {"supercategory": "food", "color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, {"supercategory": "furniture", "color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, {"supercategory": "furniture", "color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, {"supercategory": "furniture", "color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, {"supercategory": "furniture", "color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, {"supercategory": "furniture", "color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, {"supercategory": "furniture", "color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, {"supercategory": "electronic", "color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, {"supercategory": "electronic", "color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, {"supercategory": "electronic", "color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, {"supercategory": "electronic", "color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, {"supercategory": "electronic", "color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, {"supercategory": "electronic", "color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, {"supercategory": "appliance", "color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, {"supercategory": "appliance", "color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, {"supercategory": "appliance", "color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, {"supercategory": "appliance", "color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, {"supercategory": "appliance", "color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, {"supercategory": "indoor", "color": 
[142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, {"supercategory": "indoor", "color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, {"supercategory": "indoor", "color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, {"supercategory": "indoor", "color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, {"supercategory": "indoor", "color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, {"supercategory": "indoor", "color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, {"supercategory": "indoor", "color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, {"supercategory": "textile", "color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, {"supercategory": "textile", "color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, {"supercategory": "building", "color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, {"supercategory": "raw-material", "color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, {"supercategory": "furniture-stuff", "color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, {"supercategory": "textile", "color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, {"supercategory": "furniture-stuff", "color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, {"supercategory": "floor", "color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, {"supercategory": "plant", "color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, {"supercategory": "food-stuff", "color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, {"supercategory": "ground", "color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, {"supercategory": "building", "color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, {"supercategory": "furniture-stuff", "color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, {"supercategory": "furniture-stuff", "color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, {"supercategory": "structural", "color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, {"supercategory": "textile", "color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, {"supercategory": "ground", "color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, {"supercategory": "ground", "color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, {"supercategory": "ground", "color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, {"supercategory": "water", "color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, {"supercategory": "ground", "color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, {"supercategory": "building", "color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, {"supercategory": "ground", "color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, {"supercategory": "water", "color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, {"supercategory": "furniture-stuff", "color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, {"supercategory": "ground", "color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, {"supercategory": "furniture-stuff", "color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, {"supercategory": "building", "color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, {"supercategory": "textile", "color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, {"supercategory": 
"wall", "color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, {"supercategory": "wall", "color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, {"supercategory": "wall", "color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, {"supercategory": "wall", "color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, {"supercategory": "water", "color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, {"supercategory": "window", "color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, {"supercategory": "window", "color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, {"supercategory": "plant", "color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, {"supercategory": "structural", "color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, {"supercategory": "ceiling", "color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, {"supercategory": "sky", "color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, {"supercategory": "furniture-stuff", "color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, {"supercategory": "furniture-stuff", "color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, {"supercategory": "floor", "color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, {"supercategory": "ground", "color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, {"supercategory": "solid", "color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, {"supercategory": "plant", "color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, {"supercategory": "ground", "color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, {"supercategory": "raw-material", "color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, {"supercategory": "food-stuff", "color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, {"supercategory": "building", "color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, {"supercategory": "solid", "color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, {"supercategory": "wall", "color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, {"supercategory": "textile", "color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"}]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
import os
from pathlib import Path
import tqdm
from PIL import Image
def convert(input, output):
img = np.asarray(Image.open(input))
assert img.dtype == np.uint8
img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
Image.fromarray(img).save(output)
if __name__ == "__main__":
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
for name in ["training", "validation"]:
annotation_dir = dataset_dir / "annotations" / name
output_dir = dataset_dir / "annotations_detectron2" / name
output_dir.mkdir(parents=True, exist_ok=True)
for file in tqdm.tqdm(list(annotation_dir.iterdir())):
output_file = output_dir / file.name
convert(file, output_file)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import functools
import json
import multiprocessing as mp
import numpy as np
import os
import time
from fvcore.common.download import download
from panopticapi.utils import rgb2id
from PIL import Image
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
panoptic = rgb2id(panoptic)
output = np.zeros_like(panoptic, dtype=np.uint8) + 255
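    # pixels that are not covered by any segment below keep the ignore label 255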
for seg in segments:
cat_id = seg["category_id"]
new_cat_id = id_map[cat_id]
output[panoptic == seg["id"]] = new_cat_id
Image.fromarray(output).save(output_semantic)
def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
"""
    Create semantic segmentation annotations from panoptic segmentation
    annotations.
    Unlike the original PanopticFPN script, this version maps every category
    (thing and stuff alike) to a contiguous id starting from 0, and maps all
    unlabeled pixels to class 255.
Args:
panoptic_json (str): path to the panoptic json file, in COCO's format.
panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
sem_seg_root (str): a directory to output semantic annotation files
categories (list[dict]): category metadata. Each dict needs to have:
"id": corresponds to the "category_id" in the json annotations
"isthing": 0 or 1
"""
os.makedirs(sem_seg_root, exist_ok=True)
id_map = {} # map from category id to id in the output semantic annotation
assert len(categories) <= 254
for i, k in enumerate(categories):
id_map[k["id"]] = i
# what is id = 0?
# id_map[0] = 255
print(id_map)
with open(panoptic_json) as f:
obj = json.load(f)
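    # spawn a worker pool: half of the available cores, but never fewer than 4 processes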
pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
def iter_annotations():
for anno in obj["annotations"]:
file_name = anno["file_name"]
segments = anno["segments_info"]
input = os.path.join(panoptic_root, file_name)
output = os.path.join(sem_seg_root, file_name)
yield input, output, segments
print("Start writing to {} ...".format(sem_seg_root))
start = time.time()
pool.starmap(
functools.partial(_process_panoptic_to_semantic, id_map=id_map),
iter_annotations(),
chunksize=100,
)
print("Finished. time: {:.2f}s".format(time.time() - start))
if __name__ == "__main__":
dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
# for s in ["val2017", "train2017"]:
for s in ["val2017"]:
separate_coco_semantic_from_panoptic(
os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
os.path.join(dataset_dir, "panoptic_{}".format(s)),
os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)),
COCO_CATEGORIES,
)
# Copyright (c) Facebook, Inc. and its affiliates.
import json
import os
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_sem_seg
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
from detectron2.utils.file_io import PathManager
_PREDEFINED_SPLITS_COCO_PANOPTIC = {
"coco_2017_train_panoptic": (
# This is the original panoptic annotation directory
"coco/panoptic_train2017",
"coco/annotations/panoptic_train2017.json",
# This directory contains semantic annotations that are
# converted from panoptic annotations.
# It is used by PanopticFPN.
# You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
# to create these directories.
"coco/panoptic_semseg_train2017",
),
"coco_2017_val_panoptic": (
"coco/panoptic_val2017",
"coco/annotations/panoptic_val2017.json",
"coco/panoptic_semseg_val2017",
),
}
def get_metadata():
meta = {}
# The following metadata maps contiguous id from [0, #thing categories +
    # #stuff categories) to their names and colors. We keep two replicas of the
    # same names and colors under "thing_*" and "stuff_*" because the current
    # visualization function in D2 handles thing and stuff classes differently
# due to some heuristic used in Panoptic FPN. We keep the same naming to
# enable reusing existing visualization functions.
thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
stuff_classes = [k["name"] for k in COCO_CATEGORIES]
stuff_colors = [k["color"] for k in COCO_CATEGORIES]
meta["thing_classes"] = thing_classes
meta["thing_colors"] = thing_colors
meta["stuff_classes"] = stuff_classes
meta["stuff_colors"] = stuff_colors
# Convert category id for training:
# category id: like semantic segmentation, it is the class id for each
# pixel. Since there are some classes not used in evaluation, the category
    # id is not always contiguous and thus we have two sets of category ids:
# - original category id: category id in the original dataset, mainly
# used for evaluation.
# - contiguous category id: [0, #classes), in order to train the linear
# softmax classifier.
thing_dataset_id_to_contiguous_id = {}
stuff_dataset_id_to_contiguous_id = {}
for i, cat in enumerate(COCO_CATEGORIES):
if cat["isthing"]:
thing_dataset_id_to_contiguous_id[cat["id"]] = i
# else:
# stuff_dataset_id_to_contiguous_id[cat["id"]] = i
# in order to use sem_seg evaluator
stuff_dataset_id_to_contiguous_id[cat["id"]] = i
meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
return meta
def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
"""
Args:
image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
Returns:
list[dict]: a list of dicts in Detectron2 standard format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
"""
def _convert_category_id(segment_info, meta):
if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
segment_info["isthing"] = True
else:
segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
segment_info["category_id"]
]
segment_info["isthing"] = False
return segment_info
with PathManager.open(json_file) as f:
json_info = json.load(f)
ret = []
for ann in json_info["annotations"]:
image_id = int(ann["image_id"])
        # TODO: currently we assume image and label have the same filename but
# different extension, and images have extension ".jpg" for COCO. Need
# to make image extension a user-provided argument if we extend this
# function to support other COCO-like datasets.
image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
label_file = os.path.join(gt_dir, ann["file_name"])
sem_label_file = os.path.join(semseg_dir, ann["file_name"])
segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
ret.append(
{
"file_name": image_file,
"image_id": image_id,
"pan_seg_file_name": label_file,
"sem_seg_file_name": sem_label_file,
"segments_info": segments_info,
}
)
assert len(ret), f"No images found in {image_dir}!"
assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
return ret
def register_coco_panoptic_annos_sem_seg(
name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
):
panoptic_name = name
delattr(MetadataCatalog.get(panoptic_name), "thing_classes")
delattr(MetadataCatalog.get(panoptic_name), "thing_colors")
MetadataCatalog.get(panoptic_name).set(
thing_classes=metadata["thing_classes"],
thing_colors=metadata["thing_colors"],
# thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"],
)
# the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
semantic_name = name + "_with_sem_seg"
DatasetCatalog.register(
semantic_name,
lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata),
)
MetadataCatalog.get(semantic_name).set(
sem_seg_root=sem_seg_root,
panoptic_root=panoptic_root,
image_root=image_root,
panoptic_json=panoptic_json,
json_file=instances_json,
evaluator_type="coco_panoptic_seg",
ignore_label=255,
label_divisor=1000,
**metadata,
)
def register_all_coco_panoptic_annos_sem_seg(root):
for (
prefix,
(panoptic_root, panoptic_json, semantic_root),
) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
prefix_instances = prefix[: -len("_panoptic")]
instances_meta = MetadataCatalog.get(prefix_instances)
image_root, instances_json = instances_meta.image_root, instances_meta.json_file
register_coco_panoptic_annos_sem_seg(
prefix,
get_metadata(),
image_root,
os.path.join(root, panoptic_root),
os.path.join(root, panoptic_json),
os.path.join(root, semantic_root),
instances_json,
)
_root = os.getenv("DETECTRON2_DATASETS", "datasets")
register_all_coco_panoptic_annos_sem_seg(_root)
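# Usage sketch (illustrative): once this module has been imported, the registered
# splits can be fetched through the detectron2 catalogs, e.g.
#   dataset_dicts = DatasetCatalog.get("coco_2017_val_panoptic_with_sem_seg")
#   metadata = MetadataCatalog.get("coco_2017_val_panoptic_with_sem_seg")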
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
from typing import Iterator, List, Optional, Union
from collections import Counter
import logging
from operator import itemgetter
from random import choices, sample
import numpy as np
import torch
from torch.utils.data import Dataset, Sampler
from torch.utils.data import DistributedSampler
class DatasetFromSampler(Dataset):
"""Dataset to create indexes from `Sampler`.
Args:
sampler: PyTorch sampler
"""
def __init__(self, sampler: Sampler):
"""Initialisation for DatasetFromSampler."""
self.sampler = sampler
self.sampler_list = None
def __getitem__(self, index: int):
"""Gets element of the dataset.
Args:
index: index of the element in the dataset
Returns:
Single element by index
"""
if self.sampler_list is None:
self.sampler_list = list(self.sampler)
return self.sampler_list[index]
def __len__(self) -> int:
"""
Returns:
int: length of the dataset
"""
return len(self.sampler)
class DistributedSamplerWrapper(DistributedSampler):
"""
Wrapper over `Sampler` for distributed training.
Allows you to use any sampler in distributed mode.
It is especially useful in conjunction with
`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSamplerWrapper instance as a DataLoader
sampler, and load a subset of subsampled data of the original dataset
that is exclusive to it.
.. note::
Sampler is assumed to be of constant size.
"""
def __init__(
self,
sampler,
num_replicas: Optional[int] = None,
rank: Optional[int] = None,
shuffle: bool = True,
):
"""
Args:
sampler: Sampler used for subsampling
num_replicas (int, optional): Number of processes participating in
distributed training
rank (int, optional): Rank of the current process
within ``num_replicas``
shuffle (bool, optional): If true (default),
sampler will shuffle the indices
"""
super(DistributedSamplerWrapper, self).__init__(
DatasetFromSampler(sampler),
num_replicas=num_replicas,
rank=rank,
shuffle=shuffle,
)
self.sampler = sampler
def __iter__(self):
"""@TODO: Docs. Contribution is welcome."""
self.dataset = DatasetFromSampler(self.sampler)
indexes_of_indexes = super().__iter__()
subsampler_indexes = self.dataset
return iter(itemgetter(*indexes_of_indexes)(subsampler_indexes))
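# Usage sketch (illustrative; `weights`, `world_size` and `rank` are placeholders):
#   base_sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=len(dataset))
#   sampler = DistributedSamplerWrapper(base_sampler, num_replicas=world_size, rank=rank)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=8, sampler=sampler)
#   # call sampler.set_epoch(epoch) before each epoch so shuffling differs across epochs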
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import os
import glob
import json
import tqdm
import argparse
def get_args_parser():
parser = argparse.ArgumentParser('SIDD denoising preparation', add_help=False)
parser.add_argument('--split', type=str, help='dataset split',
choices=['train', 'val'], required=True)
parser.add_argument('--output_dir', type=str, help='path to output dir',
default='datasets/denoise')
return parser.parse_args()
if __name__ == "__main__":
args = get_args_parser()
image_dir = "datasets/denoise/{}/input".format(args.split)
save_path = os.path.join(args.output_dir, "denoise_ssid_{}.json".format(args.split))
print(save_path)
output_dict = []
image_path_list = glob.glob(os.path.join(image_dir, '*.png'))
for image_path in tqdm.tqdm(image_path_list):
# image_name = os.path.basename(image_path)
target_path = image_path.replace('input', 'groundtruth')
assert os.path.isfile(image_path)
assert os.path.isfile(target_path)
pair_dict = {}
pair_dict["image_path"] = image_path.replace('datasets/', '')
pair_dict["target_path"] = target_path.replace('datasets/', '')
pair_dict["type"] = "ssid_image2denoise"
output_dict.append(pair_dict)
json.dump(output_dict, open(save_path, 'w'))
# modified from uformer https://github.com/ZhendongWang6/Uformer
from glob import glob
from tqdm import tqdm
import numpy as np
import os
from natsort import natsorted
import cv2
from joblib import Parallel, delayed
import multiprocessing
import argparse
parser = argparse.ArgumentParser(description='Generate patches from Full Resolution images')
parser.add_argument('--src_dir', default='datasets/denoising/SIDD_Medium_Srgb/Data', type=str, help='Directory for full resolution images')
parser.add_argument('--tar_dir', default='datasets/denoising/sidd/train', type=str, help='Directory for image patches')
parser.add_argument('--ps', default=256, type=int, help='Image Patch Size')
parser.add_argument('--num_patches', default=300, type=int, help='Number of patches per image')
parser.add_argument('--num_cores', default=10, type=int, help='Number of CPU Cores')
args = parser.parse_args()
src = args.src_dir
tar = args.tar_dir
PS = args.ps
NUM_PATCHES = args.num_patches
NUM_CORES = args.num_cores
noisy_patchDir = os.path.join(tar, 'input')
clean_patchDir = os.path.join(tar, 'groundtruth')
if os.path.exists(tar):
os.system("rm -r {}".format(tar))
os.makedirs(noisy_patchDir)
os.makedirs(clean_patchDir)
#get sorted folders
files = natsorted(glob(os.path.join(src, '*', '*.PNG')))
noisy_files, clean_files = [], []
for file_ in files:
filename = os.path.split(file_)[-1]
if 'GT' in filename:
clean_files.append(file_)
if 'NOISY' in filename:
noisy_files.append(file_)
def save_files(i):
noisy_file, clean_file = noisy_files[i], clean_files[i]
noisy_img = cv2.imread(noisy_file)
clean_img = cv2.imread(clean_file)
H = noisy_img.shape[0]
W = noisy_img.shape[1]
for j in range(NUM_PATCHES):
rr = np.random.randint(0, H - PS)
cc = np.random.randint(0, W - PS)
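        # (rr, cc) is a random top-left corner so that the PS x PS crop stays inside the image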
noisy_patch = noisy_img[rr:rr + PS, cc:cc + PS, :]
clean_patch = clean_img[rr:rr + PS, cc:cc + PS, :]
cv2.imwrite(os.path.join(noisy_patchDir, '{}_{}.png'.format(i+1,j+1)), noisy_patch)
cv2.imwrite(os.path.join(clean_patchDir, '{}_{}.png'.format(i+1,j+1)), clean_patch)
Parallel(n_jobs=NUM_CORES)(delayed(save_files)(i) for i in tqdm(range(len(noisy_files))))
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
RUN source /opt/dtk/env.sh
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
# Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
This Code of Conduct also applies outside the project spaces when there is a
reasonable belief that an individual's behavior may have a negative impact on
the project or its community.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <opensource-conduct@fb.com>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
# Contributing to Painter
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## License
By contributing to Painter, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
# Prepare datasets for Painter
The training of our model uses [COCO](https://cocodataset.org/), [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/), [NYUDepthV2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html), [Synthetic Rain Datasets](https://paperswithcode.com/dataset/synthetic-rain-datasets), [SIDD](https://www.eecs.yorku.ca/~kamel/sidd/), and [LoL](https://daooshee.github.io/BMVC2018website/) datasets.
After processing, the datasets should look like:
```
$Painter_ROOT/datasets/
nyu_depth_v2/
sync/
official_splits/
nyu_depth_v2_labeled.mat
datasets/nyu_depth_v2/
nyuv2_sync_image_depth.json # generated
nyuv2_test_image_depth.json # generated
ade20k/
images/
annotations/
annotations_detectron2/ # generated
annotations_with_color/ # generated
ade20k_training_image_semantic.json # generated
ade20k_validation_image_semantic.json # generated
    ADEChallengeData2016/ # sym-link to $Painter_ROOT/datasets/ade20k
coco/
train2017/
val2017/
annotations/
instances_train2017.json
instances_val2017.json
person_keypoints_val2017.json
panoptic_train2017.json
panoptic_val2017.json
panoptic_train2017/
panoptic_val2017/
panoptic_semseg_val2017/ # generated
        panoptic_val2017/ # sym-link to $Painter_ROOT/datasets/coco/annotations/panoptic_val2017
pano_sem_seg/ # generated
panoptic_segm_train2017_with_color
panoptic_segm_val2017_with_color
coco_train2017_image_panoptic_sem_seg.json
coco_val2017_image_panoptic_sem_seg.json
pano_ca_inst/ # generated
train_aug0/
train_aug1/
...
train_aug29/
train_org/
train_flip/
val_org/
coco_train_image_panoptic_inst.json
coco_val_image_panoptic_inst.json
coco_pose/
person_detection_results/
COCO_val2017_detections_AP_H_56_person.json
data_pair/ # generated
train_256x192_aug0/
train_256x192_aug1/
...
train_256x192_aug19/
val_256x192/
test_256x192/
test_256x192_flip/
coco_pose_256x192_train.json # generated
coco_pose_256x192_val.json # generated
derain/
train/
input/
target/
test/
Rain100H/
Rain100L/
Test100/
Test1200/
Test2800/
derain_train.json
derain_test_rain100h.json
denoise/
SIDD_Medium_Srgb/
train/
val/
denoise_ssid_train.json # generated
denoise_ssid_val.json # generated
light_enhance/
our485/
low/
high/
eval15/
low/
high/
enhance_lol_train.json # generated
enhance_lol_val.json # generated
```
Please follow the instructions below to pre-process each dataset.
## NYU Depth V2
First, download the dataset from [here](https://drive.google.com/file/d/1AysroWpfISmm-yRFGBgFTrLy6FjQwvwP/view?usp=sharing). Please make sure to place the downloaded file at `$Painter_ROOT/datasets/nyu_depth_v2/sync.zip`.
Next, prepare [NYU Depth V2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html) test set.
```bash
# get official NYU Depth V2 split file
wget -P datasets/nyu_depth_v2/ http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat
# convert mat file to image files
python data/depth/extract_official_train_test_set_from_mat.py datasets/nyu_depth_v2/nyu_depth_v2_labeled.mat data/depth/splits.mat datasets/nyu_depth_v2/official_splits/
```
Lastly, prepare json files for training and evaluation. The generated json files will be saved at `$Painter_ROOT/datasets/nyu_depth_v2/`.
```bash
python data/depth/gen_json_nyuv2_depth.py --split sync
python data/depth/gen_json_nyuv2_depth.py --split test
```
## ADE20k Semantic Segmentation
First, download the dataset from the [official website](https://groups.csail.mit.edu/vision/datasets/ADE20K/), and put it in `$Painter_ROOT/datasets/`. Afterward, unzip the file and rename the extracted folder to `ade20k`. The ADE20k folder should look like:
```
ade20k/
images/
annotations/
```
Second, prepare annotations for training using the following command. The generated annotations will be saved at `$Painter_ROOT/datasets/ade20k/annotations_with_color/`.
```bash
python data/ade20k/gen_color_ade20k_sem.py --split training
python data/ade20k/gen_color_ade20k_sem.py --split validation
```
Third, prepare json files for training and evaluation. The generated json files will be saved at `$Painter_ROOT/datasets/ade20k/`.
```bash
python data/ade20k/gen_json_ade20k_sem.py --split training
python data/ade20k/gen_json_ade20k_sem.py --split validation
```
Lastly, to enable evaluation with detectron2, link `$Painter_ROOT/datasets/ade20k` to `$Painter_ROOT/datasets/ADEChallengeData2016` and run:
```bash
# ln -s $Painter_ROOT/datasets/ade20k datasets/ADEChallengeData2016
python data/prepare_ade20k_sem_seg.py
```
## COCO Panoptic Segmentation
Download the COCO2017 dataset and the corresponding panoptic segmentation annotation. The COCO folder should look like:
```
coco/
train2017/
val2017/
annotations/
instances_train2017.json
instances_val2017.json
panoptic_train2017.json
panoptic_val2017.json
panoptic_train2017/
panoptic_val2017/
```
### Prepare Data for COCO Semantic Segmentation
Prepare annotations for training using the following command. The generated annotations will be saved at `$Painter_ROOT/datasets/coco/pano_sem_seg/`.
```bash
python data/coco_semseg/gen_color_coco_panoptic_segm.py --split train2017
python data/coco_semseg/gen_color_coco_panoptic_segm.py --split val2017
```
Prepare json files for training and evaluation. The generated json files will be saved at `$Painter_ROOT/datasets/coco/pano_sem_seg/`.
```bash
python data/coco_semseg/gen_json_coco_panoptic_segm.py --split train2017
python data/coco_semseg/gen_json_coco_panoptic_segm.py --split val2017
```
### Prepare Data for COCO Class-Agnostic Instance Segmentation
First, pre-process the dataset using the following commands; the painted ground truth will be saved to `$Painter_ROOT/datasets/coco/pano_ca_inst`.
```bash
cd $Painter_ROOT/data/mmdet_custom
# generate training data with common data augmentation for instance segmentation,
# note we generate 30 copies by alternating train_aug{idx} in configs/coco_panoptic_ca_inst_gen_aug.py
./tools/dist_train.sh configs/coco_panoptic_ca_inst_gen_aug.py 1
# generate training data with only horizontal flip augmentation
./tools/dist_train.sh configs/coco_panoptic_ca_inst_gen_orgflip.py 1
# generate training data w/o data augmentation
./tools/dist_train.sh configs/coco_panoptic_ca_inst_gen_org.py 1
# generate validation data (w/o data augmentation)
./tools/dist_test.sh configs/coco_panoptic_ca_inst_gen_org.py none 1 --eval segm
```
Next, prepare json files for training and evaluation. The generated json files will be saved at `$Painter_ROOT/datasets/coco/pano_ca_inst`.
```bash
cd $Painter_ROOT
python data/mmdet_custom/gen_json_coco_panoptic_inst.py --split train
python data/mmdet_custom/gen_json_coco_panoptic_inst.py --split val
```
Lastly, to enable evaluation with detectron2, link `$Painter_ROOT/datasets/coco/annotations/panoptic_val2017` to `$Painter_ROOT/datasets/coco/panoptic_val2017` and run:
```bash
# ln -s $Painter_ROOT/datasets/coco/annotations/panoptic_val2017 datasets/coco/panoptic_val2017
python data/prepare_coco_semantic_annos_from_panoptic_annos.py
```
## COCO Human Pose Estimation
First, download the person detection results for COCO val2017 from [google drive](https://drive.google.com/drive/folders/1fRUDNUDxe9fjqcRZ2bnF_TKMlO0nB_dk), and put them in `$Painter_ROOT/datasets/coco_pose/`.
Next, pre-process the dataset using the following commands; the painted ground truth will be saved to `$Painter_ROOT/datasets/coco_pose/`.
```bash
cd $Painter_ROOT/data/mmpose_custom
# generate training data with common data augmentation for pose estimation, note we generate 20 copies for training
./tools/dist_train.sh configs/coco_256x192_gendata.py 1
# generate data for eval during training
./tools/dist_test.sh configs/coco_256x192_gendata.py none 1
# generate data for testing (using offline boxes)
./tools/dist_test.sh configs/coco_256x192_gendata_test.py none 1
# generate data for testing (using offline boxes & with flip)
./tools/dist_test.sh configs/coco_256x192_gendata_testflip.py none 1
```
Next, prepare json files for training and evaluation. The generated json files will be saved at `datasets/coco_pose/`.
```bash
cd $Painter_ROOT
python data/mmpose_custom/gen_json_coco_pose.py --split train
python data/mmpose_custom/gen_json_coco_pose.py --split val
```
## Low-level Vision Tasks
### Deraining
We follow [MPRNet](https://github.com/swz30/MPRNet) to prepare the data for deraining.
Download the dataset following the instructions in [MPRNet](https://github.com/swz30/MPRNet/blob/main/Deraining/Datasets/README.md), and put it in `$Painter_ROOT/datasets/derain/`. The folder should look like:
```
derain/
train/
input/
target/
test/
Rain100H/
Rain100L/
Test100/
Test1200/
Test2800/
```
Next, prepare json files for training and evaluation. The generated json files will be saved at `datasets/derain/`.
```bash
python data/derain/gen_json_rain.py --split train
python data/derain/gen_json_rain.py --split val
```
### Denoising
We follow [Uformer](https://github.com/ZhendongWang6/Uformer) to prepare the data for the SIDD denoising dataset.
For SIDD training data, you can download the SIDD-Medium dataset from the [official url](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php). For evaluation on SIDD, you can download the data from [here](https://mailustceducn-my.sharepoint.com/:f:/g/personal/zhendongwang_mail_ustc_edu_cn/Ev832uKaw2JJhwROKqiXGfMBttyFko_zrDVzfSbFFDoi4Q?e=S3p5hQ).
Next, generate image patches for training with the following command:
```bash
python data/sidd/generate_patches_SIDD.py --src_dir datasets/denoise/SIDD_Medium_Srgb/Data --tar_dir datasets/denoise/train
```
Lastly, prepare json files for training and evaluation. The generated json files will be saved at `datasets/denoise/`.
```bash
python data/sidd/gen_json_sidd.py --split train
python data/sidd/gen_json_sidd.py --split val
```
### Low-Light Image Enhancement
First, download the images of the LoL dataset from [google drive](https://drive.google.com/file/d/157bjO1_cFuSd0HWDUuAmcHRJDVyWpOxB/view) and put them in `$Painter_ROOT/datasets/light_enhance/`. The folder should look like:
```
light_enhance/
our485/
low/
high/
eval15/
low/
high/
```
Next, prepare json files for training and evaluation. The generated json files will be saved at `$Painter_ROOT/datasets/light_enhance/`.
```bash
python data/lol/gen_json_lol.py --split train
python data/lol/gen_json_lol.py --split val
```
# Evaluation Instructions for Painter
## NYU Depth V2
To evaluate Painter on NYU Depth V2, you may first update the `$JOB_NAME` in `$Painter_ROOT/eval/nyuv2_depth/eval.sh`, then run:
```bash
bash eval/nyuv2_depth/eval.sh
```
## ADE20k Semantic Segmentation
To evaluate Painter on ADE20k semantic segmentation, you may first update the `$JOB_NAME` in `$Painter_ROOT/eval/ade20k_semantic/eval.sh`, then run:
```bash
bash eval/ade20k_semantic/eval.sh
```
## COCO Panoptic Segmentation
To evaluate Painter on COCO panoptic segmentation, you may first update the `$JOB_NAME` in `$Painter_ROOT/eval/coco_panoptic/eval.sh`, then run:
```bash
bash eval/coco_panoptic/eval.sh
```
## COCO Human Pose Estimation
To evaluate Painter on COCO pose estimation, first generate the painted images:
```bash
python -m torch.distributed.launch --nproc_per_node=8 --master_port=29500 --use_env eval/mmpose_custom/painter_inference_pose.py --ckpt_path models/painter_vit_large/painter_vit_large.pth
python -m torch.distributed.launch --nproc_per_node=8 --master_port=29500 --use_env eval/mmpose_custom/painter_inference_pose.py --ckpt_path models/painter_vit_large/painter_vit_large.pth --flip_test
```
Then, you may update the `job_name` and `ckpt_file` in `$Painter_ROOT/eval/mmpose_custom/configs/coco_256x192_test_offline.py`, and run:
```bash
cd $Painter_ROOT/eval/mmpose_custom
./tools/dist_test.sh configs/coco_256x192_test_offline.py none 1 --eval mAP
```
## Low-level Vision Tasks
### Deraining
To evaluate Painter on deraining, first generate the derained images.
```bash
python eval/derain/painter_inference_derain.py --ckpt_path models/painter_vit_large/painter_vit_large.pth
```
Then, update the path to derained images and ground truth in `$Painter_ROOT/eval/derain/evaluate_PSNR_SSIM.m` and run the following script in MATLAB.
```bash
$Painter_ROOT/eval/derain/evaluate_PSNR_SSIM.m
```
### Denoising
To evaluate Painter on SIDD denoising, first generate the denoised images.
```bash
python eval/sidd/painter_inference_sidd.py --ckpt_path models/painter_vit_large/painter_vit_large.pth
```
Then, update the path to denoising output and ground truth in `$Painter_ROOT/eval/sidd/eval_sidd.m` and run the following script in MATLAB.
```bash
$Painter_ROOT/eval/sidd/eval_sidd.m
```
### Low-Light Image Enhancement
To evaluate Painter on LoL image enhancement:
```bash
python eval/lol/painter_inference_lol.py --ckpt_path models/painter_vit_large/painter_vit_large.pth
```