Commit 106580f9 authored by chenych

First commit
# modified from mask2former config
_base_ = [
'./_base_/dataset/coco_panoptic.py', './_base_/default_runtime.py'
]
num_things_classes = 80
num_stuff_classes = 53
num_classes = num_things_classes + num_stuff_classes
model = None
# dataset settings
image_size = (1024, 1024)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(
type='LoadPanopticAnnotations',
with_bbox=True,
with_mask=True,
with_seg=True),
dict(type='RandomFlip', flip_ratio=1.0),
# large scale jittering (ratio_range is fixed to 1.0 here, so no actual jitter)
dict(
type='Resize',
img_scale=image_size,
ratio_range=(1.0, 1.0),
multiscale_mode='range',
keep_ratio=False),
# dict(
# type='RandomCrop',
# crop_size=image_size,
# crop_type='absolute',
# recompute_bbox=True,
# allow_negative_crop=True),
# dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=image_size),
dict(
type='SaveDataPairCustom',
dir_name='train_orgflip',
target_path='/home/datasets/coco/pano_ca_inst',
), # custom; we don't care about the transforms applied afterwards
dict(type='DefaultFormatBundle', img_to_float=True),
dict(
type='Collect',
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
test_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(
type='LoadPanopticAnnotations',
with_bbox=True,
with_mask=True,
with_seg=True),
dict(type='RandomFlip', flip_ratio=0.0),
# large scale jittering
dict(
type='Resize',
img_scale=image_size,
ratio_range=(1.0, 1.0),
multiscale_mode='range',
keep_ratio=False),
dict(type='Pad', size=image_size),
dict(
type='SaveDataPairCustom',
dir_name='val_org',
target_path='/home/datasets/coco/pano_ca_inst',
), # custom; we don't care about the transforms applied afterwards
dict(type='Normalize', **img_norm_cfg),
dict(type='DefaultFormatBundle', img_to_float=True),
dict(
type='Collect',
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']),
]
data_root = '/home/datasets/coco/'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(pipeline=train_pipeline),
val=dict(
pipeline=test_pipeline,
ins_ann_file=data_root + 'annotations/instances_val2017.json',
),
test=dict(
pipeline=test_pipeline,
ins_ann_file=data_root + 'annotations/instances_val2017.json',
))
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
# optimizer
optimizer = dict(
type='AdamW',
lr=0.0001,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi,
},
norm_decay_mult=0.0))
optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2))
custom = dict(
load_data_only=True,
)
by_epoch = True
# learning policy
lr_config = dict(
policy='step',
gamma=0.1,
by_epoch=by_epoch,
step=[327778, 355092],
warmup='linear',
warmup_by_epoch=by_epoch,
warmup_ratio=1.0, # no warmup
warmup_iters=10)
max_iters = 368750
# runner = dict(type='IterBasedRunner', max_iters=max_iters)
runner = dict(type='EpochBasedRunner', max_epochs=1) # we prefer by epoch
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook', by_epoch=by_epoch),
dict(type='TensorboardLoggerHook', by_epoch=by_epoch)
])
interval = 5000
workflow = [('train', interval)]
checkpoint_config = dict(
by_epoch=by_epoch, interval=interval, save_last=True, max_keep_ckpts=3)
# Before iteration 365001, we evaluate every 5000 iterations.
# After iteration 365000, we evaluate every 368750 iterations,
# i.e., only once, at the end of training.
dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
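# With interval=5000 and max_iters=368750, the threshold above is
# 368750 // 5000 * 5000 + 1 = 365001, so the only remaining evaluation point
# after it is max_iters itself.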
evaluation = dict(
interval=interval,
dynamic_intervals=dynamic_intervals,
metric=['PQ', 'bbox', 'segm'])
# import newly registered module
custom_imports = dict(
imports=[
'data.coco_panoptic',
'data.pipelines.transforms',
],
allow_failed_imports=False)
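# Note: this config only dumps training pairs. The model is None,
# custom.load_data_only=True, and SaveDataPairCustom in the pipelines writes the
# image/label pairs to disk while the dataloader is merely enumerated (see the
# customised test.py / train.py below, which only enumerate the dataset in this mode).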
# Copyright (c) OpenMMLab. All rights reserved.
import itertools
import os
from collections import defaultdict
import mmcv
import numpy as np
from mmcv.utils import print_log
from terminaltables import AsciiTable
from mmdet.core import INSTANCE_OFFSET
from mmdet.datasets.api_wrappers import COCO, pq_compute_multi_core
from mmdet.datasets.builder import DATASETS
from mmdet.datasets.coco import CocoDataset
from mmdet.datasets.coco_panoptic import CocoPanopticDataset, COCOPanoptic
try:
import panopticapi
from panopticapi.evaluation import VOID
from panopticapi.utils import id2rgb
except ImportError:
panopticapi = None
id2rgb = None
VOID = None
__all__ = ['CocoPanopticDatasetCustom']
class COCOPanoptic(COCO):
"""This wrapper is for loading the panoptic style annotation file.
The format is shown in the CocoPanopticDataset class.
Args:
annotation_file (str): Path of annotation file.
"""
def __init__(self, annotation_file=None):
if panopticapi is None:
raise RuntimeError(
'panopticapi is not installed, please install it by: '
'pip install git+https://github.com/cocodataset/'
'panopticapi.git.')
super(COCOPanoptic, self).__init__(annotation_file)
def createIndex(self, use_ext=False):
assert use_ext is False
# create index
print('creating index...')
# anns stores 'segment_id -> annotation'
anns, cats, imgs = {}, {}, {}
img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann, img_info in zip(self.dataset['annotations'],
self.dataset['images']):
img_info['segm_file'] = ann['file_name']
for seg_ann in ann['segments_info']:
# to match with instance.json
seg_ann['image_id'] = ann['image_id']
seg_ann['height'] = img_info['height']
seg_ann['width'] = img_info['width']
img_to_anns[ann['image_id']].append(seg_ann)
# segment_id is not unique in the coco dataset
if seg_ann['id'] in anns.keys():
anns[seg_ann['id']].append(seg_ann)
else:
anns[seg_ann['id']] = [seg_ann]
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
for seg_ann in ann['segments_info']:
cat_to_imgs[seg_ann['category_id']].append(ann['image_id'])
print('index created!')
self.anns = anns
self.imgToAnns = img_to_anns
self.catToImgs = cat_to_imgs
self.imgs = imgs
self.cats = cats
def load_anns(self, ids=[]):
"""Load anns with the specified ids.
``self.anns`` maps each segment id to a list of annotations
rather than to a single annotation.
Args:
ids (int array): integer ids specifying anns
Returns:
anns (object array): loaded ann objects
"""
anns = []
if hasattr(ids, '__iter__') and hasattr(ids, '__len__'):
# self.anns is a list of annotation lists instead of
# a list of annotations
for id in ids:
anns += self.anns[id]
return anns
elif type(ids) == int:
return self.anns[ids]
@DATASETS.register_module()
class CocoPanopticDatasetCustom(CocoPanopticDataset):
"""Coco dataset for Panoptic segmentation.
The annotation format is shown as follows. The `ann` field is optional
for testing.
.. code-block:: none
[
{
'filename': f'{image_id:012}.png',
'image_id': 9,
'segments_info': [
{
'id': 8345037,  # segment_id in panoptic png, converted from rgb
'category_id': 51,
'iscrowd': 0,
'bbox': (x1, y1, w, h),
'area': 24315,
'segmentation': list,  # encoded mask
},
...
]
},
...
]
Args:
ann_file (str): Panoptic segmentation annotation file path.
pipeline (list[dict]): Processing pipeline.
ins_ann_file (str): Instance segmentation annotation file path.
Defaults to None.
classes (str | Sequence[str], optional): Specify classes to load.
If is None, ``cls.CLASSES`` will be used. Defaults to None.
data_root (str, optional): Data root for ``ann_file``,
``ins_ann_file`` ``img_prefix``, ``seg_prefix``, ``proposal_file``
if specified. Defaults to None.
img_prefix (str, optional): Prefix of path to images. Defaults to ''.
seg_prefix (str, optional): Prefix of path to segmentation files.
Defaults to None.
proposal_file (str, optional): Path to proposal file. Defaults to None.
test_mode (bool, optional): If set True, annotation will not be loaded.
Defaults to False.
filter_empty_gt (bool, optional): If set true, images without bounding
boxes of the dataset's classes will be filtered out. This option
only works when `test_mode=False`, i.e., we never filter images
during tests. Defaults to True.
file_client_args (:obj:`mmcv.ConfigDict` | dict): file client args.
Defaults to dict(backend='disk').
"""
def load_annotations(self, ann_file):
"""Load annotation from COCO Panoptic style annotation file.
Args:
ann_file (str): Path of annotation file.
Returns:
list[dict]: Annotation info from COCO api.
"""
self.coco = COCOPanoptic(ann_file)
self.cat_ids = self.coco.get_cat_ids()
self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
self.categories = self.coco.cats
self.img_ids = self.coco.get_img_ids()
data_infos = []
for i in self.img_ids:
info = self.coco.load_imgs([i])[0]
info['filename'] = info['file_name']
info['segm_file'] = info['filename'].replace('jpg', 'png')
data_infos.append(info)
return data_infos
def prepare_test_img(self, idx):
"""Get testing data after pipeline.
Args:
idx (int): Index of data.
Returns:
dict: Testing data after pipeline with new keys introduced by \
pipeline.
"""
img_info = self.data_infos[idx]
# results = dict(img_info=img_info)
ann_info = self.get_ann_info(idx)
results = dict(img_info=img_info, ann_info=ann_info)
if self.proposals is not None:
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import inspect
import math
import warnings
import os
from PIL import Image
import cv2
import mmcv
import numpy as np
from numpy import random
from mmdet.datasets.builder import PIPELINES
try:
from imagecorruptions import corrupt
except ImportError:
corrupt = None
try:
import albumentations
from albumentations import Compose
except ImportError:
albumentations = None
Compose = None
def define_colors_per_location_r_gb(num_location_r=16, num_location_gb=20):
sep_r = 255 // num_location_r
sep_gb = 256 // num_location_gb + 1 # +1 for bigger sep in gb
color_dict = {}
# R = G = B = 0
# B += separation_per_channel # offset for the first loop
for global_y in range(4):
for global_x in range(4):
global_locat = (global_x, global_y)
global_locat_sum = global_y * 4 + global_x
R = 255 - global_locat_sum * sep_r
for local_y in range(num_location_gb):
for local_x in range(num_location_gb):
local_locat = (local_x, local_y)
G = 255 - local_y * sep_gb
B = 255 - local_x * sep_gb
assert (R < 256) and (G < 256) and (B < 256)
assert (R >= 0) and (G >= 0) and (B >= 0)
assert (R, G, B) not in color_dict.values()
location = (global_locat, local_locat)
color_dict[location] = (R, G, B)
# colors = [v for k, v in color_dict.items()]
return color_dict
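# The palette above encodes a position on an 80x80 grid as a unique RGB color:
# R indexes one of the 4x4 = 16 global cells (step 255 // 16 = 15), while G and B
# index the 20x20 local cell inside it (step 256 // 20 + 1 = 13), so every
# (global, local) pair maps to a distinct color.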
def simplify_color_dict(color_dict, num_location_r=16, num_location_gb=20):
color_dict_simple = {}
for k, v in color_dict.items():
global_locat, local_locat = k
global_x, global_y = global_locat
local_x, local_y = local_locat
absolute_x = global_x * num_location_gb + local_x
absolute_y = global_y * num_location_gb + local_y
color_dict_simple[(absolute_x, absolute_y)] = np.array(v)
return color_dict_simple
@PIPELINES.register_module()
class SaveDataPairCustom:
"""Save PanoInst Masks
"""
def __init__(self,
dir_name,
target_path='../datasets/coco/pano_ca_inst',
method='mass_center',
num_location_r=16,
num_location_gb=20):
self.dir_name = dir_name
self.target_path = target_path
output_dir = os.path.join(self.target_path, self.dir_name)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.method = method
self.color_dict_global_local = define_colors_per_location_r_gb(
num_location_r=num_location_r, num_location_gb=num_location_gb)
self.color_dict = simplify_color_dict(
self.color_dict_global_local, num_location_r=num_location_r, num_location_gb=num_location_gb)
def __call__(self, results):
"""Call function to save images.
"""
# get keys of interest
img = results['img'] # (h, w, 3), ndarray, range 0-255
gt_bboxes = results['gt_bboxes'] # (num_inst, 4), ndarray, xyxy
gt_labels = results['gt_labels'] # (num_inst, )
gt_masks = results['gt_masks'].masks # BitmapMasks, gt_masks.masks: (num_inst, h, w)
# gt_semantic_seg = results['gt_semantic_seg']
# check input
assert (gt_labels >= 0).all() and (gt_labels < 80).all()
assert (np.sum(gt_masks, axis=0) >= 0).all() and (np.sum(gt_masks, axis=0) <= 1).all()
# get box centers
h, w, _ = img.shape
num_inst = len(gt_labels)
segmentation = np.zeros((h, w, 3), dtype="uint8")
for idx in range(num_inst):
# iscrowd instances are already filtered out and stored in results['ann_info']['bboxes_ignore'],
# but some iscrowd flags are not correctly labelled, e.g., 000000415447
# if (np.sum(gt_bboxes[idx] == results['ann_info']['bboxes_ignore'], axis=1) == 4).any():
# if len(results['ann_info']['bboxes_ignore']) > 0:
# import pdb; pdb.set_trace()
if self.method == "geo_center":
box = gt_bboxes[idx] # (4, )
center = (box[:2] + box[2:]) / 2 # (2, )
center_x, center_y = center
elif self.method == "mass_center":
mask = gt_masks[idx] # (h, w)
center_x, center_y = self.center_of_mass(mask)
else:
raise NotImplementedError(self.method)
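# normalize the center to the 80x80 color grid (indices in [0, 79]) used as keys
# of self.color_dict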
center_x_norm = int(center_x / w * 79)
center_y_norm = int(center_y / h * 79)
color = self.color_dict[(center_x_norm, center_y_norm)]
mask = gt_masks[idx].astype("bool") # only bool can be used for slicing!
segmentation[mask] = color
if (segmentation == 0).all():
# pure black label
return results
# save files
output_dir = os.path.join(self.target_path, self.dir_name)
file_name = results['img_info']['file_name']
# images are loaded in bgr order, reverse before saving
img_pil = Image.fromarray(img[:, :, ::-1].astype('uint8'))
label_pil = Image.fromarray(segmentation)
image_path = os.path.join(output_dir, file_name.replace(".jpg", "_image_{}.png".format(self.dir_name)))
label_path = os.path.join(output_dir, file_name.replace(".jpg", "_label_{}.png".format(self.dir_name)))
# if os.path.exists(image_path) or os.path.exists(label_path):
# print("{} exists!".format(image_path))
# return results
aug_idx = 0
while os.path.exists(image_path) or os.path.exists(label_path):
aug_idx += 1
image_path = os.path.join(output_dir, file_name.replace(".jpg", "_image_{}_{}.png".format(self.dir_name, aug_idx)))
label_path = os.path.join(output_dir, file_name.replace(".jpg", "_label_{}_{}.png".format(self.dir_name, aug_idx)))
img_pil.save(image_path)
label_pil.save(label_path)
return results
def center_of_mass(self, mask, eps=1e-6):
"""Calculate the centroid coordinates of the mask.
Args:
mask (ndarray): The mask to be calculated, shape (h, w).
eps (float): Avoid dividing by zero. Default: 1e-6.
Returns:
tuple[float]: the coordinates of the mask centroid, as (center_w, center_h).
- center_w (float): the x coordinate (width axis) of the centroid.
- center_h (float): the y coordinate (height axis) of the centroid.
"""
h, w = mask.shape
grid_h = np.arange(h)[:, None]
grid_w = np.arange(w)
normalizer = mask.sum().astype("float").clip(min=eps)
center_h = (mask * grid_h).sum() / normalizer
center_w = (mask * grid_w).sum() / normalizer
return center_w, center_h
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(method={self.method})'
return repr_str
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import os
import glob
import json
import tqdm
import argparse
def get_args_parser():
parser = argparse.ArgumentParser('COCO class-agnostic instance segmentation preparation', add_help=False)
parser.add_argument('--split', type=str, help='dataset split',
choices=['train', 'val'], required=True)
parser.add_argument('--output_dir', type=str, help='path to output dir',
default='datasets/coco/pano_ca_inst')
return parser.parse_args()
if __name__ == "__main__":
args = get_args_parser()
panoptic_dir = "datasets/coco/pano_ca_inst"
save_path = os.path.join(args.output_dir, "coco_{}_image_panoptic_inst.json".format(args.split))
print(save_path)
output_dict = []
image_path_list = glob.glob(os.path.join(panoptic_dir, '{}_*'.format(args.split), '*image*.png'))
for image_path in tqdm.tqdm(image_path_list):
image_dir, image_name = os.path.dirname(image_path), os.path.basename(image_path)
panoptic_path = os.path.join(image_dir, image_name.replace('image', 'label'))
assert os.path.isfile(image_path)
if not os.path.isfile(panoptic_path):
print("ignore {}".format(image_path))
continue
pair_dict = {}
pair_dict["image_path"] = image_path.replace('datasets/', '')
pair_dict["target_path"] = panoptic_path.replace('datasets/', '')
pair_dict["type"] = "coco_image2panoptic_inst"
output_dict.append(pair_dict)
json.dump(output_dict, open(save_path, 'w'))
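# Each dumped entry pairs an image with its label, with paths relative to 'datasets/',
# e.g. (file name illustrative):
# {"image_path": "coco/pano_ca_inst/train_orgflip/000000000009_image_train_orgflip.png",
#  "target_path": "coco/pano_ca_inst/train_orgflip/000000000009_label_train_orgflip.png",
#  "type": "coco_image2panoptic_inst"}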
#!/usr/bin/env bash
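# Usage (distributed testing): bash <this_script>.sh CONFIG CHECKPOINT GPUS [args...]
# Any arguments after the third positional are forwarded to test.py via ${@:4}.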
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/test.py \
$CONFIG \
$CHECKPOINT \
--launcher pytorch \
${@:4}
#!/usr/bin/env bash
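# Usage (distributed training): bash <this_script>.sh CONFIG GPUS [args...]
# Arguments after the second positional are forwarded to train.py via ${@:3};
# --seed 0 is always passed.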
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/train.py \
$CONFIG \
--seed 0 \
--launcher pytorch ${@:3}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import time
import warnings
import sys
import tqdm
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet.apis import multi_gpu_test, single_gpu_test
from mmdet.datasets import (build_dataloader, build_dataset,
replace_ImageToTensor)
from mmdet.models import build_detector
from mmdet.utils import (build_ddp, build_dp, compat_cfg, get_device,
replace_cfg_vals, setup_multi_processes,
update_data_root)
def parse_args():
parser = argparse.ArgumentParser(
description='MMDet test (and eval) a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--format-only',
action='store_true',
help='Format the output results without performing evaluation. It is'
'useful when you want to format the result to a specific format and '
'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument(
'--show-dir', help='directory where painted images will be saved')
parser.add_argument(
'--show-score-thr',
type=float,
default=0.3,
help='score threshold (default: 0.3)')
parser.add_argument(
'--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function (deprecate), '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both '
'specified, --options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save/eval/format/show the '
'results / save the results) with the argument "--out", "--eval"'
', "--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.config)
# replace the ${key} with the value of cfg.key
cfg = replace_cfg_vals(cfg)
# update data root according to MMDET_DATASETS
update_data_root(cfg)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
cfg = compat_cfg(cfg)
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# if 'pretrained' in cfg.model:
# cfg.model.pretrained = None
# elif 'init_cfg' in cfg.model.backbone:
# cfg.model.backbone.init_cfg = None
#
# if cfg.model.get('neck'):
# if isinstance(cfg.model.neck, list):
# for neck_cfg in cfg.model.neck:
# if neck_cfg.get('rfp_backbone'):
# if neck_cfg.rfp_backbone.get('pretrained'):
# neck_cfg.rfp_backbone.pretrained = None
# elif cfg.model.neck.get('rfp_backbone'):
# if cfg.model.neck.rfp_backbone.get('pretrained'):
# cfg.model.neck.rfp_backbone.pretrained = None
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed testing. Use the first GPU '
'in `gpu_ids` now.')
else:
cfg.gpu_ids = [args.gpu_id]
cfg.device = get_device()
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
test_dataloader_default_args = dict(
samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False)
# in case the test dataset is concatenated
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
test_loader_cfg = {
**test_dataloader_default_args,
**cfg.data.get('test_dataloader', {})
}
rank, _ = get_dist_info()
# allow not creating the work dir when it is not specified
if args.work_dir is not None and rank == 0:
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
json_file = osp.join(args.work_dir, f'eval_{timestamp}.json')
# build the dataloader
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(dataset, **test_loader_cfg)
load_data_only = cfg.custom.get('load_data_only', False)
assert load_data_only
for _ in tqdm.tqdm(data_loader):
pass
print("dataset enumerated, exit!")
sys.exit()
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcv.utils import get_git_hash
from mmdet import __version__
from mmdet.apis import init_random_seed, set_random_seed
from mmdet.datasets import build_dataset
from mmdet.models import build_detector
from mmdet.utils import (collect_env, get_device, get_root_logger,
replace_cfg_vals, setup_multi_processes,
update_data_root)
import sys
sys.path.insert(0, './')
from apis.train import train_detector
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument(
'--resume-from', help='the checkpoint file to resume from')
parser.add_argument(
'--auto-resume',
action='store_true',
help='resume from the latest checkpoint automatically')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='(Deprecated, please use --gpu-id) number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--diff-seed',
action='store_true',
help='Whether or not set different seeds for different ranks')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file (deprecate), '
'change to --cfg-options instead.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--auto-scale-lr',
action='store_true',
help='enable automatically scaling LR.')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.cfg_options:
raise ValueError(
'--options and --cfg-options cannot be both '
'specified, --options is deprecated in favor of --cfg-options')
if args.options:
warnings.warn('--options is deprecated in favor of --cfg-options')
args.cfg_options = args.options
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# replace the ${key} with the value of cfg.key
cfg = replace_cfg_vals(cfg)
# update data root according to MMDET_DATASETS
update_data_root(cfg)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
if args.auto_scale_lr:
if 'auto_scale_lr' in cfg and \
'enable' in cfg.auto_scale_lr and \
'base_batch_size' in cfg.auto_scale_lr:
cfg.auto_scale_lr.enable = True
else:
warnings.warn('Can not find "auto_scale_lr" or '
'"auto_scale_lr.enable" or '
'"auto_scale_lr.base_batch_size" in your'
' configuration file. Please update all the '
'configuration files to mmdet >= 2.24.1.')
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
cfg.auto_resume = args.auto_resume
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line)
meta['env_info'] = env_info
meta['config'] = cfg.pretty_text
# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config:\n{cfg.pretty_text}')
cfg.device = get_device()
# set random seeds
seed = init_random_seed(args.seed, device=cfg.device)
seed = seed + dist.get_rank() if args.diff_seed else seed
logger.info(f'Set random seed to {seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(seed, deterministic=args.deterministic)
cfg.seed = seed
meta['seed'] = seed
meta['exp_name'] = osp.basename(args.config)
# model = build_detector(
# cfg.model,
# train_cfg=cfg.get('train_cfg'),
# test_cfg=cfg.get('test_cfg'))
# model.init_weights()
datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
assert 'val' in [mode for (mode, _) in cfg.workflow]
val_dataset = copy.deepcopy(cfg.data.val)
val_dataset.pipeline = cfg.data.train.get(
'pipeline', cfg.data.train.dataset.get('pipeline'))
datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmdet version, config file content and class names in
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmdet_version=__version__ + get_git_hash()[:7],
CLASSES=datasets[0].CLASSES)
# add an attribute for visualization convenience
# model.CLASSES = datasets[0].CLASSES
model = None
train_detector(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import pickle
import shutil
import tempfile
import mmcv
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info
def single_gpu_test(model, data_loader, pseudo_test=False):
"""Test model with a single gpu.
This method tests model with a single gpu and displays test progress bar.
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
pseudo_test (bool): custom argument forwarded to the model's forward call.
Returns:
list: The prediction results.
"""
model.eval()
results = []
dataset = data_loader.dataset
prog_bar = mmcv.ProgressBar(len(dataset))
for data in data_loader:
with torch.no_grad():
result = model(return_loss=False, pseudo_test=pseudo_test, **data)
results.append(result)
# use the first key as main key to calculate the batch size
batch_size = len(next(iter(data.values())))
for _ in range(batch_size):
prog_bar.update()
return results
# Copyright (c) OpenMMLab. All rights reserved.
import sys
import warnings
import mmcv
import numpy as np
import torch
import torch.distributed as dist
import tqdm
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
get_dist_info)
from mmcv.utils import digit_version
from mmpose.core import DistEvalHook, EvalHook, build_optimizers
from mmpose.core.distributed_wrapper import DistributedDataParallelWrapper
from mmpose.datasets import build_dataloader, build_dataset
from mmpose.utils import get_root_logger
try:
from mmcv.runner import Fp16OptimizerHook
except ImportError:
warnings.warn(
'Fp16OptimizerHook from mmpose will be deprecated from '
'v0.15.0. Please install mmcv>=1.1.4', DeprecationWarning)
from mmpose.core import Fp16OptimizerHook
def init_random_seed(seed=None, device='cuda'):
"""Initialize random seed.
If the seed is not set, the seed will be automatically randomized,
and then broadcast to all processes to prevent some potential bugs.
Args:
seed (int, Optional): The seed. Default to None.
device (str): The device where the seed will be put on.
Default to 'cuda'.
Returns:
int: Seed to be used.
"""
if seed is not None:
return seed
# Make sure all ranks share the same random seed to prevent
# some potential bugs. Please refer to
# https://github.com/open-mmlab/mmdetection/issues/6339
rank, world_size = get_dist_info()
seed = np.random.randint(2**31)
if world_size == 1:
return seed
if rank == 0:
random_num = torch.tensor(seed, dtype=torch.int32, device=device)
else:
random_num = torch.tensor(0, dtype=torch.int32, device=device)
dist.broadcast(random_num, src=0)
return random_num.item()
def train_model(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
"""Train model entry function.
Args:
model (nn.Module): The model to be trained.
dataset (Dataset): Train dataset.
cfg (dict): The config dict for training.
distributed (bool): Whether to use distributed training.
Default: False.
validate (bool): Whether to do evaluation. Default: False.
timestamp (str | None): Local time for runner. Default: None.
meta (dict | None): Meta dict to record some important information.
Default: None
"""
logger = get_root_logger(cfg.log_level)
# prepare data loaders
dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
# step 1: give default values and override (if exist) from cfg.data
loader_cfg = {
**dict(
seed=cfg.get('seed'),
drop_last=False,
dist=distributed,
num_gpus=len(cfg.gpu_ids)),
**({} if torch.__version__ != 'parrots' else dict(
prefetch_num=2,
pin_memory=False,
)),
**dict((k, cfg.data[k]) for k in [
'samples_per_gpu',
'workers_per_gpu',
'shuffle',
'seed',
'drop_last',
'prefetch_num',
'pin_memory',
'persistent_workers',
] if k in cfg.data)
}
# step 2: cfg.data.train_dataloader has highest priority
train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {}))
data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]
load_data_only = cfg.data.get('load_data_only', False)
assert load_data_only
# only enumerate dataset
for data_loader in data_loaders:
for _ in tqdm.tqdm(data_loader):
pass
print("dataset enumerated, exit!")
sys.exit()
dataset_info = dict(
dataset_name='coco',
paper_info=dict(
author='Lin, Tsung-Yi and Maire, Michael and '
'Belongie, Serge and Hays, James and '
'Perona, Pietro and Ramanan, Deva and '
r'Doll{\'a}r, Piotr and Zitnick, C Lawrence',
title='Microsoft coco: Common objects in context',
container='European conference on computer vision',
year='2014',
homepage='http://cocodataset.org/',
),
keypoint_info={
0:
dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''),
1:
dict(
name='left_eye',
id=1,
color=[51, 153, 255],
type='upper',
swap='right_eye'),
2:
dict(
name='right_eye',
id=2,
color=[51, 153, 255],
type='upper',
swap='left_eye'),
3:
dict(
name='left_ear',
id=3,
color=[51, 153, 255],
type='upper',
swap='right_ear'),
4:
dict(
name='right_ear',
id=4,
color=[51, 153, 255],
type='upper',
swap='left_ear'),
5:
dict(
name='left_shoulder',
id=5,
color=[0, 255, 0],
type='upper',
swap='right_shoulder'),
6:
dict(
name='right_shoulder',
id=6,
color=[255, 128, 0],
type='upper',
swap='left_shoulder'),
7:
dict(
name='left_elbow',
id=7,
color=[0, 255, 0],
type='upper',
swap='right_elbow'),
8:
dict(
name='right_elbow',
id=8,
color=[255, 128, 0],
type='upper',
swap='left_elbow'),
9:
dict(
name='left_wrist',
id=9,
color=[0, 255, 0],
type='upper',
swap='right_wrist'),
10:
dict(
name='right_wrist',
id=10,
color=[255, 128, 0],
type='upper',
swap='left_wrist'),
11:
dict(
name='left_hip',
id=11,
color=[0, 255, 0],
type='lower',
swap='right_hip'),
12:
dict(
name='right_hip',
id=12,
color=[255, 128, 0],
type='lower',
swap='left_hip'),
13:
dict(
name='left_knee',
id=13,
color=[0, 255, 0],
type='lower',
swap='right_knee'),
14:
dict(
name='right_knee',
id=14,
color=[255, 128, 0],
type='lower',
swap='left_knee'),
15:
dict(
name='left_ankle',
id=15,
color=[0, 255, 0],
type='lower',
swap='right_ankle'),
16:
dict(
name='right_ankle',
id=16,
color=[255, 128, 0],
type='lower',
swap='left_ankle')
},
skeleton_info={
0:
dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
1:
dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
2:
dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
3:
dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
4:
dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
5:
dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
6:
dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
7:
dict(
link=('left_shoulder', 'right_shoulder'),
id=7,
color=[51, 153, 255]),
8:
dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
9:
dict(
link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
10:
dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
11:
dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
12:
dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
13:
dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
14:
dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
15:
dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
16:
dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
17:
dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
18:
dict(
link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255])
},
joint_weights=[
1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
1.5
],
sigmas=[
0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062,
0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089
])
checkpoint_config = dict(interval=10)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
# dict(type='PaviLoggerHook') # for internal services
])
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
# disable opencv multithreading to avoid system being overloaded
opencv_num_threads = 0
# set multi-process start method as `fork` to speed up the training
mp_start_method = 'fork'
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = True
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3]
aug_idx = 0
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0),
# dict(
# type='TopDownHalfBodyTransform',
# num_joints_half_body=8,
# prob_half_body=0.3),
# dict(
# type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug{}'.format(aug_idx),
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='val_256x192',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = False
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3] # 2
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug0',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='test_256x192',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = False
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3] # 2
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug0',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='test_256x192_flip',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
import os
job_name = "painter_vit_large"
ckpt_file = "painter_vit_large.pth"
prompt = "000000000165_box0"
image_dir = 'models_inference/{}/coco_pose_inference_{}_{}/'.format(job_name, ckpt_file, prompt)
if not image_dir[-1] == "/":
image_dir = image_dir + '/'
print(image_dir)
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# fake model settings
model = dict(
type='TopDownCustom',
pretrained=None,
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256))),
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=17))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256],
# heatmap_size=[48, 64],
# image_size=[640, 320], # w, h
# heatmap_size=[640, 320],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
imagename_with_boxid=True, # custom
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
# sigma = [1.5, 3] # 2
sigma = 3 # use the hyper-parameter of the R channel, which encodes the heatmap
val_pipeline = [
dict(type='LoadImageFromFile'), # load custom images according to filename and box_id, using topdown_coco_dataset
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
pseudo_test=True, # custom arg
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
# img_prefix=f'{data_root}/val2017/',
img_prefix=image_dir,
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
# img_prefix=f'{data_root}/val2017/',
img_prefix=image_dir,
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
import os
import random
import warnings
import cv2
import numpy as np
from PIL import Image
def define_colors_gb_mean_sep(num_locations=17):
num_sep_per_channel = int(num_locations ** (1 / 2)) + 1 # 5
separation_per_channel = 256 // num_sep_per_channel # 51
color_dict = {}
# R = G = B = 0
# B += separation_per_channel # offset for the first loop
for location in range(num_locations):
num_seq_g = location // num_sep_per_channel
num_seq_b = location % num_sep_per_channel
assert (num_seq_g <= num_sep_per_channel) and (num_seq_b <= num_sep_per_channel)
G = 255 - num_seq_g * separation_per_channel
B = 255 - num_seq_b * separation_per_channel
assert (G < 256) and (B < 256)
assert (G >= 0) and (B >= 0)
assert (G, B) not in color_dict.values()
color_dict[location] = (G, B)
# print(location, (num_seq_g, num_seq_b), (G, B))
# colors = [v for k, v in color_dict.items()]
    # per-channel color separation: 51; min (G, B) values over the 17 colors: (102, 51)
return color_dict
color_dict = define_colors_gb_mean_sep()
def encode_target_to_image(target, target_weight, target_dir, metas):
if len(target.shape) == 3:
return encode_rgb_target_to_image(
target_kernel=target, target_class=target,
target_weight_kernel=target_weight, target_weight_class=target_weight,
target_dir=target_dir, metas=metas,
)
assert len(target.shape) == 4
return encode_rgb_target_to_image(
target_kernel=target[1], target_class=target[0],
target_weight_kernel=target_weight[1], target_weight_class=target_weight[0],
target_dir=target_dir, metas=metas,
)
def check_input(target_weight, target, metas):
    if not ((target_weight.reshape(17, 1, 1) * target) == target).all():
        print("useful target_weight!")
        # note: this re-weighting rebinds the local variable only; the caller's target is unchanged
        target = target_weight.reshape(17, 1, 1) * target
# make sure the invisible part is weighted zero, and thus not shown in target
if not (target_weight[np.sum(metas['joints_3d_visible'], axis=1) == 0] == 0).all():
print(metas['image_file'], "may have joints_3d_visible problems!")
def encode_rgb_target_to_image(target_kernel, target_class, target_weight_kernel, target_weight_class, target_dir, metas):
"""
Args:
target: ndarray (17, 256, 192)
target_weight: ndarray (17, 1)
metas: dict
Returns:
an RGB image, R encodes heatmap, GB encodes class
"""
check_input(target_weight_kernel, target_kernel, metas)
check_input(target_weight_class, target_class, metas)
# 1. handle kernel in R channel
# get max value for collision area
sum_kernel = target_kernel.max(0) # (256, 192)
max_kernel_indices = target_kernel.argmax(0) # (256, 192)
R = sum_kernel[:, :, None] * 255. # (256, 192, 1)
# 2. handle class in BG channels
K, H, W = target_class.shape
keypoint_areas_class = []
for keypoint_idx in range(K):
mask = target_class[keypoint_idx] != 0
keypoint_areas_class.append(mask)
keypoint_areas_class = np.stack(keypoint_areas_class) # (17, 256, 192)
num_pos_per_location_class = keypoint_areas_class.sum(0) # (256, 192)
collision_area_class = num_pos_per_location_class > 1 # (256, 192)
    GB_MultiChannel = np.zeros((K, H, W, 2))  # (17, 256, 192, 2) for this config
for keypoint_idx in range(K):
color = color_dict[keypoint_idx]
class_mask = keypoint_areas_class[keypoint_idx]
GB_MultiChannel[keypoint_idx][class_mask] = color
GB = GB_MultiChannel.sum(0) # (256, 192, 2)
if np.sum(collision_area_class) != 0:
for keypoint_idx in range(K):
color = color_dict[keypoint_idx]
            # argmax also assigns background pixels to keypoint 0, but those are masked out by collision_area_class below
max_area_this_keypoint = max_kernel_indices == keypoint_idx
area_of_interest = max_area_this_keypoint * collision_area_class
if not (area_of_interest == 0).all():
GB[area_of_interest] = color
# 3. get images / labels and save
image_label = np.concatenate([R, GB], axis=-1).astype(np.uint8) # (256, 192, 3)
image_label = Image.fromarray(image_label)
image = metas['img']
image = Image.fromarray(image)
box_idx = metas['bbox_id']
_, filename = os.path.dirname(metas['image_file']), os.path.basename(metas['image_file'])
image_path = os.path.join(target_dir, filename.replace(".jpg", "_box{}_image.png".format(box_idx)))
label_path = os.path.join(target_dir, filename.replace(".jpg", "_box{}_label.png".format(box_idx)))
# if os.path.exists(image_path):
# print(image_path, "exist! return!")
# return
image.save(image_path)
image_label.save(label_path)
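# --- illustrative only: a minimal decoding sketch, not part of the original pipeline ---
# It inverts the encoding above under the stated assumptions: the R channel stores the
# heatmap confidence scaled to [0, 255] and (G, B) stores the keypoint identity as the
# nearest color in color_dict. The function name and threshold are hypothetical.
def decode_image_to_target(image_label, conf_thr=26):
    """Recover per-pixel (keypoint index, confidence) from an encoded RGB label image."""
    image_label = np.asarray(image_label, dtype=np.float32)    # (H, W, 3)
    confidence_map = image_label[:, :, 0] / 255.               # R channel -> heatmap value
    gb = image_label[:, :, 1:]                                  # (H, W, 2)
    colors = np.array([color_dict[k] for k in sorted(color_dict)], dtype=np.float32)  # (17, 2)
    # the nearest color in (G, B) space decides the keypoint identity
    dists = np.linalg.norm(gb[:, :, None, :] - colors[None, None], axis=-1)  # (H, W, 17)
    keypoint_map = dists.argmin(-1)
    keypoint_map[image_label[:, :, 0] < conf_thr] = -1          # low-confidence pixels -> background
    return keypoint_map, confidence_map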
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import os
from PIL import Image
import cv2
import numpy as np
from mmpose.core.bbox import bbox_xywh2cs
from mmpose.core.post_processing import (affine_transform, fliplr_joints,
get_affine_transform, get_warp_matrix,
warp_affine_joints)
from mmpose.datasets.builder import PIPELINES
from mmpose.datasets.pipelines import TopDownGenerateTarget
from .custom_transform import encode_target_to_image
@PIPELINES.register_module()
class TopDownGenerateTargetCustom(TopDownGenerateTarget):
"""Generate the target heatmap.
Required key: 'joints_3d', 'joints_3d_visible', 'ann_info'.
Modified key: 'target', and 'target_weight'.
Args:
sigma: Sigma of heatmap gaussian for 'MSRA' approach.
kernel: Kernel of heatmap gaussian for 'Megvii' approach.
encoding (str): Approach to generate target heatmaps.
Currently supported approaches: 'MSRA', 'Megvii', 'UDP'.
Default:'MSRA'
unbiased_encoding (bool): Option to use unbiased
encoding methods.
Paper ref: Zhang et al. Distribution-Aware Coordinate
Representation for Human Pose Estimation (CVPR 2020).
        valid_radius_factor (float): The radius factor of the positive area
            in the classification heatmap for UDP.
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
target_type (str): supported targets: 'GaussianHeatmap',
'CombinedTarget'. Default:'GaussianHeatmap'
CombinedTarget: The combination of classification target
(response map) and regression target (offset map).
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
"""
def __init__(self,
sigma=2,
kernel=(11, 11),
valid_radius_factor=0.0546875,
target_type='GaussianHeatmap',
encoding='MSRA',
unbiased_encoding=False,
# the following are custom args
target_path=None,
dir_name=None,
use_gt_bbox=True):
super().__init__(
sigma=sigma,
kernel=kernel,
valid_radius_factor=valid_radius_factor,
target_type=target_type,
encoding=encoding,
unbiased_encoding=unbiased_encoding)
self.target_path = target_path
self.dir_name = dir_name
self.use_gt_bbox = use_gt_bbox
target_dir = os.path.join(self.target_path, self.dir_name)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
def __call__(self, results):
"""Generate the target heatmap."""
joints_3d = results['joints_3d']
joints_3d_visible = results['joints_3d_visible']
assert self.encoding in ['MSRA', 'Megvii', 'UDP']
if self.encoding == 'MSRA':
if isinstance(self.sigma, list):
num_sigmas = len(self.sigma)
cfg = results['ann_info']
num_joints = cfg['num_joints']
heatmap_size = cfg['heatmap_size']
target = np.empty(
(0, num_joints, heatmap_size[1], heatmap_size[0]),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_sigmas):
target_i, target_weight_i = self._msra_generate_target(
cfg, joints_3d, joints_3d_visible, self.sigma[i])
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._msra_generate_target(
results['ann_info'], joints_3d, joints_3d_visible,
self.sigma)
elif self.encoding == 'Megvii':
if isinstance(self.kernel, list):
num_kernels = len(self.kernel)
cfg = results['ann_info']
num_joints = cfg['num_joints']
W, H = cfg['heatmap_size']
target = np.empty((0, num_joints, H, W), dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_kernels):
target_i, target_weight_i = self._megvii_generate_target(
cfg, joints_3d, joints_3d_visible, self.kernel[i])
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._megvii_generate_target(
results['ann_info'], joints_3d, joints_3d_visible,
self.kernel)
elif self.encoding == 'UDP':
if self.target_type.lower() == 'CombinedTarget'.lower():
factors = self.valid_radius_factor
channel_factor = 3
elif self.target_type.lower() == 'GaussianHeatmap'.lower():
factors = self.sigma
channel_factor = 1
else:
raise ValueError('target_type should be either '
"'GaussianHeatmap' or 'CombinedTarget'")
if isinstance(factors, list):
num_factors = len(factors)
cfg = results['ann_info']
num_joints = cfg['num_joints']
W, H = cfg['heatmap_size']
target = np.empty((0, channel_factor * num_joints, H, W),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_factors):
target_i, target_weight_i = self._udp_generate_target(
cfg, joints_3d, joints_3d_visible, factors[i],
self.target_type)
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._udp_generate_target(
results['ann_info'], joints_3d, joints_3d_visible, factors,
self.target_type)
else:
raise ValueError(
f'Encoding approach {self.encoding} is not supported!')
results['target'] = target
results['target_weight'] = target_weight
target_dir = os.path.join(self.target_path, self.dir_name)
if not self.use_gt_bbox:
box_idx = results['bbox_id']
image = results['img']
image = Image.fromarray(image)
_, filename = os.path.dirname(results['image_file']), os.path.basename(results['image_file'])
image_path = os.path.join(target_dir,
filename.replace(".jpg", "_box{}_image.png".format(box_idx)))
if os.path.exists(image_path):
                print(image_path, "already exists; skipping")
return results
image.save(image_path)
else:
# filter all black target
if (target.sum((1, 2)) == 0).all():
return results
# encode target to image (save is also done inside)
encode_target_to_image(target, target_weight, target_dir=target_dir, metas=results)
return results
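# --- illustrative only: how this custom step might be placed in a training pipeline config ---
# (a hedged sketch; the target_path / dir_name values below are placeholders, not taken from the repo)
# train_pipeline = [
#     ...,
#     dict(
#         type='TopDownGenerateTargetCustom',
#         sigma=3,
#         encoding='MSRA',
#         target_path='datasets/coco_pose/data_pair',
#         dir_name='train_256x192_aug0',
#         use_gt_bbox=True),
#     ...,
# ]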
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import tempfile
import warnings
from collections import OrderedDict, defaultdict
import json_tricks as json
import numpy as np
from mmcv import Config, deprecated_api_warning
from xtcocotools.cocoeval import COCOeval
from mmpose.core.post_processing import oks_nms, soft_oks_nms
from mmpose.datasets.builder import DATASETS
# from mmpose.datasets.datasets.base import Kpt2dSviewRgbImgTopDownDataset
from mmpose.datasets.datasets.top_down import TopDownCocoDataset
@DATASETS.register_module()
class TopDownCocoDatasetCustom(TopDownCocoDataset):
"""CocoDataset dataset for top-down pose estimation.
"Microsoft COCO: Common Objects in Context", ECCV'2014.
More details can be found in the `paper
<https://arxiv.org/abs/1405.0312>`__ .
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes::
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
ann_file (str): Path to the annotation file.
img_prefix (str): Path to a directory where images are held.
Default: None.
data_cfg (dict): config
pipeline (list[dict | callable]): A sequence of data transforms.
dataset_info (DatasetInfo): A class containing all dataset info.
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
dataset_info=None,
test_mode=False):
super().__init__(
ann_file,
img_prefix,
data_cfg,
pipeline,
dataset_info=dataset_info,
test_mode=test_mode)
self.imagename_with_boxid = data_cfg.get('imagename_with_boxid', False)
def _load_coco_keypoint_annotation_kernel(self, img_id):
"""load annotation from COCOAPI.
Note:
bbox:[x1, y1, w, h]
Args:
img_id: coco image id
Returns:
dict: db entry
"""
img_ann = self.coco.loadImgs(img_id)[0]
width = img_ann['width']
height = img_ann['height']
num_joints = self.ann_info['num_joints']
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = self.coco.loadAnns(ann_ids)
# sanitize bboxes
valid_objs = []
for obj in objs:
if 'bbox' not in obj:
continue
x, y, w, h = obj['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(width - 1, x1 + max(0, w))
y2 = min(height - 1, y1 + max(0, h))
if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
bbox_id = 0
rec = []
for obj in objs:
if 'keypoints' not in obj:
continue
if max(obj['keypoints']) == 0:
continue
if 'num_keypoints' in obj and obj['num_keypoints'] == 0:
continue
joints_3d = np.zeros((num_joints, 3), dtype=np.float32)
joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj['keypoints']).reshape(-1, 3)
joints_3d[:, :2] = keypoints[:, :2]
joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3])
image_file = osp.join(self.img_prefix, self.id2name[img_id])
            self.imagename_with_boxid = False  # hard-coded override: forces plain COCO filenames, disabling the box-id naming below
if self.imagename_with_boxid:
# gt bbox label example: 000000342971_box0_image.png
image_file = image_file.replace(".jpg", "_box{}_image.png".format(bbox_id))
rec.append({
'image_file': image_file,
'bbox': obj['clean_bbox'][:4],
'rotation': 0,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'dataset': self.dataset_name,
'bbox_score': 1,
'bbox_id': bbox_id
})
bbox_id = bbox_id + 1
return rec
def _load_coco_person_detection_results(self):
"""Load coco person detection results."""
num_joints = self.ann_info['num_joints']
all_boxes = None
with open(self.bbox_file, 'r') as f:
all_boxes = json.load(f)
if not all_boxes:
raise ValueError('=> Load %s fail!' % self.bbox_file)
print(f'=> Total boxes: {len(all_boxes)}')
kpt_db = []
bbox_id = 0
for det_res in all_boxes:
if det_res['category_id'] != 1:
continue
image_file = osp.join(self.img_prefix,
self.id2name[det_res['image_id']])
box = det_res['bbox']
score = det_res['score']
if score < self.det_bbox_thr:
continue
joints_3d = np.zeros((num_joints, 3), dtype=np.float32)
joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32)
            self.imagename_with_boxid = False  # hard-coded override: keeps the detection filenames unchanged
if self.imagename_with_boxid:
image_file = image_file.replace(".jpg", "_box{}_image.png".format(bbox_id))
kpt_db.append({
'image_file': image_file,
'rotation': 0,
'bbox': box[:4],
'bbox_score': score,
'dataset': self.dataset_name,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'bbox_id': bbox_id
})
bbox_id = bbox_id + 1
print(f'=> Total boxes after filter '
f'low score@{self.det_bbox_thr}: {bbox_id}')
return kpt_db
@deprecated_api_warning(name_dict=dict(outputs='results'))
def evaluate(self, results, res_folder=None, metric='mAP', **kwargs):
"""Evaluate coco keypoint results. The pose prediction results will be
saved in ``${res_folder}/result_keypoints.json``.
Note:
- batch_size: N
- num_keypoints: K
- heatmap height: H
- heatmap width: W
Args:
results (list[dict]): Testing results containing the following
items:
- preds (np.ndarray[N,K,3]): The first two dimensions are \
coordinates, score is the third dimension of the array.
- boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \
scale[1],area, score]
- image_paths (list[str]): For example, ['data/coco/val2017\
/000000393226.jpg']
- heatmap (np.ndarray[N, K, H, W]): model output heatmap
- bbox_id (list(int)).
res_folder (str, optional): The folder to save the testing
results. If not specified, a temp folder will be created.
Default: None.
metric (str | list[str]): Metric to be performed. Defaults: 'mAP'.
Returns:
dict: Evaluation results for evaluation metric.
"""
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['mAP']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
if res_folder is not None:
tmp_folder = None
res_file = osp.join(res_folder, 'result_keypoints.json')
else:
tmp_folder = tempfile.TemporaryDirectory()
res_file = osp.join(tmp_folder.name, 'result_keypoints.json')
kpts = defaultdict(list)
for result in results:
preds = result['preds']
boxes = result['boxes']
image_paths = result['image_paths']
            self.imagename_with_boxid = False  # hard-coded override: skips stripping the box-id suffix from image paths
if self.imagename_with_boxid:
for idx, img_path in enumerate(image_paths):
image_dir, file_name = os.path.dirname(img_path), os.path.basename(img_path)
file_name = file_name.split("_")[0] + ".jpg"
img_path = os.path.join(image_dir, file_name)
image_paths[idx] = img_path
bbox_ids = result['bbox_ids']
batch_size = len(image_paths)
for i in range(batch_size):
image_id = self.name2id[image_paths[i][len(self.img_prefix):]]
kpts[image_id].append({
'keypoints': preds[i],
'center': boxes[i][0:2],
'scale': boxes[i][2:4],
'area': boxes[i][4],
'score': boxes[i][5],
'image_id': image_id,
'bbox_id': bbox_ids[i]
})
kpts = self._sort_and_unique_bboxes(kpts)
# rescoring and oks nms
num_joints = self.ann_info['num_joints']
vis_thr = self.vis_thr
oks_thr = self.oks_thr
valid_kpts = []
for image_id in kpts.keys():
img_kpts = kpts[image_id]
for n_p in img_kpts:
box_score = n_p['score']
if kwargs.get('rle_score', False):
pose_score = n_p['keypoints'][:, 2]
n_p['score'] = float(box_score + np.mean(pose_score) +
np.max(pose_score))
else:
kpt_score = 0
valid_num = 0
for n_jt in range(0, num_joints):
t_s = n_p['keypoints'][n_jt][2]
if t_s > vis_thr:
kpt_score = kpt_score + t_s
valid_num = valid_num + 1
if valid_num != 0:
kpt_score = kpt_score / valid_num
# rescoring
n_p['score'] = kpt_score * box_score
if self.use_nms:
nms = soft_oks_nms if self.soft_nms else oks_nms
keep = nms(img_kpts, oks_thr, sigmas=self.sigmas)
valid_kpts.append([img_kpts[_keep] for _keep in keep])
else:
valid_kpts.append(img_kpts)
self._write_coco_keypoint_results(valid_kpts, res_file)
# do evaluation only if the ground truth keypoint annotations exist
if 'annotations' in self.coco.dataset:
info_str = self._do_python_keypoint_eval(res_file)
name_value = OrderedDict(info_str)
if tmp_folder is not None:
tmp_folder.cleanup()
else:
            warnings.warn(f'Due to the absence of ground truth keypoint '
                          f'annotations, the quantitative evaluation can not '
                          f'be conducted. The prediction results have been '
                          f'saved at: {osp.abspath(res_file)}')
name_value = {}
return name_value
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import os
import glob
import json
import tqdm
import argparse
def get_args_parser():
parser = argparse.ArgumentParser('COCO pose estimation preparation', add_help=False)
parser.add_argument('--split', type=str, help='dataset split',
choices=['train', 'val'], required=True)
parser.add_argument('--output_dir', type=str, help='path to output dir',
default='datasets/coco_pose')
return parser.parse_args()
if __name__ == "__main__":
args = get_args_parser()
split = args.split
if split == "train":
aug_list = [
"_aug0", "_aug1", "_aug2", "_aug3", "_aug4",
"_aug5", "_aug6", "_aug7", "_aug8", "_aug9",
"_aug10", "_aug11", "_aug12", "_aug13", "_aug14",
"_aug15", "_aug16", "_aug17", "_aug18", "_aug19",
]
elif split == "val":
aug_list = ["", "_flip"]
else:
raise NotImplementedError
save_path = os.path.join(args.output_dir, "coco_pose_256x192_{}.json".format(split))
print(save_path)
output_dict = []
for aug_idx in aug_list:
image_dir = "datasets/coco_pose/data_pair/{}_256x192{}".format(split, aug_idx)
print(aug_idx, image_dir)
image_path_list = glob.glob(os.path.join(image_dir, '*image.png'))
for image_path in tqdm.tqdm(image_path_list):
label_path = image_path.replace("image.png", "label.png")
assert label_path != image_path
assert os.path.isfile(image_path)
if not os.path.isfile(label_path):
print("ignoring {}".format(label_path))
continue
pair_dict = {}
pair_dict["image_path"] = image_path.replace('datasets/', '')
pair_dict["target_path"] = label_path.replace('datasets/', '')
pair_dict["type"] = "coco_image2pose"
output_dict.append(pair_dict)
    with open(save_path, 'w') as f:
        json.dump(output_dict, f)
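# Example invocation (the script filename is an assumption, not taken from the repo):
#   python prepare_coco_pose_json.py --split val --output_dir datasets/coco_pose
# This scans datasets/coco_pose/data_pair/val_256x192{,_flip} for *_image.png / *_label.png
# pairs and writes coco_pose_256x192_val.json listing each pair.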