Commit 799c795c authored by chenych

add data directory

parent cce6e1bf
@@ -200,7 +200,7 @@ $RECYCLE.BIN/
.vscode/
output/
exp/
data/
# data/
*.pyc
*.mp4
*.zip
\ No newline at end of file
@@ -39,9 +39,9 @@ python -m pip install -e detectron2
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
docker run -it -v /parastor/home/chenych/textDetection/deepsolo_pytorch/:/home/deepsolo_pytorch/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name cyc_deepsolo 2bb84d403fac bash
cd /your_code_path/deepsolo_pytorch
cd /home/deepsolo_pytorch
pip install -r requirements.txt
python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
bash make.sh
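# optional sanity check (not part of the original steps; assumes the installs above succeeded):
python -c "import torch, detectron2; print(torch.__version__, detectron2.__version__)"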
```
from . import builtin # ensure the builtin datasets are registered
from .dataset_mapper import DatasetMapperWithBasis
__all__ = ["DatasetMapperWithBasis"]
import random
from typing import Tuple
import sys
from PIL import Image
import numpy as np
from fvcore.transforms import transform as T
from detectron2.data.transforms import RandomCrop, StandardAugInput
from detectron2.structures import BoxMode
import torch
from detectron2.data.transforms import Augmentation, PadTransform
from fvcore.transforms.transform import Transform, NoOpTransform
def gen_crop_transform_with_instance(crop_size, image_size, instances, crop_box=True):
"""
Generate a CropTransform so that the cropping region contains
the center of a randomly chosen instance.
Args:
crop_size (tuple): h, w in pixels
image_size (tuple): h, w
instances (ndarray): Nx4 array of instance boxes in XYXY_ABS format
crop_box (bool): if False, expand the crop so that no instance box is cut
"""
bbox = random.choice(instances)
crop_size = np.asarray(crop_size, dtype=np.int32)
center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5
assert (
image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
), "The annotation bounding box is outside of the image!"
assert (
image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
), "Crop size is larger than image size!"
min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32))
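# sample the crop origin uniformly among positions that keep the chosen center
# inside the crop window and the crop window inside the image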
y0 = np.random.randint(min_yx[0], max_yx[0] + 1)
x0 = np.random.randint(min_yx[1], max_yx[1] + 1)
# if crop_box is False, expand the crop region so that no instance box is cut
if not crop_box:
num_modifications = 0
modified = True
# convert crop_size to float
crop_size = crop_size.astype(np.float32)
while modified:
modified, x0, y0, crop_size = adjust_crop(x0, y0, crop_size, instances)
num_modifications += 1
if num_modifications > 25:
raise ValueError(
"Cannot finished cropping adjustment within 25 tries (#instances {}).".format(
len(instances)
)
)
return T.CropTransform(0, 0, image_size[1], image_size[0])
return T.CropTransform(*map(int, (x0, y0, crop_size[1], crop_size[0])))
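# adjust_crop: if the proposed crop window [x0, x0 + w) x [y0, y0 + h) cuts through any
# instance box, grow the window outward to that box edge and report that a change was
# made; gen_crop_transform_with_instance repeats this until no box is cut, or gives up
# after 25 attempts and falls back to a full-image crop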
def adjust_crop(x0, y0, crop_size, instances, eps=1e-3):
modified = False
x1 = x0 + crop_size[1]
y1 = y0 + crop_size[0]
for bbox in instances:
if bbox[0] < x0 - eps and bbox[2] > x0 + eps:
crop_size[1] += x0 - bbox[0]
x0 = bbox[0]
modified = True
if bbox[0] < x1 - eps and bbox[2] > x1 + eps:
crop_size[1] += bbox[2] - x1
x1 = bbox[2]
modified = True
if bbox[1] < y0 - eps and bbox[3] > y0 + eps:
crop_size[0] += y0 - bbox[1]
y0 = bbox[1]
modified = True
if bbox[1] < y1 - eps and bbox[3] > y1 + eps:
crop_size[0] += bbox[3] - y1
y1 = bbox[3]
modified = True
return modified, x0, y0, crop_size
class RandomCropWithInstance(RandomCrop):
""" Instance-aware cropping.
"""
def __init__(self, crop_type, crop_size, crop_instance=True):
"""
Args:
crop_instance (bool): if False, extend the crop region so that no instance box is cut
"""
super().__init__(crop_type, crop_size)
self.crop_instance = crop_instance
self.input_args = ("image", "boxes")
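# declaring "boxes" in input_args makes StandardAugInput pass the instance boxes to get_transform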
def get_transform(self, img, boxes):
image_size = img.shape[:2]
crop_size = self.get_crop_size(image_size)
return gen_crop_transform_with_instance(
crop_size, image_size, boxes, crop_box=self.crop_instance
)
class Pad(Augmentation):
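"""
Pad the bottom/right of the image so that its height and width become multiples of
`divisible_size` (appended by build_augmentation when the ViTAEv2 backbone is used).
"""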
def __init__(self, divisible_size = 32):
super().__init__()
self._init(locals())
def get_transform(self, img):
ori_h, ori_w = img.shape[:2]  # h, w
# pad so that both sides become multiples of divisible_size (default 32)
pad_h = 0 if ori_h % self.divisible_size == 0 else self.divisible_size - ori_h % self.divisible_size
pad_w = 0 if ori_w % self.divisible_size == 0 else self.divisible_size - ori_w % self.divisible_size
return PadTransform(
0, 0, pad_w, pad_h, pad_value=0
)
\ No newline at end of file
import os
import argparse
from detectron2.data.datasets.register_coco import register_coco_instances
from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
from .datasets.text import register_text_instances
from adet.config import get_cfg
from detectron2.engine import default_argument_parser
_PREDEFINED_SPLITS_PIC = {
"pic_person_train": ("pic/image/train", "pic/annotations/train_person.json"),
"pic_person_val": ("pic/image/val", "pic/annotations/val_person.json"),
}
metadata_pic = {
"thing_classes": ["person"]
}
_PREDEFINED_SPLITS_TEXT = {
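# each entry maps a dataset name to (image_root, json_file), both relative to the
# `root` argument of register_all_coco() below ("datasets" by default)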
# 37 voc_size
"syntext1": ("syntext1/train_images", "syntext1/annotations/train_37voc.json"),
"syntext2": ("syntext2/train_images", "syntext2/annotations/train_37voc.json"),
"mlt": ("mlt2017/train_images", "mlt2017/train_37voc.json"),
"totaltext_train": ("totaltext/train_images", "totaltext/train_37voc.json"),
"ic13_train": ("ic13/train_images", "ic13/train_37voc.json"),
"ic15_train": ("ic15/train_images", "ic15/train_37voc.json"),
"textocr1": ("textocr/train_images", "textocr/train_37voc_1.json"),
"textocr2": ("textocr/train_images", "textocr/train_37voc_2.json"),
# 96 voc_size
"syntext1_96voc": ("syntext1/train_images", "syntext1/annotations/train_96voc.json"),
"syntext2_96voc": ("syntext2/train_images", "syntext2/annotations/train_96voc.json"),
"mlt_96voc": ("mlt2017/train_images", "mlt2017/train_96voc.json"),
"totaltext_train_96voc": ("totaltext/train_images", "totaltext/train_96voc.json"),
"ic13_train_96voc": ("ic13/train_images", "ic13/train_96voc.json"),
"ic15_train_96voc": ("ic15/train_images", "ic15/train_96voc.json"),
"ctw1500_train_96voc": ("ctw1500/train_images", "ctw1500/train_96voc.json"),
# sample dataset
"simple_train": ("simple/train_images", "simple/train.json"),
# Chinese
"chnsyn_train": ("chnsyntext/syn_130k_images", "chnsyntext/chn_syntext.json"),
"rects_train": ("ReCTS/ReCTS_train_images", "ReCTS/rects_train.json"),
"rects_val": ("ReCTS/ReCTS_val_images", "ReCTS/rects_val.json"),
"lsvt_train": ("LSVT/rename_lsvtimg_train", "LSVT/lsvt_train.json"),
"art_train": ("ArT/rename_artimg_train", "ArT/art_train.json"),
# evaluation, just for reading images, annotations may be empty
"totaltext_test": ("totaltext/test_images", "totaltext/test.json"),
"ic15_test": ("ic15/test_images", "ic15/test.json"),
"ctw1500_test": ("ctw1500/test_images", "ctw1500/test.json"),
"inversetext_test": ("inversetext/test_images", "inversetext/test.json"),
"rects_test": ("ReCTS/ReCTS_test_images", "ReCTS/rects_test.json"),
# sample dataset
"simple_test": ("simple/test_images", "simple/test.json"),
}
metadata_text = {
"thing_classes": ["text"]
}
def register_all_coco(root="datasets", voc_size_cfg=37, num_pts_cfg=25):
for key, (image_root, json_file) in _PREDEFINED_SPLITS_PIC.items():
# Assume pre-defined datasets live in `./datasets`.
register_coco_instances(
key,
metadata_pic,
os.path.join(root, json_file) if "://" not in json_file else json_file,
os.path.join(root, image_root),
)
for key, (image_root, json_file) in _PREDEFINED_SPLITS_TEXT.items():
# Assume pre-defined datasets live in `./datasets`.
register_text_instances(
key,
metadata_text,
os.path.join(root, json_file) if "://" not in json_file else json_file,
os.path.join(root, image_root),
voc_size_cfg,
num_pts_cfg
)
# get the vocabulary size and number of point queries in each instance
# to eliminate blank text and sample gt according to Bezier control points
parser = default_argument_parser()
# add the following arguments so that demo/demo.py can run without argument-parsing errors
parser.add_argument("--input", nargs="+", help="A list of space separated input images")
parser.add_argument(
"--output",
help="A file or directory to save output visualizations. "
"If not given, will show output in an OpenCV window.",
)
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
args = parser.parse_args()
cfg = get_cfg()
cfg.merge_from_file(args.config_file)
register_all_coco(voc_size_cfg=cfg.MODEL.TRANSFORMER.VOC_SIZE, num_pts_cfg=cfg.MODEL.TRANSFORMER.NUM_POINTS)
import copy
import logging
import os.path as osp
import numpy as np
import torch
from fvcore.common.file_io import PathManager
from PIL import Image
from pycocotools import mask as maskUtils
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.detection_utils import SizeMismatchError
from detectron2.structures import BoxMode
from .augmentation import RandomCropWithInstance
from .detection_utils import (annotations_to_instances, build_augmentation,
transform_instance_annotations)
"""
This file contains the default mapping that's applied to "dataset dicts".
"""
__all__ = ["DatasetMapperWithBasis"]
logger = logging.getLogger(__name__)
def segmToRLE(segm, img_size):
h, w = img_size
if type(segm) == list:
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, h, w)
rle = maskUtils.merge(rles)
elif type(segm["counts"]) == list:
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, h, w)
else:
# rle
rle = segm
return rle
def segmToMask(segm, img_size):
rle = segmToRLE(segm, img_size)
m = maskUtils.decode(rle)
return m
def filter_empty_instances(instances):
"""
Filter out instances with empty boxes in an `Instances` object.
Args:
instances (Instances): the instances to filter
Returns:
Instances: the filtered instances
"""
r = []
r.append(instances.gt_boxes.nonempty())
if not r:
return instances
m = r[0]
for x in r[1:]:
m = m & x
return instances[m]
class DatasetMapperWithBasis(DatasetMapper):
"""
This mapper extends the default Detectron2 mapper to read an additional basis semantic label
"""
def __init__(self, cfg, is_train=True):
super().__init__(cfg, is_train)
# Rebuild augmentations
logger.info(
"Rebuilding the augmentations. The previous augmentations will be overridden."
)
self.augmentation = build_augmentation(cfg, is_train)
if cfg.INPUT.CROP.ENABLED and is_train and cfg.MODEL.TRANSFORMER.BOUNDARY_HEAD:
self.augmentation.insert(
0,
RandomCropWithInstance(
cfg.INPUT.CROP.TYPE,
cfg.INPUT.CROP.SIZE,
cfg.INPUT.CROP.CROP_INSTANCE,
),
)
logging.getLogger(__name__).info(
"Cropping used in training: " + str(self.augmentation[0])
)
if cfg.INPUT.ROTATE and is_train:
if cfg.MODEL.TRANSFORMER.BOUNDARY_HEAD:
self.augmentation.insert(0, T.RandomRotation(angle=[-45, 45]))
else:
self.augmentation.insert(0, T.RandomRotation(angle=[-90, 90]))
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
# USER: Write your own image loading if it's not from a file
try:
image = utils.read_image(
dataset_dict["file_name"], format=self.image_format
)
except Exception as e:
print(dataset_dict["file_name"])
print(e)
raise e
try:
utils.check_image_size(dataset_dict, image)
except SizeMismatchError as e:
expected_wh = (dataset_dict["width"], dataset_dict["height"])
image_wh = (image.shape[1], image.shape[0])
if (image_wh[1], image_wh[0]) == expected_wh:
print("transposing image {}".format(dataset_dict["file_name"]))
image = image.transpose(1, 0, 2)
else:
raise e
######################################################################
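# collect all instance boxes (converted to XYXY_ABS) so that RandomCropWithInstance
# can pick a crop region that contains at least one instance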
boxes = np.asarray(
[
BoxMode.convert(
instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS
)
for instance in dataset_dict["annotations"]
]
)
######################################################################
# aug_input = T.StandardAugInput(image)
aug_input = T.StandardAugInput(image, boxes=boxes)
transforms = aug_input.apply_augmentations(self.augmentation)
image = aug_input.image
image_shape = image.shape[:2] # h, w
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(
np.ascontiguousarray(image.transpose(2, 0, 1))
)
if not self.is_train:
dataset_dict.pop("annotations", None)
dataset_dict.pop("sem_seg_file_name", None)
dataset_dict.pop("pano_seg_file_name", None)
return dataset_dict
if "annotations" in dataset_dict:
# USER: Modify this if you want to keep them for some reason.
for anno in dataset_dict["annotations"]:
if not self.use_instance_mask:
anno.pop("segmentation", None)
if not self.use_keypoint:
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
transform_instance_annotations(
obj,
transforms,
image_shape,
keypoint_hflip_indices=self.keypoint_hflip_indices,
)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = annotations_to_instances(
annos, image_shape, mask_format=self.instance_mask_format
)
# dataset_dict["instances"] = instances
dataset_dict["instances"] = utils.filter_empty_instances(instances)
return dataset_dict
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import io
import logging
import os
from fvcore.common.timer import Timer
from fvcore.common.file_io import PathManager
import numpy as np
from detectron2.structures import BoxMode
import sys
from detectron2.data import DatasetCatalog, MetadataCatalog
"""
This file contains functions to parse COCO-format text annotations into dicts in "Detectron2 format".
"""
logger = logging.getLogger(__name__)
__all__ = ["load_text_json", "register_text_instances"]
def register_text_instances(name, metadata, json_file, image_root, voc_size_cfg, num_pts_cfg):
"""
Register a dataset in json annotation format for text detection and recognition.
Args:
name (str): a name that identifies the dataset, e.g. "totaltext_train".
metadata (dict): extra metadata associated with this dataset. It can be an empty dict.
json_file (str): path to the json instance annotation file.
image_root (str or path-like): directory which contains all the images.
"""
DatasetCatalog.register(
name, lambda: load_text_json(json_file, image_root, name, voc_size_cfg=voc_size_cfg, num_pts_cfg=num_pts_cfg)
)
MetadataCatalog.get(name).set(
json_file=json_file, image_root=image_root, evaluator_type="text", **metadata
)
def load_text_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None, voc_size_cfg=37, num_pts_cfg=25):
"""
Load a json file with totaltext annotation format.
Currently supports text detection and recognition.
Args:
json_file (str): full path to the json file in totaltext annotation format.
image_root (str or path-like): the directory where the images in this json file exist.
dataset_name (str): the name of the dataset (e.g., coco_2017_train).
If provided, this function will also put "thing_classes" into
the metadata associated with this dataset.
extra_annotation_keys (list[str]): list of per-annotation keys that should also be
loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
"category_id", "segmentation"). The values for these keys will be returned as-is.
For example, the densepose annotations are loaded in this way.
Returns:
list[dict]: a list of dicts in Detectron2 standard dataset dicts format. (See
`Using Custom Datasets </tutorials/datasets.html>`_ )
Notes:
1. This function does not read the image files.
The results do not have the "image" field.
"""
from pycocotools.coco import COCO
timer = Timer()
json_file = PathManager.get_local_path(json_file)
with contextlib.redirect_stdout(io.StringIO()):
coco_api = COCO(json_file)
if timer.seconds() > 1:
logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
id_map = None
if dataset_name is not None:
meta = MetadataCatalog.get(dataset_name)
cat_ids = sorted(coco_api.getCatIds())
# print(f'cat_ids: {cat_ids}')
cats = coco_api.loadCats(cat_ids)
# print(f'cats: {cats}')
# The categories in a custom json file may not be sorted.
thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
# print(f'thing_classes: {thing_classes}')
meta.thing_classes = thing_classes
# In COCO, certain category ids are artificially removed,
# and by convention they are always ignored.
# We deal with COCO's id issue and translate
# the category ids to contiguous ids in [0, 80).
# It works by looking at the "categories" field in the json, therefore
# if users' own json also have incontiguous ids, we'll
# apply this mapping as well but print a warning.
if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
if "coco" not in dataset_name:
logger.warning(
"""
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
"""
)
id_map = {v: i for i, v in enumerate(cat_ids)}
meta.thing_dataset_id_to_contiguous_id = id_map
# sort indices for reproducible results
img_ids = sorted(coco_api.imgs.keys())
# imgs is a list of dicts, each looks something like:
# {'license': 4,
# 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
# 'file_name': 'COCO_val2014_000000001268.jpg',
# 'height': 427,
# 'width': 640,
# 'date_captured': '2013-11-17 05:57:24',
# 'id': 1268}
imgs = coco_api.loadImgs(img_ids)
# anns is a list[list[dict]], where each dict is an annotation
# record for an object. The inner list enumerates the objects in an image
# and the outer list enumerates over images. Example of anns[0]:
# [{'segmentation': [[192.81,
# 247.09,
# ...
# 219.03,
# 249.06]],
# 'area': 1035.749,
# 'rec': [84, 72, ... 96],
# 'bezier_pts': [169.0, 425.0, ..., ]
# 'iscrowd': 0,
# 'image_id': 1268,
# 'bbox': [192.81, 224.8, 74.73, 33.43],
# 'category_id': 16,
# 'id': 42986},
# ...]
anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
if "minival" not in json_file:
# The popular valminusminival & minival annotations for COCO2014 contain this bug.
# However the ratio of buggy annotations there is tiny and does not affect accuracy.
# Therefore we explicitly white-list them.
ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
json_file
)
imgs_anns = list(zip(imgs, anns))
logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
dataset_dicts = []
ann_keys = ["iscrowd", "category_id"] + (extra_annotation_keys or [])
num_instances_without_valid_segmentation = 0
for (img_dict, anno_dict_list) in imgs_anns:
record = {}
record["file_name"] = os.path.join(image_root, img_dict["file_name"])
record["height"] = img_dict["height"]
record["width"] = img_dict["width"]
image_id = record["image_id"] = img_dict["id"]
objs = []
for anno in anno_dict_list:
# Check that the image_id in this annotation is the same as
# the image_id we're looking at.
# This fails only when the data parsing logic or the annotation file is buggy.
# The original COCO valminusminival2014 & minival2014 annotation files
# actually contains bugs that, together with certain ways of using COCO API,
# can trigger this assertion.
assert anno["image_id"] == image_id
assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'
obj = {key: anno[key] for key in ann_keys if key in anno}
segm = anno.get("segmentation", None)
if segm: # either list[list[float]] or dict(RLE)
if not isinstance(segm, dict):
# filter out invalid polygons (< 3 points)
segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
if len(segm) == 0:
num_instances_without_valid_segmentation += 1
continue # ignore this instance
obj["segmentation"] = segm
bboxs = anno.get("bbox", None)
if bboxs:
obj["bbox"] = bboxs
obj["bbox_mode"] = BoxMode.XYWH_ABS
bezierpts = anno.get("bezier_pts", None)
if bezierpts:
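# "bezier_pts" holds 16 floats: two cubic Bezier curves (the two long sides of the text
# boundary), each with 4 (x, y) control points; the second curve is reversed so that
# averaging it with the first gives the control points of the center curve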
bezierpts = np.array(bezierpts).reshape(-1, 2)
center_bezierpts = (bezierpts[:4] + bezierpts[4:][::-1, :]) / 2
obj["beziers"] = center_bezierpts
bezierpts = bezierpts.reshape(2, 4, 2).transpose(0, 2, 1).reshape(4, 4)
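# sample num_pts_cfg points on each curve with the cubic Bernstein basis:
# B(u) = (1-u)^3 P0 + 3u(1-u)^2 P1 + 3u^2(1-u) P2 + u^3 P3, u in [0, 1]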
u = np.linspace(0, 1, num_pts_cfg)
boundary = np.outer((1 - u) ** 3, bezierpts[:, 0]) \
+ np.outer(3 * u * ((1 - u) ** 2), bezierpts[:, 1]) \
+ np.outer(3 * (u ** 2) * (1 - u), bezierpts[:, 2]) \
+ np.outer(u ** 3, bezierpts[:, 3])
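# "boundary": for each sampled position, the pair of points on the two long sides
# (the second curve is reversed so index i of both curves refers to the same position);
# "polyline": the center line, i.e. the pointwise average of the two sides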
obj["boundary"] = np.hstack([boundary[:, :2], boundary[:, 2:][::-1, :]]).reshape(-1, 2)
obj["polyline"] = (boundary[:, :2] + boundary[:, 2:][::-1, :]) / 2
text = anno.get("rec", None)
if text:
text_check = np.array(text)
text_check = np.sum(text_check != voc_size_cfg)
# skip instances whose transcription is empty (every index equals voc_size_cfg, the padding value)
if text_check == 0:
continue
obj["text"] = text
if id_map:
obj["category_id"] = id_map[obj["category_id"]]
objs.append(obj)
if objs == []:
if 'test' not in dataset_name and 'val' not in dataset_name:
continue
record["annotations"] = objs
dataset_dicts.append(record)
if num_instances_without_valid_segmentation > 0:
logger.warning(
"Filtered out {} instances without valid segmentation. "
"There might be issues in your dataset generation process.".format(
num_instances_without_valid_segmentation
)
)
return dataset_dicts
\ No newline at end of file
import logging
import numpy as np
import torch
from detectron2.structures import Instances
from detectron2.data import transforms as T
from detectron2.data.detection_utils import \
annotations_to_instances as d2_anno_to_inst
from detectron2.data.detection_utils import \
transform_instance_annotations as d2_transform_inst_anno
from .augmentation import Pad
import random
def transform_instance_annotations(
annotation, transforms, image_size, *, keypoint_hflip_indices=None
):
annotation = d2_transform_inst_anno(
annotation,
transforms,
image_size,
keypoint_hflip_indices=keypoint_hflip_indices,
)
if "beziers" in annotation:
beziers = transform_ctrl_pnts_annotations(annotation["beziers"], transforms)
annotation["beziers"] = beziers
if "polyline" in annotation:
polys = transform_ctrl_pnts_annotations(annotation["polyline"], transforms)
annotation["polyline"] = polys
if "boundary" in annotation:
boundary = transform_ctrl_pnts_annotations(annotation["boundary"], transforms)
annotation["boundary"] = boundary
return annotation
def transform_ctrl_pnts_annotations(pnts, transforms):
"""
Transform control-point annotations (beziers / polyline / boundary) of an image.
Args:
pnts (list[float] or ndarray): control points, reshaped internally to (N, 2).
transforms (TransformList):
"""
# (N*2,) -> (N, 2)
pnts = np.asarray(pnts, dtype="float64").reshape(-1, 2)
pnts = transforms.apply_coords(pnts).reshape(-1)
# This assumes that HorizFlipTransform is the only one that does flip
do_hflip = (
sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
)
if do_hflip:
raise ValueError("Flipping text data is not supported (also disencouraged).")
return pnts
def annotations_to_instances(annos, image_size, mask_format="polygon"):
"""for line only annotations"""
# instance = Instances(image_size)
#
# classes = [int(obj["category_id"]) for obj in annos]
# classes = torch.tensor(classes, dtype=torch.int64)
# instance.gt_classes = classes
instance = d2_anno_to_inst(annos, image_size, mask_format)
if not annos:
return instance
# add attributes
if "beziers" in annos[0]:
beziers = [obj.get("beziers", []) for obj in annos]
instance.beziers = torch.as_tensor(beziers, dtype=torch.float32)
if "polyline" in annos[0]:
polys = [obj.get("polyline", []) for obj in annos]
instance.polyline = torch.as_tensor(polys, dtype=torch.float32)
if "boundary" in annos[0]:
boundary = [obj.get("boundary", []) for obj in annos]
instance.boundary = torch.as_tensor(boundary, dtype=torch.float32)
if "text" in annos[0]:
texts = [obj.get("text", []) for obj in annos]
instance.texts = torch.as_tensor(texts, dtype=torch.int32)
return instance
def build_augmentation(cfg, is_train):
"""
Build the train/test augmentation list without horizontal flip (flipping text is not supported).
Returns:
list[Augmentation]
"""
if is_train:
min_size = cfg.INPUT.MIN_SIZE_TRAIN
max_size = cfg.INPUT.MAX_SIZE_TRAIN
sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
else:
min_size = cfg.INPUT.MIN_SIZE_TEST
max_size = cfg.INPUT.MAX_SIZE_TEST
sample_style = "choice"
if sample_style == "range":
assert (
len(min_size) == 2
), "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
logger = logging.getLogger(__name__)
augmentation = []
augmentation.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
if is_train:
augmentation.append(T.RandomContrast(0.3, 1.7))
augmentation.append(T.RandomBrightness(0.3, 1.7))
augmentation.append(T.RandomLighting(random.random() + 0.5))
augmentation.append(T.RandomSaturation(0.3, 1.7))
logger.info("Augmentations used in training: " + str(augmentation))
if cfg.MODEL.BACKBONE.NAME == "build_vitaev2_backbone":
augmentation.append(Pad(divisible_size=32))
return augmentation
build_transform_gen = build_augmentation
"""
Alias for backward-compatibility.
"""