Commit bfd78461 authored by Sam Tsai's avatar Sam Tsai Committed by Facebook GitHub Bot
Browse files

registry and copy keys for extended coco load

Summary:
1. Add a registry for coco injection to allow easier overriding of coco injections
2. Coco loading is currently limited to certain keys. Add an option that allows copying specified keys from the image entries into the output records.

Reviewed By: zhanghang1989

Differential Revision: D33132517

fbshipit-source-id: 57ac4994a66f9c75457cada7e85fb15da4818f3e
parent f3a4a534
...@@ -28,6 +28,7 @@ def add_d2go_data_default_configs(_C): ...@@ -28,6 +28,7 @@ def add_d2go_data_default_configs(_C):
_C.D2GO_DATA.DATASETS.COCO_INJECTION.IM_DIRS = [] _C.D2GO_DATA.DATASETS.COCO_INJECTION.IM_DIRS = []
_C.D2GO_DATA.DATASETS.COCO_INJECTION.JSON_FILES = [] _C.D2GO_DATA.DATASETS.COCO_INJECTION.JSON_FILES = []
_C.D2GO_DATA.DATASETS.COCO_INJECTION.KEYPOINT_METADATA = [] _C.D2GO_DATA.DATASETS.COCO_INJECTION.KEYPOINT_METADATA = []
_C.D2GO_DATA.DATASETS.COCO_INJECTION.REGISTER_FUNCTION = "_register_extended_coco"
# On-the-fly register a list of datasets located under detectron2go/datasets # On-the-fly register a list of datasets located under detectron2go/datasets
# by specifying the filename (without .py). # by specifying the filename (without .py).
......
...@@ -10,6 +10,7 @@ import os ...@@ -10,6 +10,7 @@ import os
from d2go.utils.helper import get_dir_path from d2go.utils.helper import get_dir_path
from d2go.utils.misc import fb_overwritable from d2go.utils.misc import fb_overwritable
from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.registry import Registry
from .extended_coco import coco_text_load, extended_coco_load from .extended_coco import coco_text_load, extended_coco_load
from .extended_lvis import extended_lvis_load from .extended_lvis import extended_lvis_load
...@@ -22,6 +23,14 @@ D2GO_DATASETS_BASE_MODULE = "d2go.datasets" ...@@ -22,6 +23,14 @@ D2GO_DATASETS_BASE_MODULE = "d2go.datasets"
IM_DIR = "image_directory" IM_DIR = "image_directory"
ANN_FN = "annotation_file" ANN_FN = "annotation_file"
# Registry of dataset-registration functions for coco injection. The function
# to use is selected by name through
# cfg.D2GO_DATA.DATASETS.COCO_INJECTION.REGISTER_FUNCTION, which lets projects
# override how injected coco datasets are registered.
COCO_REGISTER_FUNCTION_REGISTRY = Registry("COCO_REGISTER_FUNCTION_REGISTRY")
COCO_REGISTER_FUNCTION_REGISTRY.__doc__ = "Registry - coco register function"
def get_coco_register_function(cfg):
    """Return the coco dataset-registration function selected by the config.

    Looks up COCO_REGISTER_FUNCTION_REGISTRY by the name stored in
    cfg.D2GO_DATA.DATASETS.COCO_INJECTION.REGISTER_FUNCTION.
    """
    return COCO_REGISTER_FUNCTION_REGISTRY.get(
        cfg.D2GO_DATA.DATASETS.COCO_INJECTION.REGISTER_FUNCTION
    )
def _import_dataset(module_name): def _import_dataset(module_name):
return importlib.import_module( return importlib.import_module(
...@@ -29,6 +38,7 @@ def _import_dataset(module_name): ...@@ -29,6 +38,7 @@ def _import_dataset(module_name):
) )
@COCO_REGISTER_FUNCTION_REGISTRY.register()
def _register_extended_coco(dataset_name, split_dict): def _register_extended_coco(dataset_name, split_dict):
json_file = split_dict[ANN_FN] json_file = split_dict[ANN_FN]
image_root = split_dict[IM_DIR] image_root = split_dict[IM_DIR]
...@@ -113,6 +123,7 @@ def inject_coco_datasets(cfg): ...@@ -113,6 +123,7 @@ def inject_coco_datasets(cfg):
im_dirs = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.IM_DIRS im_dirs = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.IM_DIRS
json_files = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.JSON_FILES json_files = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.JSON_FILES
metadata_type = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.KEYPOINT_METADATA metadata_type = cfg.D2GO_DATA.DATASETS.COCO_INJECTION.KEYPOINT_METADATA
_register_func = get_coco_register_function(cfg)
assert len(names) == len(im_dirs) == len(json_files) assert len(names) == len(im_dirs) == len(json_files)
for ds_index, (name, im_dir, json_file) in enumerate( for ds_index, (name, im_dir, json_file) in enumerate(
...@@ -122,7 +133,7 @@ def inject_coco_datasets(cfg): ...@@ -122,7 +133,7 @@ def inject_coco_datasets(cfg):
if len(metadata_type) != 0: if len(metadata_type) != 0:
split_dict["meta_data"] = get_keypoint_metadata(metadata_type[ds_index]) split_dict["meta_data"] = get_keypoint_metadata(metadata_type[ds_index])
logger.info("Inject coco dataset {}: {}".format(name, split_dict)) logger.info("Inject coco dataset {}: {}".format(name, split_dict))
_register_extended_coco(name, split_dict) _register_func(name, split_dict)
def register_dataset_split(dataset_name, split_dict): def register_dataset_split(dataset_name, split_dict):
......
...@@ -7,6 +7,7 @@ import logging ...@@ -7,6 +7,7 @@ import logging
import shlex import shlex
import subprocess import subprocess
from collections import defaultdict from collections import defaultdict
from typing import Optional, List, Dict
import detectron2.utils.comm as comm import detectron2.utils.comm as comm
from detectron2.data import MetadataCatalog from detectron2.data import MetadataCatalog
...@@ -43,7 +44,7 @@ class InMemoryCOCO(COCO): ...@@ -43,7 +44,7 @@ class InMemoryCOCO(COCO):
self.createIndex() self.createIndex()
def extract_archive_file(archive_fn, im_dir): def extract_archive_file(archive_fn: str, im_dir: str):
if not os.path.exists(im_dir) or not os.listdir(im_dir): if not os.path.exists(im_dir) or not os.listdir(im_dir):
# Dataset is not deployed. Deploy it. # Dataset is not deployed. Deploy it.
archive_fns = archive_fn archive_fns = archive_fn
...@@ -71,8 +72,12 @@ def extract_archive_file(archive_fn, im_dir): ...@@ -71,8 +72,12 @@ def extract_archive_file(archive_fn, im_dir):
def convert_coco_text_to_coco_detection_json( def convert_coco_text_to_coco_detection_json(
source_json, target_json, set_type=None, min_img_size=100, text_cat_id=1 source_json: str,
): target_json: str,
set_type: Optional[str] = None,
min_img_size: int = 100,
text_cat_id: int = 1,
) -> Dict:
""" """
This function converts a COCOText style JSON to a COCODetection style This function converts a COCOText style JSON to a COCODetection style
JSON. JSON.
...@@ -125,7 +130,7 @@ def convert_coco_text_to_coco_detection_json( ...@@ -125,7 +130,7 @@ def convert_coco_text_to_coco_detection_json(
return coco_text_json return coco_text_json
def valid_bbox(bbox_xywh, img_w, img_h): def valid_bbox(bbox_xywh: List[int], img_w: int, img_h: int) -> bool:
if ( if (
bbox_xywh is None bbox_xywh is None
or (bbox_xywh[3] == 0 or bbox_xywh[2] == 0) or (bbox_xywh[3] == 0 or bbox_xywh[2] == 0)
...@@ -136,7 +141,14 @@ def valid_bbox(bbox_xywh, img_w, img_h): ...@@ -136,7 +141,14 @@ def valid_bbox(bbox_xywh, img_w, img_h):
return True return True
def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None): def convert_to_dict_list(
image_root: str,
id_map: Dict,
imgs: Dict,
anns: Dict,
dataset_name: Optional[str] = None,
image_direct_copy_keys: List[str] = None,
) -> List[Dict]:
num_instances_without_valid_segmentation = 0 num_instances_without_valid_segmentation = 0
num_instances_without_valid_bounding_box = 0 num_instances_without_valid_bounding_box = 0
dataset_dicts = [] dataset_dicts = []
...@@ -160,6 +172,13 @@ def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None): ...@@ -160,6 +172,13 @@ def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None):
) )
record["file_name"] = img_dict["file_name"] record["file_name"] = img_dict["file_name"]
if image_direct_copy_keys:
for copy_key in image_direct_copy_keys:
assert (
copy_key in img_dict
), f"{copy_key} not in coco image dictionary entry"
record[copy_key] = img_dict[copy_key]
if "height" in img_dict or "width" in img_dict: if "height" in img_dict or "width" in img_dict:
record["height"] = img_dict["height"] record["height"] = img_dict["height"]
record["width"] = img_dict["width"] record["width"] = img_dict["width"]
...@@ -265,12 +284,12 @@ def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None): ...@@ -265,12 +284,12 @@ def convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name=None):
def coco_text_load( def coco_text_load(
coco_json_file, coco_json_file: str,
image_root, image_root: str,
source_json_file=None, source_json_file: Optional[str] = None,
dataset_name=None, dataset_name: Optional[str] = None,
archive_file=None, archive_file: Optional[str] = None,
): ) -> List[Dict]:
if archive_file is not None: if archive_file is not None:
if comm.get_rank() == 0: if comm.get_rank() == 0:
extract_archive_file(archive_file, image_root) extract_archive_file(archive_file, image_root)
...@@ -288,7 +307,13 @@ def coco_text_load( ...@@ -288,7 +307,13 @@ def coco_text_load(
) )
def extended_coco_load(json_file, image_root, dataset_name=None, loaded_json=None): def extended_coco_load(
json_file: str,
image_root: str,
dataset_name: Optional[str] = None,
loaded_json: Optional[str] = None,
image_direct_copy_keys: List[str] = None,
) -> List[Dict]:
""" """
Load a json file with COCO's annotation format. Load a json file with COCO's annotation format.
Currently only supports instance segmentation annotations. Currently only supports instance segmentation annotations.
...@@ -352,7 +377,14 @@ def extended_coco_load(json_file, image_root, dataset_name=None, loaded_json=Non ...@@ -352,7 +377,14 @@ def extended_coco_load(json_file, image_root, dataset_name=None, loaded_json=Non
logger.info("Loaded {} images from {}".format(len(imgs), json_file)) logger.info("Loaded {} images from {}".format(len(imgs), json_file))
# Return the coco converted to record list # Return the coco converted to record list
return convert_to_dict_list(image_root, id_map, imgs, anns, dataset_name) return convert_to_dict_list(
image_root,
id_map,
imgs,
anns,
dataset_name,
image_direct_copy_keys=image_direct_copy_keys,
)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import copy import copy
import json import json
import os import os
import tempfile
import unittest import unittest
import d2go.data.extended_coco as extended_coco import d2go.data.extended_coco as extended_coco
...@@ -175,6 +176,29 @@ class TestD2GoDatasets(unittest.TestCase): ...@@ -175,6 +176,29 @@ class TestD2GoDatasets(unittest.TestCase):
self.assertEqual(dic["width"], 80) self.assertEqual(dic["width"], 80)
self.assertEqual(dic["height"], 60) self.assertEqual(dic["height"], 60)
@tempdir
def test_direct_copy_keys(self, tmp_dir):
    """extended_coco_load copies per-image keys only when explicitly requested."""
    image_dir, json_file = create_test_images_and_dataset_json(tmp_dir)

    with tempfile.NamedTemporaryFile(prefix=tmp_dir, suffix=".json") as h_temp:
        augmented_json = h_temp.name
        # Annotate every image entry with two extra keys.
        with open(json_file, "r") as h_in:
            dataset = json.load(h_in)
            for idx, img_entry in enumerate(dataset["images"]):
                img_entry["key1"] = idx
                img_entry["key2"] = idx
        with open(augmented_json, "w") as h_out:
            json.dump(dataset, h_out)

        # By default the extra keys are dropped from the loaded records.
        records = extended_coco.extended_coco_load(augmented_json, image_dir)
        self.assertTrue("key1" not in records[0])
        self.assertTrue("key2" not in records[0])

        # Requesting "key1" copies it through; "key2" is still dropped.
        records = extended_coco.extended_coco_load(
            augmented_json, image_dir, image_direct_copy_keys=["key1"]
        )
        self.assertTrue("key1" in records[0])
        self.assertTrue("key2" not in records[0])
@tempdir @tempdir
def test_sub_dataset(self, tmp_dir): def test_sub_dataset(self, tmp_dir):
image_dir, json_file = create_test_images_and_dataset_json(tmp_dir) image_dir, json_file = create_test_images_and_dataset_json(tmp_dir)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment