Commit f37f9c2a authored by zhe chen

Release code for iNaturalist 2018 (#197)

parent cfd24625
@@ -167,6 +167,27 @@ We use standard ImageNet dataset, you can download it from http://image-net.org/
</details>
<details>
<summary>iNaturalist 2018</summary>
- For iNaturalist 2018, please download the dataset from the [official repository](https://github.com/visipedia/inat_comp/blob/master/2018/README.md).
The file structure should look like:
```bash
$ tree inat2018/
inat2018/
├── categories.json
├── test2018
├── test2018.json
├── train2018.json
├── train2018_locations.json
├── val2018
├── val2018.json
└── val2018_locations.json
```
</details>
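The loader added in this commit reads these files directly: the COCO-style `train2018.json`/`val2018.json` (with `images`, `annotations`, and `categories`) plus the parallel `*_locations.json` files carrying capture date and coordinates. A quick sanity check of a fresh download, using only fields the new dataset code relies on (the path is illustrative):

```python
import json

root = 'data/inat2018'  # wherever the dataset was extracted

with open(f'{root}/train2018.json') as f:
    train = json.load(f)
with open(f'{root}/train2018_locations.json') as f:
    locations = json.load(f)

print(len(train['categories']))                    # 8142 classes
print(len(train['annotations']), len(locations))   # one location record per annotation
print(train['images'][0]['file_name'])             # relative image path
print(locations[0]['date'], locations[0]['lat'], locations[0]['lon'])
```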
## Released Models
<details open>
@@ -204,6 +225,19 @@ We use standard ImageNet dataset, you can download it from http://image-net.org/
</details>
<details open>
<summary> iNaturalist 2018 Image Classification </summary>
<br>
<div>
| name | pretrain | resolution | acc@1 | #param | download |
| :-----------: | :--------: | :--------: | :---: | :----: | :-----------------------------------------------------------------------------: |
| InternImage-H | Joint 427M | 384x384 | 92.6 | 1.1B | [ckpt](<>) \| [cfg](configs/inaturalist2018/internimage_h_22ktoinat18_384.yaml) |
</div>
</details>
## Evaluation
To evaluate a pretrained `InternImage` on ImageNet val, run:
...
DATA:
  IMG_SIZE: 384
  IMG_ON_MEMORY: False
  DATASET: inat18
AUG:
  MIXUP: 0.0
  CUTMIX: 0.0
  REPROB: 0.0
MODEL:
  TYPE: intern_image_meta_former
  DROP_PATH_RATE: 0.6
  LABEL_SMOOTHING: 0.3
  INTERN_IMAGE:
    CORE_OP: 'DCNv3'
    ...
    LEVEL2_POST_NORM_BLOCK_IDS: [5, 11, 17, 23, 29]
    CENTER_FEATURE_SCALE: True
    USE_CLIP_PROJECTOR: True
  PRETRAINED: 'pretrained/internimage_h_jointto22k_384.pth'
TRAIN:
  EMA:
    ENABLE: true
    DECAY: 0.9999
  EPOCHS: 100
  WARMUP_EPOCHS: 0
  WEIGHT_DECAY: 0.05
  BASE_LR: 2e-05 # 512
  WARMUP_LR: .0
  MIN_LR: .0
  LR_LAYER_DECAY: true
  LR_LAYER_DECAY_RATIO: 0.9
  RAND_INIT_FT_HEAD: true
  USE_CHECKPOINT: false
  OPTIMIZER:
    DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
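This YAML (presumably the configs/inaturalist2018/internimage_h_22ktoinat18_384.yaml file referenced in the README above) fine-tunes the joint-22k InternImage-H checkpoint for 100 epochs with EMA, layer-wise LR decay, and a randomly re-initialized classification head. The `# 512` next to `BASE_LR` suggests the usual linear learning-rate scaling against a reference total batch size of 512; a minimal sketch of that convention, assuming the repo scales linearly (check the optimizer/scheduler code before relying on it):

```python
# Assumed convention: BASE_LR is defined for a total batch size of 512 and
# scaled linearly to the batch size actually used.
BASE_LR = 2e-05
REFERENCE_BATCH = 512

def scaled_lr(batch_per_gpu: int, num_gpus: int, accumulation_steps: int = 1) -> float:
    total_batch = batch_per_gpu * num_gpus * accumulation_steps
    return BASE_LR * total_batch / REFERENCE_BATCH

print(scaled_lr(batch_per_gpu=16, num_gpus=8))  # 128/512 of the base rate -> 5e-06
```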
@@ -12,7 +12,9 @@ import torch.distributed as dist
from timm.data import Mixup, create_transform
from torchvision import transforms

from .cached_image_folder import (CachedImageFolder, ImageCephDataset,
                                  INat18ImageCephDataset,
                                  INat18ParserCephImage)
from .samplers import NodeDistributedSampler, SubsetRandomSampler

try:
@@ -229,6 +231,15 @@ def build_dataset(split, config):
            root = os.path.join(config.DATA.DATA_PATH, 'val')
            dataset = ImageCephDataset(root, 'val', transform=transform)
        nb_classes = 1000
    elif config.DATA.DATASET == 'inat18':
        if prefix == 'train' and not config.EVAL_MODE:
            root = config.DATA.DATA_PATH
            dataset = INat18ImageCephDataset(
                root, 'train', transform=transform, on_memory=config.DATA.IMG_ON_MEMORY)
        elif prefix == 'val':
            root = config.DATA.DATA_PATH
            dataset = INat18ImageCephDataset(root, 'val', transform=transform)
        nb_classes = 8142
    else:
        raise NotImplementedError(
            f'build_dataset does support {config.DATA.DATASET}')
...
@@ -340,6 +340,55 @@ class ImageCephDataset(data.Dataset):
        return self.parser.filenames(basename, absolute)


class INat18ImageCephDataset(data.Dataset):

    def __init__(self,
                 root,
                 split,
                 parser=None,
                 transform=None,
                 target_transform=None,
                 on_memory=False):
        if split == 'train':
            annotation_root = osp.join(root, 'train2018.json')
        elif split == 'val':
            annotation_root = osp.join(root, 'val2018.json')
        elif split == 'test':
            annotation_root = osp.join(root, 'test2018.json')
        if parser is None or isinstance(parser, str):
            parser = INat18ParserCephImage(root=root,
                                           split=split,
                                           annotation_root=annotation_root,
                                           on_memory=on_memory)
        self.parser = parser
        self.transform = transform
        self.target_transform = target_transform
        self._consecutive_errors = 0

    def __getitem__(self, index):
        img, temporal_info, spatial_info, target = self.parser[index]
        self._consecutive_errors = 0
        if self.transform is not None:
            img = self.transform(img)
        if target is None:
            target = -1
        elif self.target_transform is not None:
            target = self.target_transform(target)
        temporal_info = torch.tensor(temporal_info).to(torch.float32)
        spatial_info = torch.tensor(spatial_info).to(torch.float32)
        return [img, temporal_info, spatial_info], target

    def __len__(self):
        return len(self.parser)

    def filename(self, index, basename=False, absolute=False):
        return self.parser.filename(index, basename, absolute)

    def filenames(self, basename=False, absolute=False):
        return self.parser.filenames(basename, absolute)
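A minimal usage sketch of the new dataset class (the real pipeline builds the transform from the config; the path and transform here are illustrative):

```python
from torchvision import transforms

# INat18ImageCephDataset as defined above; the exact import path depends on
# where cached_image_folder.py sits in the repo.
transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
])

dataset = INat18ImageCephDataset('data/inat2018', 'val', transform=transform)
(img, temporal_info, spatial_info), target = dataset[0]
print(img.shape, temporal_info.shape, spatial_info.shape, target)  # target in [0, 8141]
```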
class Parser:

    def __init__(self):
@@ -372,7 +421,7 @@ class ParserCephImage(Parser):
        self.file_client = None
        self.kwargs = kwargs
        self.root = root
        if '22k' in root:
            self.io_backend = 'petrel'
            with open(osp.join(annotation_root, '22k_class_to_idx.json'),
@@ -497,7 +546,7 @@ class ParserCephImage(Parser):
            else:
                target = int(target)
        except:
            print(filepath, target)
            exit()
        return img, target
@@ -512,6 +561,87 @@ class ParserCephImage(Parser):
        return filename


class INat18ParserCephImage(Parser):

    def __init__(self,
                 root,
                 split,
                 annotation_root,
                 on_memory=False,
                 **kwargs):
        super().__init__()
        self.file_client = None
        self.kwargs = kwargs
        self.split = split
        self.root = root
        self.io_backend = 'disk'

        data = mmcv.load(annotation_root)
        self.samples = data['annotations']
        self.file_names = [each['file_name'] for each in data['images']]
        self.meta_data = mmcv.load(
            annotation_root.replace('2018.json', '2018_locations.json'))
        self.class_to_idx = {}
        for i, each in enumerate(data['categories']):
            self.class_to_idx[each['id']] = i

        self.on_memory = on_memory
        self._consecutive_errors = 0
        # TODO: support on_memory function

    def __getitem__(self, index):
        if self.file_client is None:
            self.file_client = FileClient(self.io_backend, **self.kwargs)

        anns = self.samples[index]
        filename = self.file_names[index]
        img_id = anns['image_id']
        target = anns['category_id']

        # load meta information from json file
        meta = self.meta_data[index]
        date = meta['date']
        latitude = meta['lat']
        longitude = meta['lon']
        location_uncertainty = meta['loc_uncert']
        temporal_info = get_temporal_info(date, miss_hour=True)
        spatial_info = get_spatial_info(latitude, longitude)

        filepath = osp.join(self.root, filename)
        try:
            if self.on_memory:
                img_bytes = self.holder[filepath]
            else:
                img_bytes = self.file_client.get(filepath)
            img = mmcv.imfrombytes(img_bytes)[:, :, ::-1]
        except Exception as e:
            _logger.warning(
                f'Skipped sample (index {index}, file {filepath}). {str(e)}')
            self._consecutive_errors += 1
            if self._consecutive_errors < _ERROR_RETRY:
                return self.__getitem__((index + 1) % len(self))
            else:
                raise e
        self._consecutive_errors = 0

        img = Image.fromarray(img)
        if self.class_to_idx is not None:
            target = self.class_to_idx[target]
        else:
            target = int(target)
        return img, temporal_info, spatial_info, target

    def __len__(self):
        return len(self.samples)
    def _filename(self, index, basename=False, absolute=False):
        # self.samples holds annotation dicts, so take the relative path from file_names
        filename = self.file_names[index]
        filename = osp.join(self.root, filename)
        return filename


def get_temporal_info(date, miss_hour=False):
    try:
        if date:
...
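The bodies of `get_temporal_info` and `get_spatial_info` are truncated here. Encoders of this kind commonly map the capture date and latitude/longitude onto periodic sin/cos features; the snippet below is a generic illustration of that idea with made-up helper names, not the repository's implementation:

```python
import math
from datetime import datetime

def cyclical_date_features(date_str):
    # Illustrative: encode day-of-year on the unit circle so Dec 31 and Jan 1 stay close.
    if not date_str:
        return [0.0, 0.0]
    day = datetime.strptime(date_str[:10], '%Y-%m-%d').timetuple().tm_yday
    angle = 2 * math.pi * (day - 1) / 365.0
    return [math.sin(angle), math.cos(angle)]

def cyclical_location_features(lat, lon):
    # Illustrative: periodic encoding of latitude/longitude.
    if lat is None or lon is None:
        return [0.0, 0.0, 0.0, 0.0]
    lat_r = math.pi * lat / 90.0
    lon_r = math.pi * lon / 180.0
    return [math.sin(lat_r), math.cos(lat_r), math.sin(lon_r), math.cos(lon_r)]
```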
@@ -74,8 +74,7 @@ def parse_option():
                        type=str,
                        help='dataset name',
                        default=None)
    parser.add_argument('--data-path', type=str, help='path to dataset')
    parser.add_argument('--zip',
                        action='store_true',
                        help='use zipped dataset instead of folder dataset')
@@ -146,6 +145,9 @@ def throughput(data_loader, model, logger):
    model.eval()
    for idx, (images, _) in enumerate(data_loader):
        if type(images) == list:
            images = [item.cuda(non_blocking=True) for item in images]
        else:
            images = images.cuda(non_blocking=True)
        batch_size = images.shape[0]
        for i in range(50):
@@ -403,6 +405,9 @@ def train_one_epoch(config,
    amp_type = torch.float16 if config.AMP_TYPE == 'float16' else torch.bfloat16
    for idx, (samples, targets) in enumerate(data_loader):
        iter_begin_time = time.time()
        if type(samples) == list:
            samples = [item.cuda(non_blocking=True) for item in samples]
        else:
            samples = samples.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)
@@ -528,6 +533,9 @@ def validate(config, data_loader, model, epoch=None):
    end = time.time()
    for idx, (images, target) in enumerate(data_loader):
        if type(images) == list:
            images = [item.cuda(non_blocking=True) for item in images]
        else:
            images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        output = model(images)
...
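Because the iNaturalist loader yields `[images, temporal_info, spatial_info]` instead of a single tensor, the three loops above branch on `type(images) == list` before moving data to the GPU and then hand the list to the model unchanged. The same pattern, factored into a helper purely for illustration (not code from the commit):

```python
def to_cuda(batch):
    # Move a tensor, or every tensor in an [img, temporal, spatial] list, to the GPU.
    if isinstance(batch, list):
        return [item.cuda(non_blocking=True) for item in batch]
    return batch.cuda(non_blocking=True)
```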
@@ -5,6 +5,7 @@
# --------------------------------------------------------
from .intern_image import InternImage
from .intern_image_meta_former import InternImageMetaFormer


def build_model(config):
@@ -30,6 +31,27 @@ def build_model(config):
            center_feature_scale=config.MODEL.INTERN_IMAGE.CENTER_FEATURE_SCALE, # for InternImage-H/G
            remove_center=config.MODEL.INTERN_IMAGE.REMOVE_CENTER,
        )
    elif model_type == 'intern_image_meta_former':
        model = InternImageMetaFormer(
            core_op=config.MODEL.INTERN_IMAGE.CORE_OP,
            num_classes=config.MODEL.NUM_CLASSES,
            channels=config.MODEL.INTERN_IMAGE.CHANNELS,
            depths=config.MODEL.INTERN_IMAGE.DEPTHS,
            groups=config.MODEL.INTERN_IMAGE.GROUPS,
            layer_scale=config.MODEL.INTERN_IMAGE.LAYER_SCALE,
            offset_scale=config.MODEL.INTERN_IMAGE.OFFSET_SCALE,
            post_norm=config.MODEL.INTERN_IMAGE.POST_NORM,
            mlp_ratio=config.MODEL.INTERN_IMAGE.MLP_RATIO,
            with_cp=config.TRAIN.USE_CHECKPOINT,
            drop_path_rate=config.MODEL.DROP_PATH_RATE,
            res_post_norm=config.MODEL.INTERN_IMAGE.RES_POST_NORM, # for InternImage-H/G
            dw_kernel_size=config.MODEL.INTERN_IMAGE.DW_KERNEL_SIZE, # for InternImage-H/G
            use_clip_projector=config.MODEL.INTERN_IMAGE.USE_CLIP_PROJECTOR, # for InternImage-H/G
            level2_post_norm=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM, # for InternImage-H/G
            level2_post_norm_block_ids=config.MODEL.INTERN_IMAGE.LEVEL2_POST_NORM_BLOCK_IDS, # for InternImage-H/G
            center_feature_scale=config.MODEL.INTERN_IMAGE.CENTER_FEATURE_SCALE, # for InternImage-H/G
            remove_center=config.MODEL.INTERN_IMAGE.REMOVE_CENTER,
        )
    else:
        raise NotImplementedError(f'Unkown model: {model_type}')
...
This diff is collapsed.
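The new `intern_image_meta_former` module itself is collapsed in this view. From what is visible elsewhere in the commit, the dataset yields `[img, temporal_info, spatial_info]` and the training/validation loops pass that list straight to the model, so the model presumably unpacks it and fuses the metadata with the image features before classification. The sketch below only illustrates that idea; the names, dimensions, and additive fusion are assumptions, not the released implementation:

```python
import torch
import torch.nn as nn

class MetaFusionHead(nn.Module):
    """Illustrative head: embed capture-time/location metadata and fuse it
    with pooled image features before the classifier (assumed design)."""

    def __init__(self, feat_dim: int, temporal_dim: int, spatial_dim: int, num_classes: int):
        super().__init__()
        self.meta_mlp = nn.Sequential(
            nn.Linear(temporal_dim + spatial_dim, feat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(feat_dim, feat_dim),
        )
        self.fc = nn.Linear(feat_dim, num_classes)

    def forward(self, image_feat, temporal_info, spatial_info):
        meta = self.meta_mlp(torch.cat([temporal_info, spatial_info], dim=1))
        return self.fc(image_feat + meta)  # simple additive fusion, illustrative only
```

Under this reading, a forward pass of the real model would look like `logits = model([img, temporal_info, spatial_info])`, matching `output = model(images)` in `validate`.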
#!/usr/bin/env bash
set -x
PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-12}
SRUN_ARGS=${SRUN_ARGS:-""}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
--job-name=${JOB_NAME} \
--gres=gpu:${GPUS_PER_NODE} \
--ntasks=${GPUS} \
--ntasks-per-node=${GPUS_PER_NODE} \
--cpus-per-task=${CPUS_PER_TASK} \
--kill-on-bad-exit=1 \
--quotatype=reserved \
${SRUN_ARGS} \
python -u main.py \
--cfg ${CONFIG} \
--accumulation-steps 1 \
--local-rank 0 \
--data-path data/inat2018 \
--output work_dirs ${@:4}
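A typical launch on a Slurm cluster would override the environment variables and pass the released config, for example `GPUS=32 GPUS_PER_NODE=8 bash <this script> <partition> inat18_ft configs/inaturalist2018/internimage_h_22ktoinat18_384.yaml`; anything after the config argument is forwarded to `main.py` via `${@:4}`.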