Commit 1d5a34cf authored by wanglch

Initial commit
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 128
DATASET: 'imagenetv2'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenetv2'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'cls_patch_concat'
TRAIN:
EMA:
ENABLE: False
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # for a total batch size of 512
WARMUP_LR: 0.0
MIN_LR: 0.0
LR_LAYER_DECAY: False
OPTIMIZER:
NAME: 'sgd'
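# A minimal sketch (hypothetical; assumes a yacs-style get_config helper as in
# Swin-derived repos) of how a YAML file like the one above is consumed:
#
#   from config import get_config          # hypothetical helper
#   config = get_config(args)              # merges this YAML into the defaults
#   assert config.MODEL.TYPE == 'intern_vit_6b'
#   config.defrost()
#   config.MODEL.NUM_CLASSES = 1000        # mutate only between defrost/freeze
#   config.freeze()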
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .build import build_loader, build_loader2
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import numpy as np
import torch
import torch.distributed as dist
from timm.data import Mixup, create_transform
from torchvision import transforms
from torchvision.datasets import ImageFolder
from .cached_image_folder import ImageCephDataset
from .samplers import NodeDistributedSampler, SubsetRandomSampler
try:
from torchvision.transforms import InterpolationMode
def _pil_interp(method):
if method == 'bicubic':
return InterpolationMode.BICUBIC
elif method == 'lanczos':
return InterpolationMode.LANCZOS
elif method == 'hamming':
return InterpolationMode.HAMMING
else:
return InterpolationMode.BILINEAR
except ImportError:
from timm.data.transforms import _pil_interp
class TTA(torch.nn.Module):
def __init__(self, size, scales=(1.0, 1.05, 1.1)):
super().__init__()
self.size = size
self.scales = scales
def forward(self, img):
out = []
cc = transforms.CenterCrop(self.size)
for scale in self.scales:
size_ = int(scale * self.size)
rs = transforms.Resize(size_, interpolation=_pil_interp('bicubic'))
img_ = rs(img)
img_ = cc(img_)
out.append(img_)
return out
def __repr__(self) -> str:
return f'{self.__class__.__name__}(size={self.size}, scale={self.scales})'
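# Usage sketch (hypothetical names): TTA(224) expands one PIL image into three
# center crops at scales [1.0, 1.05, 1.1]; predictions can then be averaged:
#
#   tta = TTA(size=224)
#   crops = tta(pil_img)                               # list of 3 PIL images
#   batch = torch.stack([to_tensor(c) for c in crops]) # to_tensor: e.g. transforms.ToTensor()
#   logits = model(batch).mean(dim=0)                  # average over scales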
def build_loader(config):
config.defrost()
dataset_train, config.MODEL.NUM_CLASSES = build_dataset('train', config=config)
config.freeze()
print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} '
'successfully built train dataset')
dataset_val, _ = build_dataset('val', config=config)
print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} '
'successfully built val dataset')
dataset_test, _ = build_dataset('test', config=config)
print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} '
'successfully built test dataset')
num_tasks = dist.get_world_size()
global_rank = dist.get_rank()
if dataset_train is not None:
if config.DATA.IMG_ON_MEMORY:
sampler_train = NodeDistributedSampler(dataset_train)
else:
if config.DATA.ZIP_MODE and config.DATA.CACHE_MODE == 'part':
indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size())
sampler_train = SubsetRandomSampler(indices)
else:
sampler_train = torch.utils.data.DistributedSampler(
dataset_train,
num_replicas=num_tasks,
rank=global_rank,
shuffle=True)
if dataset_val is not None:
if config.TEST.SEQUENTIAL:
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
else:
sampler_val = torch.utils.data.distributed.DistributedSampler(dataset_val, shuffle=False)
if dataset_test is not None:
if config.TEST.SEQUENTIAL:
sampler_test = torch.utils.data.SequentialSampler(dataset_test)
else:
sampler_test = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
data_loader_train = torch.utils.data.DataLoader(
dataset_train,
sampler=sampler_train,
batch_size=config.DATA.BATCH_SIZE,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=True,
persistent_workers=True) if dataset_train is not None else None
data_loader_val = torch.utils.data.DataLoader(
dataset_val,
sampler=sampler_val,
batch_size=config.DATA.BATCH_SIZE,
shuffle=False,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=False,
persistent_workers=True) if dataset_val is not None else None
data_loader_test = torch.utils.data.DataLoader(
dataset_test,
sampler=sampler_test,
batch_size=config.DATA.BATCH_SIZE,
shuffle=False,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=False,
persistent_workers=True) if dataset_test is not None else None
# setup mixup / cutmix
mixup_fn = None
mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
if mixup_active:
mixup_fn = Mixup(mixup_alpha=config.AUG.MIXUP,
cutmix_alpha=config.AUG.CUTMIX,
cutmix_minmax=config.AUG.CUTMIX_MINMAX,
prob=config.AUG.MIXUP_PROB,
switch_prob=config.AUG.MIXUP_SWITCH_PROB,
mode=config.AUG.MIXUP_MODE,
label_smoothing=config.MODEL.LABEL_SMOOTHING,
num_classes=config.MODEL.NUM_CLASSES)
return dataset_train, dataset_val, dataset_test, data_loader_train, \
data_loader_val, data_loader_test, mixup_fn
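# Typical call site (sketch; assumes torch.distributed has been initialized):
#
#   (dataset_train, dataset_val, dataset_test,
#    loader_train, loader_val, loader_test, mixup_fn) = build_loader(config)
#   for samples, targets in loader_train:
#       if mixup_fn is not None:
#           samples, targets = mixup_fn(samples, targets)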
def build_loader2(config):
config.defrost()
dataset_train, config.MODEL.NUM_CLASSES = build_dataset('train', config=config)
config.freeze()
dataset_val, _ = build_dataset('val', config=config)
dataset_test, _ = build_dataset('test', config=config)
data_loader_train = torch.utils.data.DataLoader(
dataset_train,
shuffle=True,
batch_size=config.DATA.BATCH_SIZE,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=True,
persistent_workers=True) if dataset_train is not None else None
data_loader_val = torch.utils.data.DataLoader(
dataset_val,
batch_size=config.DATA.BATCH_SIZE,
shuffle=False,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=False,
persistent_workers=True) if dataset_val is not None else None
data_loader_test = torch.utils.data.DataLoader(
dataset_test,
batch_size=config.DATA.BATCH_SIZE,
shuffle=False,
num_workers=config.DATA.NUM_WORKERS,
pin_memory=config.DATA.PIN_MEMORY,
drop_last=False,
persistent_workers=True) if dataset_test is not None else None
# setup mixup / cutmix
mixup_fn = None
mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
if mixup_active:
mixup_fn = Mixup(mixup_alpha=config.AUG.MIXUP,
cutmix_alpha=config.AUG.CUTMIX,
cutmix_minmax=config.AUG.CUTMIX_MINMAX,
prob=config.AUG.MIXUP_PROB,
switch_prob=config.AUG.MIXUP_SWITCH_PROB,
mode=config.AUG.MIXUP_MODE,
label_smoothing=config.MODEL.LABEL_SMOOTHING,
num_classes=config.MODEL.NUM_CLASSES)
return dataset_train, dataset_val, dataset_test, data_loader_train, \
data_loader_val, data_loader_test, mixup_fn
def build_dataset(split, config):
if config.DATA.TRANSFORM == 'build_transform':
transform = build_transform(split == 'train', config)
elif config.DATA.TRANSFORM == 'build_transform_for_linear_probe':
transform = build_transform_for_linear_probe(split == 'train', config)
else:
raise NotImplementedError
print(split, transform)
dataset = None
nb_classes = None
prefix = split
if config.DATA.DATASET == 'imagenet' or config.DATA.DATASET == 'imagenet-real':
if prefix == 'train' and not config.EVAL_MODE:
root = os.path.join(config.DATA.DATA_PATH, 'train')
dataset = ImageCephDataset(root, 'train',
transform=transform,
on_memory=config.DATA.IMG_ON_MEMORY)
elif prefix == 'val':
root = os.path.join(config.DATA.DATA_PATH, 'val')
dataset = ImageCephDataset(root, 'val', transform=transform)
nb_classes = 1000
elif config.DATA.DATASET == 'imagenet22K':
if prefix == 'train':
if not config.EVAL_MODE:
root = config.DATA.DATA_PATH
dataset = ImageCephDataset(root, 'train',
transform=transform,
on_memory=config.DATA.IMG_ON_MEMORY)
nb_classes = 21841
elif prefix == 'val':
root = os.path.join(config.DATA.DATA_PATH, 'val')
dataset = ImageCephDataset(root, 'val', transform=transform)
nb_classes = 1000
elif config.DATA.DATASET == 'imagenetv2':
from .imagenetv2 import ImageNetV2Dataset
if prefix == 'train' and not config.EVAL_MODE:
print(f'Only test split available for {config.DATA.DATASET}')
else:
dataset = ImageNetV2Dataset(variant='matched-frequency',
transform=transform,
location=config.DATA.DATA_PATH)
nb_classes = 1000
elif config.DATA.DATASET == 'imagenet_sketch':
if prefix == 'train' and not config.EVAL_MODE:
print(f'Only test split available for {config.DATA.DATASET}')
else:
dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
nb_classes = 1000
elif config.DATA.DATASET == 'imagenet_a':
if prefix == 'train' and not config.EVAL_MODE:
print(f'Only test split available for {config.DATA.DATASET}')
else:
dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
nb_classes = 1000 # actual number of classes is 200
elif config.DATA.DATASET == 'imagenet_r':
if prefix == 'train' and not config.EVAL_MODE:
print(f'Only test split available for {config.DATA.DATASET}')
else:
dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
nb_classes = 1000 # actual number of classes is 200
else:
raise NotImplementedError(
f'build_dataset does not support {config.DATA.DATASET}')
return dataset, nb_classes
def build_transform_for_linear_probe(is_train, config):
# linear probe: weak augmentation
if is_train:
transform = transforms.Compose([
transforms.RandomResizedCrop(
config.DATA.IMG_SIZE, interpolation=transforms.InterpolationMode.BICUBIC),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(mean=config.AUG.MEAN, std=config.AUG.STD)
])
else:
transform = transforms.Compose([
transforms.Resize(
config.DATA.IMG_SIZE, interpolation=transforms.InterpolationMode.BICUBIC),
transforms.CenterCrop(config.DATA.IMG_SIZE),
transforms.ToTensor(),
transforms.Normalize(mean=config.AUG.MEAN, std=config.AUG.STD)
])
return transform
def build_transform(is_train, config):
resize_im = config.DATA.IMG_SIZE > 32
if is_train:
# this should always dispatch to transforms_imagenet_train
transform = create_transform(
input_size=config.DATA.IMG_SIZE,
is_training=True,
color_jitter=config.AUG.COLOR_JITTER
if config.AUG.COLOR_JITTER > 0 else None,
auto_augment=config.AUG.AUTO_AUGMENT
if config.AUG.AUTO_AUGMENT != 'none' else None,
re_prob=config.AUG.REPROB,
re_mode=config.AUG.REMODE,
re_count=config.AUG.RECOUNT,
interpolation=config.DATA.INTERPOLATION,
)
if not resize_im:
# replace RandomResizedCropAndInterpolation with
# RandomCrop
transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4)
return transform
t = []
if resize_im:
if config.TEST.CROP:
size = int(1.0 * config.DATA.IMG_SIZE)
t.append(
transforms.Resize(size, interpolation=_pil_interp(config.DATA.INTERPOLATION)),
# to maintain same ratio w.r.t. 224 images
)
t.append(transforms.CenterCrop(config.DATA.IMG_SIZE))
elif config.AUG.RANDOM_RESIZED_CROP:
t.append(
transforms.RandomResizedCrop(
(config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
interpolation=_pil_interp(config.DATA.INTERPOLATION)))
else:
t.append(
transforms.Resize(
(config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
interpolation=_pil_interp(config.DATA.INTERPOLATION)))
t.append(transforms.ToTensor())
t.append(transforms.Normalize(config.AUG.MEAN, config.AUG.STD))
return transforms.Compose(t)
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import io
import json
import logging
import math
import os
import os.path as osp
import re
import time
from abc import abstractmethod
import mmcv
import torch
import torch.distributed as dist
import torch.utils.data as data
from mmcv.fileio import FileClient
from PIL import Image
from tqdm import tqdm, trange
from .zipreader import ZipReader, is_zip_path
_logger = logging.getLogger(__name__)
_ERROR_RETRY = 50
def has_file_allowed_extension(filename, extensions):
"""Checks if a file is an allowed extension.
Args:
filename (string): path to a file
Returns:
bool: True if the filename ends with a known image extension
"""
filename_lower = filename.lower()
return any(filename_lower.endswith(ext) for ext in extensions)
def find_classes(dir):
classes = [
d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))
]
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
return classes, class_to_idx
def make_dataset(dir, class_to_idx, extensions):
images = []
dir = os.path.expanduser(dir)
for target in sorted(os.listdir(dir)):
d = os.path.join(dir, target)
if not os.path.isdir(d):
continue
for root, _, fnames in sorted(os.walk(d)):
for fname in sorted(fnames):
if has_file_allowed_extension(fname, extensions):
path = os.path.join(root, fname)
item = (path, class_to_idx[target])
images.append(item)
return images
def make_dataset_with_ann(ann_file, img_prefix, extensions):
images = []
with open(ann_file, 'r') as f:
contents = f.readlines()
for line_str in contents:
path_contents = [c for c in line_str.split('\t')]
im_file_name = path_contents[0]
class_index = int(path_contents[1])
assert str.lower(os.path.splitext(im_file_name)[-1]) in extensions
item = (os.path.join(img_prefix, im_file_name), class_index)
images.append(item)
return images
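# The annotation file read above is expected to hold one tab-separated
# "<relative image path>\t<class index>" record per line, e.g. (hypothetical):
#
#   n01440764/ILSVRC2012_val_00000293.JPEG\t0
#   n01443537/ILSVRC2012_val_00000236.JPEG\t1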
class DatasetFolder(data.Dataset):
"""A generic data loader where the samples are arranged in this way: ::
root/class_x/xxx.ext
root/class_x/xxy.ext
root/class_x/xxz.ext
root/class_y/123.ext
root/class_y/nsdf3.ext
root/class_y/asd932_.ext
Args:
root (string): Root directory path.
loader (callable): A function to load a sample given its path.
extensions (list[string]): A list of allowed extensions.
transform (callable, optional): A function/transform that takes in
a sample and returns a transformed version.
E.g., ``transforms.RandomCrop`` for images.
target_transform (callable, optional): A function/transform that takes
in the target and transforms it.
Attributes:
samples (list): List of (sample path, class_index) tuples
"""
def __init__(self,
root,
loader,
extensions,
ann_file='',
img_prefix='',
transform=None,
target_transform=None,
cache_mode='no'):
# image folder mode
if ann_file == '':
_, class_to_idx = find_classes(root)
samples = make_dataset(root, class_to_idx, extensions)
# zip mode
else:
samples = make_dataset_with_ann(os.path.join(root, ann_file),
os.path.join(root, img_prefix),
extensions)
if len(samples) == 0:
raise RuntimeError('Found 0 files in subfolders of: ' + root +
'\nSupported extensions are: ' + ','.join(extensions))
self.root = root
self.loader = loader
self.extensions = extensions
self.samples = samples
self.labels = [y_1k for _, y_1k in samples]
self.classes = list(set(self.labels))
self.transform = transform
self.target_transform = target_transform
self.cache_mode = cache_mode
if self.cache_mode != 'no':
self.init_cache()
def init_cache(self):
assert self.cache_mode in ['part', 'full']
n_sample = len(self.samples)
global_rank = dist.get_rank()
world_size = dist.get_world_size()
samples_bytes = [None for _ in range(n_sample)]
start_time = time.time()
for index in range(n_sample):
if n_sample >= 10 and index % (n_sample // 10) == 0:
t = time.time() - start_time
print(
f'global_rank {dist.get_rank()} cached {index}/{n_sample} takes {t:.2f}s per block'
)
start_time = time.time()
path, target = self.samples[index]
if self.cache_mode == 'full':
samples_bytes[index] = (ZipReader.read(path), target)
elif self.cache_mode == 'part' and index % world_size == global_rank:
samples_bytes[index] = (ZipReader.read(path), target)
else:
samples_bytes[index] = (path, target)
self.samples = samples_bytes
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
tuple: (sample, target) where target is class_index of the target class.
"""
path, target = self.samples[index]
sample = self.loader(path)
if self.transform is not None:
sample = self.transform(sample)
if self.target_transform is not None:
target = self.target_transform(target)
return sample, target
def __len__(self):
return len(self.samples)
def __repr__(self):
fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
fmt_str += ' Number of datapoints: {}\n'.format(self.__len__())
fmt_str += ' Root Location: {}\n'.format(self.root)
tmp = ' Transforms (if any): '
fmt_str += '{0}{1}\n'.format(
tmp,
self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
tmp = ' Target Transforms (if any): '
fmt_str += '{0}{1}'.format(
tmp,
self.target_transform.__repr__().replace('\n',
'\n' + ' ' * len(tmp)))
return fmt_str
IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif']
def pil_loader(path):
# open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
if isinstance(path, bytes):
img = Image.open(io.BytesIO(path))
elif is_zip_path(path):
data = ZipReader.read(path)
img = Image.open(io.BytesIO(data))
else:
with open(path, 'rb') as f:
img = Image.open(f)
# convert while the file is still open: PIL loads lazily
return img.convert('RGB')
return img.convert('RGB')
def accimage_loader(path):
import accimage
try:
return accimage.Image(path)
except IOError:
# Potentially a decoding problem, fall back to PIL.Image
return pil_loader(path)
def default_img_loader(path):
from torchvision import get_image_backend
if get_image_backend() == 'accimage':
return accimage_loader(path)
else:
return pil_loader(path)
class CachedImageFolder(DatasetFolder):
"""A generic data loader where the images are arranged in this way: ::
root/dog/xxx.png
root/dog/xxy.png
root/dog/xxz.png
root/cat/123.png
root/cat/nsdf3.png
root/cat/asd932_.png
Args:
root (string): Root directory path.
transform (callable, optional): A function/transform that takes in a PIL image
and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
loader (callable, optional): A function to load an image given its path.
Attributes:
imgs (list): List of (image path, class_index) tuples
"""
def __init__(self,
root,
ann_file='',
img_prefix='',
transform=None,
target_transform=None,
loader=default_img_loader,
cache_mode='no'):
super(CachedImageFolder,
self).__init__(root,
loader,
IMG_EXTENSIONS,
ann_file=ann_file,
img_prefix=img_prefix,
transform=transform,
target_transform=target_transform,
cache_mode=cache_mode)
self.imgs = self.samples
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
tuple: (image, target) where target is class_index of the target class.
"""
path, target = self.samples[index]
image = self.loader(path)
if self.transform is not None:
img = self.transform(image)
else:
img = image
if self.target_transform is not None:
target = self.target_transform(target)
return img, target
class ImageCephDataset(data.Dataset):
def __init__(self,
root,
split,
parser=None,
transform=None,
target_transform=None,
on_memory=False):
# both ImageNet-1k and ImageNet-22k keep their annotation files under meta_data/
annotation_root = 'meta_data/'
if parser is None or isinstance(parser, str):
parser = ParserCephImage(root=root,
split=split,
annotation_root=annotation_root,
on_memory=on_memory)
self.parser = parser
self.transform = transform
self.target_transform = target_transform
self._consecutive_errors = 0
def __getitem__(self, index):
img, target = self.parser[index]
self._consecutive_errors = 0
if self.transform is not None:
img = self.transform(img)
if target is None:
target = -1
elif self.target_transform is not None:
target = self.target_transform(target)
return img, target
def __len__(self):
return len(self.parser)
def filename(self, index, basename=False, absolute=False):
return self.parser.filename(index, basename, absolute)
def filenames(self, basename=False, absolute=False):
return self.parser.filenames(basename, absolute)
class Parser:
def __init__(self):
pass
@abstractmethod
def _filename(self, index, basename=False, absolute=False):
pass
def filename(self, index, basename=False, absolute=False):
return self._filename(index, basename=basename, absolute=absolute)
def filenames(self, basename=False, absolute=False):
return [
self._filename(index, basename=basename, absolute=absolute)
for index in range(len(self))
]
class ParserCephImage(Parser):
def __init__(self,
root,
split,
annotation_root,
on_memory=False,
**kwargs):
super().__init__()
self.file_client = None
self.kwargs = kwargs
self.root = root # dataset:s3://imagenet22k
if '22k' in root:
self.io_backend = 'petrel'
with open(osp.join(annotation_root, '22k_class_to_idx.json'),
'r') as f:
self.class_to_idx = json.loads(f.read())
with open(osp.join(annotation_root, '22k_label.txt'), 'r') as f:
self.samples = f.read().splitlines()
else:
self.io_backend = 'disk'
self.class_to_idx = None
with open(osp.join(annotation_root, f'{split}.txt'), 'r') as f:
self.samples = f.read().splitlines()
local_rank = None
local_size = None
self._consecutive_errors = 0
self.on_memory = on_memory
if on_memory:
self.holder = {}
if local_rank is None:
local_rank = int(os.environ.get('LOCAL_RANK', 0))
if local_size is None:
local_size = int(os.environ.get('LOCAL_SIZE', 1))
self.local_rank = local_rank
self.local_size = local_size
self.rank = int(os.environ['RANK'])
self.world_size = int(os.environ['WORLD_SIZE'])
self.num_replicas = int(os.environ['WORLD_SIZE'])
self.num_parts = local_size
self.num_samples = int(
math.ceil(len(self.samples) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts
self.load_onto_memory_v2()
def load_onto_memory(self):
print('Loading images onto memory...', self.local_rank,
self.local_size)
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
for index in trange(len(self.samples)):
if index % self.local_size != self.local_rank:
continue
path, _ = self.samples[index].split(' ')
path = osp.join(self.root, path)
img_bytes = self.file_client.get(path)
self.holder[path] = img_bytes
print('Loading complete!')
def load_onto_memory_v2(self):
# print("Loading images onto memory...", self.local_rank, self.local_size)
t = torch.Generator()
t.manual_seed(0)
indices = torch.randperm(len(self.samples), generator=t).tolist()
# indices = range(len(self.samples))
indices = [i for i in indices if i % self.num_parts == self.local_rank]
# add extra samples to make it evenly divisible
indices += indices[:(self.total_size_parts - len(indices))]
assert len(indices) == self.total_size_parts
# subsample
indices = indices[self.rank // self.num_parts:self.
total_size_parts:self.num_replicas // self.num_parts]
assert len(indices) == self.num_samples
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
for index in tqdm(indices):
if index % self.local_size != self.local_rank:
continue
path, _ = self.samples[index].split(' ')
path = osp.join(self.root, path)
img_bytes = self.file_client.get(path)
self.holder[path] = img_bytes
print('Loading complete!')
def __getitem__(self, index):
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
filepath, target = self.samples[index].split(' ')
filepath = osp.join(self.root, filepath)
try:
if self.on_memory:
img_bytes = self.holder[filepath]
else:
img_bytes = self.file_client.get(filepath)
img = mmcv.imfrombytes(img_bytes)[:, :, ::-1]
except Exception as e:
_logger.warning(
f'Skipped sample (index {index}, file {filepath}). {str(e)}')
self._consecutive_errors += 1
if self._consecutive_errors < _ERROR_RETRY:
return self.__getitem__((index + 1) % len(self))
else:
raise e
self._consecutive_errors = 0
img = Image.fromarray(img)
try:
if self.class_to_idx is not None:
target = self.class_to_idx[target]
else:
target = int(target)
except (KeyError, ValueError):
print(filepath, target)
exit()
return img, target
def __len__(self):
return len(self.samples)
def _filename(self, index, basename=False, absolute=False):
filename, _ = self.samples[index].split(' ')
filename = osp.join(self.root, filename)
return filename
def get_temporal_info(date, miss_hour=False):
try:
if date:
if miss_hour:
pattern = re.compile(r'(\d*)-(\d*)-(\d*)', re.I)
else:
pattern = re.compile(r'(\d*)-(\d*)-(\d*) (\d*):(\d*):(\d*)',
re.I)
m = pattern.match(date.strip())
if m:
year = int(m.group(1))
month = int(m.group(2))
day = int(m.group(3))
x_month = math.sin(2 * math.pi * month / 12)
y_month = math.cos(2 * math.pi * month / 12)
if miss_hour:
x_hour = 0
y_hour = 0
else:
hour = int(m.group(4))
x_hour = math.sin(2 * math.pi * hour / 24)
y_hour = math.cos(2 * math.pi * hour / 24)
return [x_month, y_month, x_hour, y_hour]
else:
return [0, 0, 0, 0]
else:
return [0, 0, 0, 0]
except Exception:
return [0, 0, 0, 0]
def get_spatial_info(latitude, longitude):
if latitude and longitude:
latitude = math.radians(latitude)
longitude = math.radians(longitude)
x = math.cos(latitude) * math.cos(longitude)
y = math.cos(latitude) * math.sin(longitude)
z = math.sin(latitude)
return [x, y, z]
else:
return [0, 0, 0]
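# Worked example: get_spatial_info maps (lat, lon) in degrees onto the unit
# sphere, so e.g. get_spatial_info(48.8584, 2.2945) ≈ [0.657, 0.026, 0.753],
# and get_temporal_info encodes month/hour as sin/cos pairs so that December
# and January end up adjacent.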
"""Code from https://github.com/baaivision/EVA/blob/master/EVA-02/asuka/imagenet_a_r_indices.py
Thanks to the authors of EVA."""
all_wnids = [
'n01440764', 'n01443537', 'n01484850', 'n01491361', 'n01494475',
'n01496331', 'n01498041', 'n01514668', 'n01514859', 'n01518878',
'n01530575', 'n01531178', 'n01532829', 'n01534433', 'n01537544',
'n01558993', 'n01560419', 'n01580077', 'n01582220', 'n01592084',
'n01601694', 'n01608432', 'n01614925', 'n01616318', 'n01622779',
'n01629819', 'n01630670', 'n01631663', 'n01632458', 'n01632777',
'n01641577', 'n01644373', 'n01644900', 'n01664065', 'n01665541',
'n01667114', 'n01667778', 'n01669191', 'n01675722', 'n01677366',
'n01682714', 'n01685808', 'n01687978', 'n01688243', 'n01689811',
'n01692333', 'n01693334', 'n01694178', 'n01695060', 'n01697457',
'n01698640', 'n01704323', 'n01728572', 'n01728920', 'n01729322',
'n01729977', 'n01734418', 'n01735189', 'n01737021', 'n01739381',
'n01740131', 'n01742172', 'n01744401', 'n01748264', 'n01749939',
'n01751748', 'n01753488', 'n01755581', 'n01756291', 'n01768244',
'n01770081', 'n01770393', 'n01773157', 'n01773549', 'n01773797',
'n01774384', 'n01774750', 'n01775062', 'n01776313', 'n01784675',
'n01795545', 'n01796340', 'n01797886', 'n01798484', 'n01806143',
'n01806567', 'n01807496', 'n01817953', 'n01818515', 'n01819313',
'n01820546', 'n01824575', 'n01828970', 'n01829413', 'n01833805',
'n01843065', 'n01843383', 'n01847000', 'n01855032', 'n01855672',
'n01860187', 'n01871265', 'n01872401', 'n01873310', 'n01877812',
'n01882714', 'n01883070', 'n01910747', 'n01914609', 'n01917289',
'n01924916', 'n01930112', 'n01943899', 'n01944390', 'n01945685',
'n01950731', 'n01955084', 'n01968897', 'n01978287', 'n01978455',
'n01980166', 'n01981276', 'n01983481', 'n01984695', 'n01985128',
'n01986214', 'n01990800', 'n02002556', 'n02002724', 'n02006656',
'n02007558', 'n02009229', 'n02009912', 'n02011460', 'n02012849',
'n02013706', 'n02017213', 'n02018207', 'n02018795', 'n02025239',
'n02027492', 'n02028035', 'n02033041', 'n02037110', 'n02051845',
'n02056570', 'n02058221', 'n02066245', 'n02071294', 'n02074367',
'n02077923', 'n02085620', 'n02085782', 'n02085936', 'n02086079',
'n02086240', 'n02086646', 'n02086910', 'n02087046', 'n02087394',
'n02088094', 'n02088238', 'n02088364', 'n02088466', 'n02088632',
'n02089078', 'n02089867', 'n02089973', 'n02090379', 'n02090622',
'n02090721', 'n02091032', 'n02091134', 'n02091244', 'n02091467',
'n02091635', 'n02091831', 'n02092002', 'n02092339', 'n02093256',
'n02093428', 'n02093647', 'n02093754', 'n02093859', 'n02093991',
'n02094114', 'n02094258', 'n02094433', 'n02095314', 'n02095570',
'n02095889', 'n02096051', 'n02096177', 'n02096294', 'n02096437',
'n02096585', 'n02097047', 'n02097130', 'n02097209', 'n02097298',
'n02097474', 'n02097658', 'n02098105', 'n02098286', 'n02098413',
'n02099267', 'n02099429', 'n02099601', 'n02099712', 'n02099849',
'n02100236', 'n02100583', 'n02100735', 'n02100877', 'n02101006',
'n02101388', 'n02101556', 'n02102040', 'n02102177', 'n02102318',
'n02102480', 'n02102973', 'n02104029', 'n02104365', 'n02105056',
'n02105162', 'n02105251', 'n02105412', 'n02105505', 'n02105641',
'n02105855', 'n02106030', 'n02106166', 'n02106382', 'n02106550',
'n02106662', 'n02107142', 'n02107312', 'n02107574', 'n02107683',
'n02107908', 'n02108000', 'n02108089', 'n02108422', 'n02108551',
'n02108915', 'n02109047', 'n02109525', 'n02109961', 'n02110063',
'n02110185', 'n02110341', 'n02110627', 'n02110806', 'n02110958',
'n02111129', 'n02111277', 'n02111500', 'n02111889', 'n02112018',
'n02112137', 'n02112350', 'n02112706', 'n02113023', 'n02113186',
'n02113624', 'n02113712', 'n02113799', 'n02113978', 'n02114367',
'n02114548', 'n02114712', 'n02114855', 'n02115641', 'n02115913',
'n02116738', 'n02117135', 'n02119022', 'n02119789', 'n02120079',
'n02120505', 'n02123045', 'n02123159', 'n02123394', 'n02123597',
'n02124075', 'n02125311', 'n02127052', 'n02128385', 'n02128757',
'n02128925', 'n02129165', 'n02129604', 'n02130308', 'n02132136',
'n02133161', 'n02134084', 'n02134418', 'n02137549', 'n02138441',
'n02165105', 'n02165456', 'n02167151', 'n02168699', 'n02169497',
'n02172182', 'n02174001', 'n02177972', 'n02190166', 'n02206856',
'n02219486', 'n02226429', 'n02229544', 'n02231487', 'n02233338',
'n02236044', 'n02256656', 'n02259212', 'n02264363', 'n02268443',
'n02268853', 'n02276258', 'n02277742', 'n02279972', 'n02280649',
'n02281406', 'n02281787', 'n02317335', 'n02319095', 'n02321529',
'n02325366', 'n02326432', 'n02328150', 'n02342885', 'n02346627',
'n02356798', 'n02361337', 'n02363005', 'n02364673', 'n02389026',
'n02391049', 'n02395406', 'n02396427', 'n02397096', 'n02398521',
'n02403003', 'n02408429', 'n02410509', 'n02412080', 'n02415577',
'n02417914', 'n02422106', 'n02422699', 'n02423022', 'n02437312',
'n02437616', 'n02441942', 'n02442845', 'n02443114', 'n02443484',
'n02444819', 'n02445715', 'n02447366', 'n02454379', 'n02457408',
'n02480495', 'n02480855', 'n02481823', 'n02483362', 'n02483708',
'n02484975', 'n02486261', 'n02486410', 'n02487347', 'n02488291',
'n02488702', 'n02489166', 'n02490219', 'n02492035', 'n02492660',
'n02493509', 'n02493793', 'n02494079', 'n02497673', 'n02500267',
'n02504013', 'n02504458', 'n02509815', 'n02510455', 'n02514041',
'n02526121', 'n02536864', 'n02606052', 'n02607072', 'n02640242',
'n02641379', 'n02643566', 'n02655020', 'n02666196', 'n02667093',
'n02669723', 'n02672831', 'n02676566', 'n02687172', 'n02690373',
'n02692877', 'n02699494', 'n02701002', 'n02704792', 'n02708093',
'n02727426', 'n02730930', 'n02747177', 'n02749479', 'n02769748',
'n02776631', 'n02777292', 'n02782093', 'n02783161', 'n02786058',
'n02787622', 'n02788148', 'n02790996', 'n02791124', 'n02791270',
'n02793495', 'n02794156', 'n02795169', 'n02797295', 'n02799071',
'n02802426', 'n02804414', 'n02804610', 'n02807133', 'n02808304',
'n02808440', 'n02814533', 'n02814860', 'n02815834', 'n02817516',
'n02823428', 'n02823750', 'n02825657', 'n02834397', 'n02835271',
'n02837789', 'n02840245', 'n02841315', 'n02843684', 'n02859443',
'n02860847', 'n02865351', 'n02869837', 'n02870880', 'n02871525',
'n02877765', 'n02879718', 'n02883205', 'n02892201', 'n02892767',
'n02894605', 'n02895154', 'n02906734', 'n02909870', 'n02910353',
'n02916936', 'n02917067', 'n02927161', 'n02930766', 'n02939185',
'n02948072', 'n02950826', 'n02951358', 'n02951585', 'n02963159',
'n02965783', 'n02966193', 'n02966687', 'n02971356', 'n02974003',
'n02977058', 'n02978881', 'n02979186', 'n02980441', 'n02981792',
'n02988304', 'n02992211', 'n02992529', 'n02999410', 'n03000134',
'n03000247', 'n03000684', 'n03014705', 'n03016953', 'n03017168',
'n03018349', 'n03026506', 'n03028079', 'n03032252', 'n03041632',
'n03042490', 'n03045698', 'n03047690', 'n03062245', 'n03063599',
'n03063689', 'n03065424', 'n03075370', 'n03085013', 'n03089624',
'n03095699', 'n03100240', 'n03109150', 'n03110669', 'n03124043',
'n03124170', 'n03125729', 'n03126707', 'n03127747', 'n03127925',
'n03131574', 'n03133878', 'n03134739', 'n03141823', 'n03146219',
'n03160309', 'n03179701', 'n03180011', 'n03187595', 'n03188531',
'n03196217', 'n03197337', 'n03201208', 'n03207743', 'n03207941',
'n03208938', 'n03216828', 'n03218198', 'n03220513', 'n03223299',
'n03240683', 'n03249569', 'n03250847', 'n03255030', 'n03259280',
'n03271574', 'n03272010', 'n03272562', 'n03290653', 'n03291819',
'n03297495', 'n03314780', 'n03325584', 'n03337140', 'n03344393',
'n03345487', 'n03347037', 'n03355925', 'n03372029', 'n03376595',
'n03379051', 'n03384352', 'n03388043', 'n03388183', 'n03388549',
'n03393912', 'n03394916', 'n03400231', 'n03404251', 'n03417042',
'n03424325', 'n03425413', 'n03443371', 'n03444034', 'n03445777',
'n03445924', 'n03447447', 'n03447721', 'n03450230', 'n03452741',
'n03457902', 'n03459775', 'n03461385', 'n03467068', 'n03476684',
'n03476991', 'n03478589', 'n03481172', 'n03482405', 'n03483316',
'n03485407', 'n03485794', 'n03492542', 'n03494278', 'n03495258',
'n03496892', 'n03498962', 'n03527444', 'n03529860', 'n03530642',
'n03532672', 'n03534580', 'n03535780', 'n03538406', 'n03544143',
'n03584254', 'n03584829', 'n03590841', 'n03594734', 'n03594945',
'n03595614', 'n03598930', 'n03599486', 'n03602883', 'n03617480',
'n03623198', 'n03627232', 'n03630383', 'n03633091', 'n03637318',
'n03642806', 'n03649909', 'n03657121', 'n03658185', 'n03661043',
'n03662601', 'n03666591', 'n03670208', 'n03673027', 'n03676483',
'n03680355', 'n03690938', 'n03691459', 'n03692522', 'n03697007',
'n03706229', 'n03709823', 'n03710193', 'n03710637', 'n03710721',
'n03717622', 'n03720891', 'n03721384', 'n03724870', 'n03729826',
'n03733131', 'n03733281', 'n03733805', 'n03742115', 'n03743016',
'n03759954', 'n03761084', 'n03763968', 'n03764736', 'n03769881',
'n03770439', 'n03770679', 'n03773504', 'n03775071', 'n03775546',
'n03776460', 'n03777568', 'n03777754', 'n03781244', 'n03782006',
'n03785016', 'n03786901', 'n03787032', 'n03788195', 'n03788365',
'n03791053', 'n03792782', 'n03792972', 'n03793489', 'n03794056',
'n03796401', 'n03803284', 'n03804744', 'n03814639', 'n03814906',
'n03825788', 'n03832673', 'n03837869', 'n03838899', 'n03840681',
'n03841143', 'n03843555', 'n03854065', 'n03857828', 'n03866082',
'n03868242', 'n03868863', 'n03871628', 'n03873416', 'n03874293',
'n03874599', 'n03876231', 'n03877472', 'n03877845', 'n03884397',
'n03887697', 'n03888257', 'n03888605', 'n03891251', 'n03891332',
'n03895866', 'n03899768', 'n03902125', 'n03903868', 'n03908618',
'n03908714', 'n03916031', 'n03920288', 'n03924679', 'n03929660',
'n03929855', 'n03930313', 'n03930630', 'n03933933', 'n03935335',
'n03937543', 'n03938244', 'n03942813', 'n03944341', 'n03947888',
'n03950228', 'n03954731', 'n03956157', 'n03958227', 'n03961711',
'n03967562', 'n03970156', 'n03976467', 'n03976657', 'n03977966',
'n03980874', 'n03982430', 'n03983396', 'n03991062', 'n03992509',
'n03995372', 'n03998194', 'n04004767', 'n04005630', 'n04008634',
'n04009552', 'n04019541', 'n04023962', 'n04026417', 'n04033901',
'n04033995', 'n04037443', 'n04039381', 'n04040759', 'n04041544',
'n04044716', 'n04049303', 'n04065272', 'n04067472', 'n04069434',
'n04070727', 'n04074963', 'n04081281', 'n04086273', 'n04090263',
'n04099969', 'n04111531', 'n04116512', 'n04118538', 'n04118776',
'n04120489', 'n04125021', 'n04127249', 'n04131690', 'n04133789',
'n04136333', 'n04141076', 'n04141327', 'n04141975', 'n04146614',
'n04147183', 'n04149813', 'n04152593', 'n04153751', 'n04154565',
'n04162706', 'n04179913', 'n04192698', 'n04200800', 'n04201297',
'n04204238', 'n04204347', 'n04208210', 'n04209133', 'n04209239',
'n04228054', 'n04229816', 'n04235860', 'n04238763', 'n04239074',
'n04243546', 'n04251144', 'n04252077', 'n04252225', 'n04254120',
'n04254680', 'n04254777', 'n04258138', 'n04259630', 'n04263257',
'n04264628', 'n04265275', 'n04266014', 'n04270147', 'n04273569',
'n04275548', 'n04277352', 'n04285008', 'n04286575', 'n04296562',
'n04310018', 'n04311004', 'n04311174', 'n04317175', 'n04325704',
'n04326547', 'n04328186', 'n04330267', 'n04332243', 'n04335435',
'n04336792', 'n04344873', 'n04346328', 'n04347754', 'n04350905',
'n04355338', 'n04355933', 'n04356056', 'n04357314', 'n04366367',
'n04367480', 'n04370456', 'n04371430', 'n04371774', 'n04372370',
'n04376876', 'n04380533', 'n04389033', 'n04392985', 'n04398044',
'n04399382', 'n04404412', 'n04409515', 'n04417672', 'n04418357',
'n04423845', 'n04428191', 'n04429376', 'n04435653', 'n04442312',
'n04443257', 'n04447861', 'n04456115', 'n04458633', 'n04461696',
'n04462240', 'n04465501', 'n04467665', 'n04476259', 'n04479046',
'n04482393', 'n04483307', 'n04485082', 'n04486054', 'n04487081',
'n04487394', 'n04493381', 'n04501370', 'n04505470', 'n04507155',
'n04509417', 'n04515003', 'n04517823', 'n04522168', 'n04523525',
'n04525038', 'n04525305', 'n04532106', 'n04532670', 'n04536866',
'n04540053', 'n04542943', 'n04548280', 'n04548362', 'n04550184',
'n04552348', 'n04553703', 'n04554684', 'n04557648', 'n04560804',
'n04562935', 'n04579145', 'n04579432', 'n04584207', 'n04589890',
'n04590129', 'n04591157', 'n04591713', 'n04592741', 'n04596742',
'n04597913', 'n04599235', 'n04604644', 'n04606251', 'n04612504',
'n04613696', 'n06359193', 'n06596364', 'n06785654', 'n06794110',
'n06874185', 'n07248320', 'n07565083', 'n07579787', 'n07583066',
'n07584110', 'n07590611', 'n07613480', 'n07614500', 'n07615774',
'n07684084', 'n07693725', 'n07695742', 'n07697313', 'n07697537',
'n07711569', 'n07714571', 'n07714990', 'n07715103', 'n07716358',
'n07716906', 'n07717410', 'n07717556', 'n07718472', 'n07718747',
'n07720875', 'n07730033', 'n07734744', 'n07742313', 'n07745940',
'n07747607', 'n07749582', 'n07753113', 'n07753275', 'n07753592',
'n07754684', 'n07760859', 'n07768694', 'n07802026', 'n07831146',
'n07836838', 'n07860988', 'n07871810', 'n07873807', 'n07875152',
'n07880968', 'n07892512', 'n07920052', 'n07930864', 'n07932039',
'n09193705', 'n09229709', 'n09246464', 'n09256479', 'n09288635',
'n09332890', 'n09399592', 'n09421951', 'n09428293', 'n09468604',
'n09472597', 'n09835506', 'n10148035', 'n10565667', 'n11879895',
'n11939491', 'n12057211', 'n12144580', 'n12267677', 'n12620546',
'n12768682', 'n12985857', 'n12998815', 'n13037406', 'n13040303',
'n13044778', 'n13052670', 'n13054560', 'n13133613', 'n15075141'
]
imagenet_a_wnids = [
'n01498041', 'n01531178', 'n01534433', 'n01558993', 'n01580077',
'n01614925', 'n01616318', 'n01631663', 'n01641577', 'n01669191',
'n01677366', 'n01687978', 'n01694178', 'n01698640', 'n01735189',
'n01770081', 'n01770393', 'n01774750', 'n01784675', 'n01819313',
'n01820546', 'n01833805', 'n01843383', 'n01847000', 'n01855672',
'n01882714', 'n01910747', 'n01914609', 'n01924916', 'n01944390',
'n01985128', 'n01986214', 'n02007558', 'n02009912', 'n02037110',
'n02051845', 'n02077923', 'n02085620', 'n02099601', 'n02106550',
'n02106662', 'n02110958', 'n02119022', 'n02123394', 'n02127052',
'n02129165', 'n02133161', 'n02137549', 'n02165456', 'n02174001',
'n02177972', 'n02190166', 'n02206856', 'n02219486', 'n02226429',
'n02231487', 'n02233338', 'n02236044', 'n02259212', 'n02268443',
'n02279972', 'n02280649', 'n02281787', 'n02317335', 'n02325366',
'n02346627', 'n02356798', 'n02361337', 'n02410509', 'n02445715',
'n02454379', 'n02486410', 'n02492035', 'n02504458', 'n02655020',
'n02669723', 'n02672831', 'n02676566', 'n02690373', 'n02701002',
'n02730930', 'n02777292', 'n02782093', 'n02787622', 'n02793495',
'n02797295', 'n02802426', 'n02814860', 'n02815834', 'n02837789',
'n02879718', 'n02883205', 'n02895154', 'n02906734', 'n02948072',
'n02951358', 'n02980441', 'n02992211', 'n02999410', 'n03014705',
'n03026506', 'n03124043', 'n03125729', 'n03187595', 'n03196217',
'n03223299', 'n03250847', 'n03255030', 'n03291819', 'n03325584',
'n03355925', 'n03384352', 'n03388043', 'n03417042', 'n03443371',
'n03444034', 'n03445924', 'n03452741', 'n03483316', 'n03584829',
'n03590841', 'n03594945', 'n03617480', 'n03666591', 'n03670208',
'n03717622', 'n03720891', 'n03721384', 'n03724870', 'n03775071',
'n03788195', 'n03804744', 'n03837869', 'n03840681', 'n03854065',
'n03888257', 'n03891332', 'n03935335', 'n03982430', 'n04019541',
'n04033901', 'n04039381', 'n04067472', 'n04086273', 'n04099969',
'n04118538', 'n04131690', 'n04133789', 'n04141076', 'n04146614',
'n04147183', 'n04179913', 'n04208210', 'n04235860', 'n04252077',
'n04252225', 'n04254120', 'n04270147', 'n04275548', 'n04310018',
'n04317175', 'n04344873', 'n04347754', 'n04355338', 'n04366367',
'n04376876', 'n04389033', 'n04399382', 'n04442312', 'n04456115',
'n04482393', 'n04507155', 'n04509417', 'n04532670', 'n04540053',
'n04554684', 'n04562935', 'n04591713', 'n04606251', 'n07583066',
'n07695742', 'n07697313', 'n07697537', 'n07714990', 'n07718472',
'n07720875', 'n07734744', 'n07749582', 'n07753592', 'n07760859',
'n07768694', 'n07831146', 'n09229709', 'n09246464', 'n09472597',
'n09835506', 'n11879895', 'n12057211', 'n12144580', 'n12267677'
]
imagenet_a_mask = [wnid in set(imagenet_a_wnids) for wnid in all_wnids]
imagenet_r_wnids = {
'n01443537', 'n01484850', 'n01494475', 'n01498041', 'n01514859',
'n01518878', 'n01531178', 'n01534433', 'n01614925', 'n01616318',
'n01630670', 'n01632777', 'n01644373', 'n01677366', 'n01694178',
'n01748264', 'n01770393', 'n01774750', 'n01784675', 'n01806143',
'n01820546', 'n01833805', 'n01843383', 'n01847000', 'n01855672',
'n01860187', 'n01882714', 'n01910747', 'n01944390', 'n01983481',
'n01986214', 'n02007558', 'n02009912', 'n02051845', 'n02056570',
'n02066245', 'n02071294', 'n02077923', 'n02085620', 'n02086240',
'n02088094', 'n02088238', 'n02088364', 'n02088466', 'n02091032',
'n02091134', 'n02092339', 'n02094433', 'n02096585', 'n02097298',
'n02098286', 'n02099601', 'n02099712', 'n02102318', 'n02106030',
'n02106166', 'n02106550', 'n02106662', 'n02108089', 'n02108915',
'n02109525', 'n02110185', 'n02110341', 'n02110958', 'n02112018',
'n02112137', 'n02113023', 'n02113624', 'n02113799', 'n02114367',
'n02117135', 'n02119022', 'n02123045', 'n02128385', 'n02128757',
'n02129165', 'n02129604', 'n02130308', 'n02134084', 'n02138441',
'n02165456', 'n02190166', 'n02206856', 'n02219486', 'n02226429',
'n02233338', 'n02236044', 'n02268443', 'n02279972', 'n02317335',
'n02325366', 'n02346627', 'n02356798', 'n02363005', 'n02364673',
'n02391049', 'n02395406', 'n02398521', 'n02410509', 'n02423022',
'n02437616', 'n02445715', 'n02447366', 'n02480495', 'n02480855',
'n02481823', 'n02483362', 'n02486410', 'n02510455', 'n02526121',
'n02607072', 'n02655020', 'n02672831', 'n02701002', 'n02749479',
'n02769748', 'n02793495', 'n02797295', 'n02802426', 'n02808440',
'n02814860', 'n02823750', 'n02841315', 'n02843684', 'n02883205',
'n02906734', 'n02909870', 'n02939185', 'n02948072', 'n02950826',
'n02951358', 'n02966193', 'n02980441', 'n02992529', 'n03124170',
'n03272010', 'n03345487', 'n03372029', 'n03424325', 'n03452741',
'n03467068', 'n03481172', 'n03494278', 'n03495258', 'n03498962',
'n03594945', 'n03602883', 'n03630383', 'n03649909', 'n03676483',
'n03710193', 'n03773504', 'n03775071', 'n03888257', 'n03930630',
'n03947888', 'n04086273', 'n04118538', 'n04133789', 'n04141076',
'n04146614', 'n04147183', 'n04192698', 'n04254680', 'n04266014',
'n04275548', 'n04310018', 'n04325704', 'n04347754', 'n04389033',
'n04409515', 'n04465501', 'n04487394', 'n04522168', 'n04536866',
'n04552348', 'n04591713', 'n07614500', 'n07693725', 'n07695742',
'n07697313', 'n07697537', 'n07714571', 'n07714990', 'n07718472',
'n07720875', 'n07734744', 'n07742313', 'n07745940', 'n07749582',
'n07753275', 'n07753592', 'n07768694', 'n07873807', 'n07880968',
'n07920052', 'n09472597', 'n09835506', 'n10565667', 'n12267677'
}
imagenet_r_mask = [wnid in imagenet_r_wnids for wnid in all_wnids]
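# Usage sketch: the boolean masks select the 200 ImageNet-A/R classes out of
# 1000-way logits before scoring, e.g. (hypothetical):
#
#   logits_r = logits[:, imagenet_r_mask]   # [N, 1000] -> [N, 200]
#   acc = (logits_r.argmax(dim=-1) == targets).float().mean()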
# --------------------------------------------------------
# EVA: Exploring the Limits of Masked Visual Representation Learning at Scale (https://arxiv.org/abs/2211.07636)
# Github source: https://github.com/baaivision/EVA
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Yuxin Fang
# Based on timm, DINO, DeiT and BEiT codebases
# https://github.com/rwightman/pytorch-image-models/tree/master/timm
# https://github.com/facebookresearch/deit
# https://github.com/facebookresearch/dino
# https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------
import json
import os
import numpy as np
class RealLabelsImagenet:
def __init__(self, filenames, real_json='real.json', topk=(1, 5)):
with open(real_json) as real_labels:
real_labels = json.load(real_labels)
real_labels = {f'ILSVRC2012_val_{i + 1:08d}.JPEG': labels for i, labels in enumerate(real_labels)}
self.real_labels = real_labels
self.filenames = filenames
assert len(self.filenames) == len(self.real_labels)
self.topk = topk
self.is_correct = {k: [] for k in topk}
self.sample_idx = 0
def add_result(self, output):
maxk = max(self.topk)
_, pred_batch = output.topk(maxk, 1, True, True)
pred_batch = pred_batch.cpu().numpy()
for pred in pred_batch:
filename = self.filenames[self.sample_idx]
filename = os.path.basename(filename)
if self.real_labels[filename]:
for k in self.topk:
self.is_correct[k].append(
any([p in self.real_labels[filename] for p in pred[:k]]))
self.sample_idx += 1
def get_accuracy(self, k=None):
if k is None:
return {k: float(np.mean(self.is_correct[k])) * 100 for k in self.topk}
else:
return float(np.mean(self.is_correct[k])) * 100
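# Evaluation sketch (assumes real.json from the "Are we done with ImageNet?"
# release and that `filenames` follows the ImageNet val naming scheme):
#
#   real_labels = RealLabelsImagenet(filenames, real_json='real.json')
#   for images, _ in loader:
#       real_labels.add_result(model(images))
#   print(real_labels.get_accuracy(k=1))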
"""Code from https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/imagenetv2.py
Thanks to the authors of wise-ft."""
import pathlib
import shutil
import tarfile
import requests
from PIL import Image
from torch.utils.data import Dataset
from tqdm import tqdm
URLS = {'matched-frequency': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-matched-frequency.tar.gz',
'threshold-0.7': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-threshold0.7.tar.gz',
'top-images': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-top-images.tar.gz',
'val': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenet_validation.tar.gz'}
FNAMES = {'matched-frequency': 'imagenetv2-matched-frequency-format-val',
'threshold-0.7': 'imagenetv2-threshold0.7-format-val',
'top-images': 'imagenetv2-top-images-format-val',
'val': 'imagenet_validation'}
V2_DATASET_SIZE = 10000
VAL_DATASET_SIZE = 50000
class ImageNetV2Dataset(Dataset):
def __init__(self, variant='matched-frequency', transform=None, location='.'):
self.dataset_root = pathlib.Path(f'{location}/ImageNetV2-{variant}/')
self.tar_root = pathlib.Path(f'{location}/ImageNetV2-{variant}.tar.gz')
self.fnames = list(self.dataset_root.glob('**/*.jpeg'))
self.transform = transform
assert variant in URLS, f'unknown V2 Variant: {variant}'
if not self.dataset_root.exists() or len(self.fnames) != V2_DATASET_SIZE:
if not self.tar_root.exists():
print(f'Dataset {variant} not found on disk, downloading....')
response = requests.get(URLS[variant], stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(self.tar_root, 'wb') as f:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
f.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
raise RuntimeError(f'Downloading from {URLS[variant]} failed')
print('Extracting....')
tarfile.open(self.tar_root).extractall(f'{location}')
shutil.move(f'{location}/{FNAMES[variant]}', self.dataset_root)
self.fnames = list(self.dataset_root.glob('**/*.jpeg'))
def __len__(self):
return len(self.fnames)
def __getitem__(self, i):
img, label = Image.open(self.fnames[i]), int(self.fnames[i].parent.name)
if self.transform is not None:
img = self.transform(img)
return img, label
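# Usage sketch (downloads a tarball of roughly 1.2 GB on first use if the
# 10,000 images are not already under `location`; `val_tf` is a hypothetical
# evaluation transform):
#
#   ds = ImageNetV2Dataset(variant='matched-frequency', transform=val_tf,
#                          location='./data/imagenetv2')
#   img, label = ds[0]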
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import math
import os
import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
class SubsetRandomSampler(torch.utils.data.Sampler):
"""Samples elements randomly from a given list of indices, without
replacement.
Arguments:
indices (sequence): a sequence of indices
"""
def __init__(self, indices):
self.epoch = 0
self.indices = indices
def __iter__(self):
return (self.indices[i] for i in torch.randperm(len(self.indices)))
def __len__(self):
return len(self.indices)
def set_epoch(self, epoch):
self.epoch = epoch
class NodeDistributedSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
"""
def __init__(self,
dataset,
num_replicas=None,
rank=None,
local_rank=None,
local_size=None):
if num_replicas is None:
if not dist.is_available():
raise RuntimeError(
'Requires distributed package to be available')
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError(
'Requires distributed package to be available')
rank = dist.get_rank()
if local_rank is None:
local_rank = int(os.environ.get('LOCAL_RANK', 0))
if local_size is None:
local_size = int(os.environ.get('LOCAL_SIZE', 1))
self.dataset = dataset
self.num_replicas = num_replicas
self.num_parts = local_size
self.rank = rank
self.local_rank = local_rank
self.epoch = 0
self.num_samples = int(
math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
t = torch.Generator()
t.manual_seed(0)
indices = torch.randperm(len(self.dataset), generator=t).tolist()
# indices = range(len(self.dataset))
indices = [i for i in indices if i % self.num_parts == self.local_rank]
# add extra samples to make it evenly divisible
indices += indices[:(self.total_size_parts - len(indices))]
assert len(indices) == self.total_size_parts
# subsample
indices = indices[self.rank // self.num_parts:self.
total_size_parts:self.num_replicas // self.num_parts]
index = torch.randperm(len(indices), generator=g).tolist()
indices = list(np.array(indices)[index])
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
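# Sketch of the partitioning above: with LOCAL_SIZE local processes per node,
# index i is first assigned to local rank i % LOCAL_SIZE, then strided across
# nodes, so each image cached in memory (e.g. by ParserCephImage) is held by
# exactly one process per node while every replica still draws num_samples
# indices per epoch.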
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import io
import os
import zipfile
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
def is_zip_path(img_or_path):
"""judge if this is a zip path."""
return '.zip@' in img_or_path
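# Zip-style paths embed the archive and the member path around '@', e.g.
# (hypothetical): ZipReader.imread('train.zip@/n01440764/img_0001.jpg') reads
# member 'n01440764/img_0001.jpg' from 'train.zip'.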
class ZipReader(object):
"""A class to read zipped files."""
zip_bank = dict()
def __init__(self):
super(ZipReader, self).__init__()
@staticmethod
def get_zipfile(path):
zip_bank = ZipReader.zip_bank
if path not in zip_bank:
zfile = zipfile.ZipFile(path, 'r')
zip_bank[path] = zfile
return zip_bank[path]
@staticmethod
def split_zip_style_path(path):
pos_at = path.find('@')
assert pos_at != -1, "character '@' is not found in the given path '%s'" % path
zip_path = path[0:pos_at]
folder_path = path[pos_at + 1:]
folder_path = str.strip(folder_path, '/')
return zip_path, folder_path
@staticmethod
def list_folder(path):
zip_path, folder_path = ZipReader.split_zip_style_path(path)
zfile = ZipReader.get_zipfile(zip_path)
folder_list = []
for file_folder_name in zfile.namelist():
file_folder_name = str.strip(file_folder_name, '/')
if file_folder_name.startswith(folder_path) and \
len(os.path.splitext(file_folder_name)[-1]) == 0 and \
file_folder_name != folder_path:
if len(folder_path) == 0:
folder_list.append(file_folder_name)
else:
folder_list.append(file_folder_name[len(folder_path) + 1:])
return folder_list
@staticmethod
def list_files(path, extension=None):
if extension is None:
extension = ['.*']
zip_path, folder_path = ZipReader.split_zip_style_path(path)
zfile = ZipReader.get_zipfile(zip_path)
file_lists = []
for file_folder_name in zfile.namelist():
file_folder_name = str.strip(file_folder_name, '/')
if file_folder_name.startswith(folder_path) and \
str.lower(os.path.splitext(file_folder_name)[-1]) in extension:
if len(folder_path) == 0:
file_lists.append(file_folder_name)
else:
file_lists.append(file_folder_name[len(folder_path) + 1:])
return file_lists
@staticmethod
def read(path):
zip_path, path_img = ZipReader.split_zip_style_path(path)
zfile = ZipReader.get_zipfile(zip_path)
data = zfile.read(path_img)
return data
@staticmethod
def imread(path):
zip_path, path_img = ZipReader.split_zip_style_path(path)
zfile = ZipReader.get_zipfile(zip_path)
data = zfile.read(path_img)
try:
im = Image.open(io.BytesIO(data))
except Exception:
print('ERROR IMG LOADED: ', path_img)
random_img = np.random.rand(224, 224, 3) * 255
im = Image.fromarray(np.uint8(random_img))
return im
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Any, Callable
import torch
import torch.distributed as dist
def _allreduce_fut(process_group: dist.ProcessGroup,
tensor: torch.Tensor) -> torch.futures.Future[torch.Tensor]:
'Averages the input gradient tensor by allreduce and returns a future.'
group_to_use = process_group if process_group is not None else dist.group.WORLD
# Apply the division first to avoid overflow, especially for FP16.
tensor.div_(group_to_use.size())
return (dist.all_reduce(
tensor, group=group_to_use,
async_op=True).get_future().then(lambda fut: fut.value()[0]))
def allreduce_hook(
process_group: dist.ProcessGroup,
bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
"""
This DDP communication hook just calls ``allreduce`` using ``GradBucket``
tensors. Once gradient tensors are aggregated across all workers, its ``then``
callback takes the mean and returns the result. If user registers this hook,
DDP results is expected to be same as the case where no hook was registered.
Hence, this won't change behavior of DDP and user can use this as a reference
or modify this hook to log useful information or any other purposes while
unaffecting DDP behavior.
Example::
>>> ddp_model.register_comm_hook(process_group, allreduce_hook)
"""
return _allreduce_fut(process_group, bucket.buffer())
def fp16_compress_hook(
process_group: dist.ProcessGroup,
bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
"""
This DDP communication hook implements a simple gradient compression
approach that casts ``GradBucket`` tensor to half-precision floating-point format (``torch.float16``)
and then divides it by the process group size.
It then allreduces those ``float16`` gradient tensors. Once the compressed gradient
tensors are allreduced, the chained callback ``decompress`` casts them back to the input data type (such as ``float32``).
Example::
>>> ddp_model.register_comm_hook(process_group, fp16_compress_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
compressed_tensor = bucket.buffer().to(torch.float16).div_(world_size)
fut = dist.all_reduce(compressed_tensor, group=group_to_use,
async_op=True).get_future()
def decompress(fut):
decompressed_tensor = bucket.buffer()
# Decompress in place to reduce the peak memory.
# See: https://github.com/pytorch/pytorch/issues/45968
decompressed_tensor.copy_(fut.value()[0])
return decompressed_tensor
return fut.then(decompress)
# TODO: create an internal helper function and extract the duplicate code in FP16_compress and BF16_compress.
def bf16_compress_hook(
process_group: dist.ProcessGroup,
bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
"""
Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
This DDP communication hook implements a simple gradient compression
approach that casts ``GradBucket`` tensor to half-precision
`Brain floating point format <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ (``torch.bfloat16``)
and then divides it by the process group size.
It then allreduces those ``bfloat16`` gradient tensors. Once the compressed gradient
tensors are allreduced, the chained callback ``decompress`` casts them back to the input data type (such as ``float32``).
Example::
>>> ddp_model.register_comm_hook(process_group, bf16_compress_hook)
"""
group_to_use = process_group if process_group is not None else dist.group.WORLD
world_size = group_to_use.size()
compressed_tensor = bucket.buffer().to(torch.bfloat16).div_(world_size)
fut = dist.all_reduce(compressed_tensor, group=group_to_use,
async_op=True).get_future()
def decompress(fut):
decompressed_tensor = bucket.buffer()
# Decompress in place to reduce the peak memory.
# See: https://github.com/pytorch/pytorch/issues/45968
decompressed_tensor.copy_(fut.value()[0])
return decompressed_tensor
return fut.then(decompress)
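# A minimal sketch of the shared helper suggested by the TODO above; the name
# `_compressed_allreduce_hook` and its signature are assumptions, not part of
# the original code. `fp16_compress_hook` and `bf16_compress_hook` could both
# delegate to it with dtype=torch.float16 and dtype=torch.bfloat16 respectively.
def _compressed_allreduce_hook(
        process_group: dist.ProcessGroup,
        bucket: dist.GradBucket,
        dtype: torch.dtype) -> torch.futures.Future[torch.Tensor]:
    group_to_use = process_group if process_group is not None else dist.group.WORLD
    world_size = group_to_use.size()
    # Compress and pre-divide to avoid overflow in low precision.
    compressed_tensor = bucket.buffer().to(dtype).div_(world_size)
    fut = dist.all_reduce(compressed_tensor, group=group_to_use,
                          async_op=True).get_future()

    def decompress(fut):
        decompressed_tensor = bucket.buffer()
        # Decompress in place to reduce the peak memory.
        decompressed_tensor.copy_(fut.value()[0])
        return decompressed_tensor

    return fut.then(decompress)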
def fp16_compress_wrapper(
hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
"""
This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
floating point format (``torch.float16``), and casts the resulting tensor of the given hook back to
the input data type, such as ``float32``.
Therefore, ``fp16_compress_hook`` is equivalent to ``fp16_compress_wrapper(allreduce_hook)``.
Example::
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, fp16_compress_wrapper(powerSGD_hook))
"""
def fp16_compress_wrapper_hook(
hook_state,
bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
# Cast bucket tensor to FP16.
bucket.set_buffer(bucket.buffer().to(torch.float16))
fut = hook(hook_state, bucket)
def decompress(fut):
decompressed_tensor = bucket.buffer()
# Decompress in place to reduce the peak memory.
# See: https://github.com/pytorch/pytorch/issues/45968
decompressed_tensor.copy_(fut.value())
return decompressed_tensor
# Decompress after hook has run.
return fut.then(decompress)
return fp16_compress_wrapper_hook
def bf16_compress_wrapper(
hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
"""
Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
    `Brain floating point format <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ (``torch.bfloat16``),
and casts the resulting tensor of the given hook back to the input data type, such as ``float32``.
Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``.
Example::
>>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
>>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook))
"""
def bf16_compress_wrapper_hook(
hook_state,
bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
# Cast bucket tensor to BF16.
bucket.set_buffer(bucket.buffer().to(torch.bfloat16))
fut = hook(hook_state, bucket)
def decompress(fut):
decompressed_tensor = bucket.buffer()
# Decompress in place to reduce the peak memory.
# See: https://github.com/pytorch/pytorch/issues/45968
decompressed_tensor.copy_(fut.value())
return decompressed_tensor
# Decompress after hook has run.
return fut.then(decompress)
return bf16_compress_wrapper_hook
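# Hedged usage note (illustrative, not part of the original file): per the
# docstrings above, registering the wrapped hook
#   ddp_model.register_comm_hook(None, bf16_compress_wrapper(allreduce_hook))
# is expected to behave the same as registering bf16_compress_hook directly.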
from contextlib import contextmanager
import deepspeed
import torch
import torch.nn as nn
from deepspeed.runtime.zero import GatheredParameters
class EMADeepspeed(nn.Module):
""" migrated from https://github.com/microsoft/DeepSpeed/issues/2056
"""
def __init__(self, model, decay=0.9999, use_num_updates=True):
super().__init__()
if decay < 0.0 or decay > 1.0:
raise ValueError('Decay must be between 0 and 1')
self.m_name2s_name = {}
self.decay = decay
self.num_updates = 0 if use_num_updates else -1
with GatheredParameters(model.parameters(), fwd_module=self):
for name, p in model.named_parameters():
if p.requires_grad:
# remove as '.'-character is not allowed in buffers
s_name = name.replace('.', '')
self.m_name2s_name.update({name: s_name})
self.register_buffer(s_name, p.clone().detach().data)
self.collected_params = []
def forward(self, model):
decay = self.decay
if self.num_updates >= 0:
self.num_updates += 1
decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
one_minus_decay = 1.0 - decay
shadow_params = dict(self.named_buffers())
with torch.no_grad():
with GatheredParameters(model.parameters()):
if deepspeed.comm.get_rank() == 0:
m_param = dict(model.named_parameters())
for key in m_param:
if m_param[key].requires_grad:
sname = self.m_name2s_name[key]
shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
else:
assert key not in self.m_name2s_name
def copy_to(self, model):
shadow_params = dict(self.named_buffers())
with GatheredParameters(model.parameters(), modifier_rank=0):
if deepspeed.comm.get_rank() == 0:
m_param = dict(model.named_parameters())
for key in m_param:
if m_param[key].requires_grad:
m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
else:
assert key not in self.m_name2s_name
def store(self, model):
"""
Save the current parameters for restoring later.
Args:
model: A model that parameters will be stored
"""
with GatheredParameters(model.parameters()):
if deepspeed.comm.get_rank() == 0:
parameters = model.parameters()
self.collected_params = [param.clone() for param in parameters]
def restore(self, model):
"""
Restore the parameters stored with the `store` method.
Useful to validate the model with EMA parameters without affecting the
original optimization process. Store the parameters before the
`copy_to` method. After validation (or model saving), use this to
restore the former parameters.
Args:
model: A model that to restore its parameters.
"""
with GatheredParameters(model.parameters(), modifier_rank=0):
if deepspeed.comm.get_rank() == 0:
parameters = model.parameters()
for c_param, param in zip(self.collected_params, parameters):
param.data.copy_(c_param.data)
@contextmanager
def activate(self, model):
try:
self.store(model)
self.copy_to(model)
yield
finally:
self.restore(model)
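# Hedged usage sketch (illustrative, not part of the original file). `engine`
# is assumed to be a deepspeed-initialized model engine; all names below are
# placeholders:
#
#   ema = EMADeepspeed(engine.module, decay=0.9999)
#   for samples, targets in loader:
#       loss = criterion(engine(samples), targets)
#       engine.backward(loss)
#       engine.step()
#       ema(engine.module)             # update the shadow weights
#   with ema.activate(engine.module):  # swap in EMA weights for validation
#       validate(engine.module)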
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import os
import time
import torch
from config import get_config
from models import build_model
from tqdm import tqdm
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str,
default='internimage_t_1k_224')
parser.add_argument('--ckpt_dir', type=str,
default='/mnt/petrelfs/share_data/huangzhenhang/code/internimage/checkpoint_dir/new/cls')
parser.add_argument('--onnx', default=False, action='store_true')
parser.add_argument('--trt', default=False, action='store_true')
args = parser.parse_args()
args.cfg = os.path.join('./configs', f'{args.model_name}.yaml')
args.ckpt = os.path.join(args.ckpt_dir, f'{args.model_name}.pth')
args.size = int(args.model_name.split('.')[0].split('_')[-1])
cfg = get_config(args)
return args, cfg
def get_model(args, cfg):
model = build_model(cfg)
ckpt = torch.load(args.ckpt, map_location='cpu')['model']
model.load_state_dict(ckpt)
return model
def speed_test(model, inputs):
    # warm-up
    for _ in tqdm(range(100)):
        _ = model(inputs)
    # speed test
    torch.cuda.synchronize()
    start = time.time()
    for _ in tqdm(range(100)):
        _ = model(inputs)
    torch.cuda.synchronize()  # wait for all kernels to finish before timing
    end = time.time()
    th = 100 / (end - start)
    print(f'elapsed time: {end - start:.3f}s, throughput: {th:.2f} iters/s')
def torch2onnx(args, cfg):
model = get_model(args, cfg).cuda()
# speed_test(model)
onnx_name = f'{args.model_name}.onnx'
torch.onnx.export(model,
torch.rand(1, 3, args.size, args.size).cuda(),
onnx_name,
input_names=['input'],
output_names=['output'])
return model
def onnx2trt(args):
from mmdeploy.backend.tensorrt import from_onnx
onnx_name = f'{args.model_name}.onnx'
from_onnx(
onnx_name,
args.model_name,
dict(
input=dict(
min_shape=[1, 3, args.size, args.size],
opt_shape=[1, 3, args.size, args.size],
max_shape=[1, 3, args.size, args.size],
)
),
max_workspace_size=2 ** 30,
)
def check(args, cfg):
from mmdeploy.backend.tensorrt.wrapper import TRTWrapper
model = get_model(args, cfg).cuda()
model.eval()
trt_model = TRTWrapper(f'{args.model_name}.engine',
['output'])
x = torch.randn(1, 3, args.size, args.size).cuda()
torch_out = model(x)
trt_out = trt_model(dict(input=x))['output']
print('torch out shape:', torch_out.shape)
print('trt out shape:', trt_out.shape)
print('max delta:', (torch_out - trt_out).abs().max())
print('mean delta:', (torch_out - trt_out).abs().mean())
speed_test(model, x)
speed_test(trt_model, dict(input=x))
def main():
args, cfg = get_args()
if args.onnx or args.trt:
torch2onnx(args, cfg)
        print('torch -> onnx: success')
if args.trt:
onnx2trt(args)
print('onnx -> trt: success')
check(args, cfg)
if __name__ == '__main__':
main()
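# Example invocation (hedged; the script filename and checkpoint paths are
# placeholders):
#   python export.py --model_name internimage_t_1k_224 --trt
# which exports torch -> onnx -> trt, checks the numerical deltas between the
# two backends, and times both.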
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import time
import torch
from mmcv.cnn import get_model_complexity_info
from mmcv.cnn.utils.flops_counter import flops_to_string, params_to_string
from models.intern_vit_6b import InternViT6B
from tqdm import tqdm
parser = argparse.ArgumentParser(description='Hyperparams')
parser.add_argument('config', nargs='?', type=str, default=None)
args = parser.parse_args()
configs = {
'a': {
'embed_dim': 3968,
'num_heads': 62,
'mlp_ratio': 4,
'depth': 32
},
'e': {
'embed_dim': 3200,
'num_heads': 50,
'mlp_ratio': 4,
'depth': 48
},
'f': {
'embed_dim': 3200,
'num_heads': 25,
'mlp_ratio': 4,
'depth': 48
},
'g': {
'embed_dim': 2496,
'num_heads': 39,
'mlp_ratio': 8,
'depth': 48
},
'i': {
'embed_dim': 2816,
'num_heads': 44,
'mlp_ratio': 4,
'depth': 64
},
'm': {
'embed_dim': 2496,
'num_heads': 39,
'mlp_ratio': 4,
'depth': 80
},
}
def sa_flops(h, w, dim):
    # Self-attention FLOPs for n = h * w tokens: the QK^T product and the
    # attention-times-V product each cost n^2 * dim, hence 2 * n^2 * dim.
    return 2 * h * w * h * w * dim
def get_flops(model, input_shape):
flops, params = get_model_complexity_info(model,
input_shape,
as_strings=False)
_, H, W = input_shape
print(flops, params)
for i in range(model.depth):
flops += sa_flops(H // model.patch_size, W // model.patch_size,
model.embed_dim)
return flops_to_string(flops), params_to_string(params)
if __name__ == '__main__':
input_shape = (3, 224, 224)
config = configs[args.config]
print(config)
model = InternViT6B(in_chans=3,
patch_size=14,
img_size=224,
pretrain_size=224,
qkv_bias=False,
drop_path_rate=0.0,
embed_dim=config['embed_dim'],
num_heads=config['num_heads'],
mlp_ratio=config['mlp_ratio'],
init_values=0.1,
qk_normalization=True,
depth=config['depth'],
use_flash_attn=True,
with_cp=True,
freeze_vit=True,
cls_target='cls_patch_concat',
num_classes=0,
attn_pool_num_heads=16,
clip_embed_dim=768,
head_norm_type='bn').to(torch.bfloat16)
for k, v in model.named_parameters():
v.requires_grad = True
if torch.cuda.is_available():
model.cuda()
model.eval()
flops, params = get_flops(model, input_shape)
split_line = '=' * 30
print(f'{split_line}\nInput shape: {input_shape}\n'
f'Flops: {flops}\nParams: {params}\n{split_line}')
    print('!!! Please be cautious if you use these results in papers. '
          'You may need to check whether all ops are supported and verify '
          'that the FLOPs computation is correct.')
image = torch.rand(128, 3, 224, 224).to(torch.bfloat16).cuda()
torch.cuda.synchronize()
start_time = time.time()
with torch.no_grad():
for i in tqdm(range(10)):
out = model(image)
torch.cuda.synchronize()
end_time = time.time()
print('warmup time: ', end_time - start_time)
torch.cuda.synchronize()
start_time = time.time()
with torch.no_grad():
for i in tqdm(range(50)):
out = model(image)
torch.cuda.synchronize()
end_time = time.time()
print('using time: ', (end_time - start_time))
print('FPS: ', 50 * 128 / (end_time - start_time))
print(config)
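# Example invocation (hedged; the script filename is a placeholder):
#   python get_flops.py f
# benchmarks the 3200-dim, 25-head, 48-layer variant under key 'f' above.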
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import functools
import logging
import os
import sys
from termcolor import colored
@functools.lru_cache()
def create_logger(output_dir, dist_rank=0, name=''):
# create logger
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
logger.propagate = False
# create formatter
fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \
colored('(%(filename)s %(lineno)d)', 'yellow') + \
': %(levelname)s %(message)s'
# create console handlers for master process
if dist_rank == 0:
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(
logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S'))
logger.addHandler(console_handler)
# create file handlers
file_handler = logging.FileHandler(os.path.join(
output_dir, f'log_rank{dist_rank}.txt'),
mode='a')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(
logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
logger.addHandler(file_handler)
return logger
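# Hedged usage sketch (illustrative, not part of the original file):
#
#   logger = create_logger(output_dir='./work_dirs', dist_rank=dist.get_rank(),
#                          name='intern_vit_6b')
#   logger.info('rank 0 also logs to stdout; every rank writes log_rank{N}.txt')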
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
from timm.scheduler.cosine_lr import CosineLRScheduler
from timm.scheduler.scheduler import Scheduler
from timm.scheduler.step_lr import StepLRScheduler
def build_scheduler(config, optimizer, n_iter_per_epoch):
num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch)
warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch)
decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS *
n_iter_per_epoch)
lr_scheduler = None
if config.TRAIN.LR_SCHEDULER.NAME == 'cosine':
lr_scheduler = CosineLRScheduler(
optimizer,
t_initial=num_steps,
# t_mul=1.,
lr_min=config.TRAIN.MIN_LR,
warmup_lr_init=config.TRAIN.WARMUP_LR,
warmup_t=warmup_steps,
cycle_limit=1,
t_in_epochs=False,
)
elif config.TRAIN.LR_SCHEDULER.NAME == 'linear':
lr_scheduler = LinearLRScheduler(
optimizer,
t_initial=num_steps,
lr_min_rate=0.01,
warmup_lr_init=config.TRAIN.WARMUP_LR,
warmup_t=warmup_steps,
t_in_epochs=False,
)
elif config.TRAIN.LR_SCHEDULER.NAME == 'step':
lr_scheduler = StepLRScheduler(
optimizer,
decay_t=decay_steps,
decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE,
warmup_lr_init=config.TRAIN.WARMUP_LR,
warmup_t=warmup_steps,
t_in_epochs=False,
)
return lr_scheduler
class LinearLRScheduler(Scheduler):
def __init__(
self,
optimizer: torch.optim.Optimizer,
t_initial: int,
lr_min_rate: float,
warmup_t=0,
warmup_lr_init=0.,
t_in_epochs=True,
noise_range_t=None,
noise_pct=0.67,
noise_std=1.0,
noise_seed=42,
initialize=True,
) -> None:
super().__init__(optimizer,
param_group_field='lr',
noise_range_t=noise_range_t,
noise_pct=noise_pct,
noise_std=noise_std,
noise_seed=noise_seed,
initialize=initialize)
self.t_initial = t_initial
self.lr_min_rate = lr_min_rate
self.warmup_t = warmup_t
self.warmup_lr_init = warmup_lr_init
self.t_in_epochs = t_in_epochs
if self.warmup_t:
self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t
for v in self.base_values]
super().update_groups(self.warmup_lr_init)
else:
self.warmup_steps = [1 for _ in self.base_values]
def _get_lr(self, t):
if t < self.warmup_t:
lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
else:
t = t - self.warmup_t
total_t = self.t_initial - self.warmup_t
lrs = [
v - ((v - v * self.lr_min_rate) * (t / total_t))
for v in self.base_values
]
return lrs
def get_epoch_values(self, epoch: int):
if self.t_in_epochs:
return self._get_lr(epoch)
else:
return None
def get_update_values(self, num_updates: int):
if not self.t_in_epochs:
return self._get_lr(num_updates)
else:
return None
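# Hedged sketch of the schedule `_get_lr` produces (illustrative numbers):
# with base lr 0.1, warmup_lr_init 0.0, warmup_t=100, t_initial=1000, and
# lr_min_rate=0.01,
#   t = 0..99  : lr ramps linearly from 0.0 up to 0.1
#   t = 100..  : lr decays linearly from 0.1 toward 0.1 * 0.01 = 0.001 at t=1000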
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import datetime
import os
import random
import subprocess
import time
from contextlib import suppress
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from config import get_config
from dataset import build_loader
from logger import create_logger
from lr_scheduler import build_scheduler
from models import build_model
from optimizer import build_optimizer
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.utils import ApexScaler, AverageMeter, ModelEma, accuracy
from utils import MyAverageMeter
from utils import NativeScalerWithGradNormCount as NativeScaler
from utils import (auto_resume_helper, get_grad_norm, load_checkpoint,
load_ema_checkpoint, load_pretrained, reduce_tensor,
save_checkpoint)
try:
from apex import amp
has_apex = True
except ImportError:
has_apex = False
# assert not has_apex, "The code is modified based on native amp"
has_native_amp = False
try:
if getattr(torch.cuda.amp, 'autocast') is not None:
has_native_amp = True
except AttributeError:
pass
TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
def obsolete_torch_version(torch_version, version_threshold):
return torch_version == 'parrots' or torch_version <= version_threshold
def parse_option():
parser = argparse.ArgumentParser(
'InternVL training and evaluation script', add_help=False)
parser.add_argument('--cfg',
type=str,
required=True,
metavar='FILE',
help='path to config file')
parser.add_argument(
'--opts',
help="Modify config options by adding 'KEY VALUE' pairs. ",
default=None,
nargs='+')
# easy config modification
parser.add_argument('--batch-size',
type=int,
help='batch size for single GPU')
parser.add_argument('--dataset',
type=str,
help='dataset name',
default=None)
parser.add_argument('--data-path', type=str, help='path to dataset')
parser.add_argument('--zip',
action='store_true',
help='use zipped dataset instead of folder dataset')
parser.add_argument(
'--cache-mode',
type=str,
default='part',
choices=['no', 'full', 'part'],
help='no: no cache, '
'full: cache all data, '
        'part: shard the dataset into non-overlapping pieces and cache only one piece'
)
parser.add_argument(
'--pretrained',
        help=
        'pretrained weights from a checkpoint; may be ImageNet-22K pretrained weights'
)
parser.add_argument('--resume', help='resume from checkpoint')
parser.add_argument('--accumulation-steps',
type=int,
default=1,
help='gradient accumulation steps')
parser.add_argument(
'--use-checkpoint',
action='store_true',
help='whether to use gradient checkpointing to save memory')
parser.add_argument(
'--amp-opt-level',
type=str,
default='O1',
choices=['O0', 'O1', 'O2'],
help='mixed precision opt level, if O0, no amp is used')
parser.add_argument(
'--output',
default='work_dirs',
type=str,
metavar='PATH',
help=
        'root of output folder; the full path is <output>/<model_name>/<tag> (default: work_dirs)'
)
parser.add_argument('--tag', help='tag of experiment')
parser.add_argument('--eval',
action='store_true',
help='Perform evaluation only')
parser.add_argument('--throughput',
action='store_true',
help='Test throughput only')
parser.add_argument('--save-ckpt-num', default=1, type=int)
parser.add_argument(
'--use-zero',
action='store_true',
help='whether to use ZeroRedundancyOptimizer (ZeRO) to save memory')
# distributed training
parser.add_argument('--local-rank',
type=int,
required=True,
help='local rank for DistributedDataParallel')
parser.add_argument('--launcher',
choices=['pytorch', 'slurm'],
default='pytorch')
args, unparsed = parser.parse_known_args()
config = get_config(args)
return args, config
@torch.no_grad()
def throughput(data_loader, model, logger):
model.eval()
for idx, (images, _) in enumerate(data_loader):
images = images.cuda(non_blocking=True)
batch_size = images.shape[0]
for i in range(50):
model(images)
torch.cuda.synchronize()
        logger.info('throughput averaged over 30 iterations')
tic1 = time.time()
for i in range(30):
model(images)
torch.cuda.synchronize()
tic2 = time.time()
logger.info(
f'batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}'
)
return
def main(config):
# prepare data loaders
dataset_train, dataset_val, dataset_test, data_loader_train, \
data_loader_val, data_loader_test, mixup_fn = build_loader(config)
# build runner
    logger.info(f'Creating model: {config.MODEL.TYPE}/{config.MODEL.NAME}')
model = build_model(config)
model.cuda()
logger.info(str(model))
# build optimizer
optimizer = build_optimizer(config, model)
if config.AMP_OPT_LEVEL != 'O0':
config.defrost()
if has_native_amp:
config.native_amp = True
use_amp = 'native'
elif has_apex:
config.apex_amp = True
use_amp = 'apex'
else:
use_amp = None
            logger.warning(
                'Neither APEX nor native Torch AMP is available; using float32. '
                'Install NVIDIA apex or upgrade to PyTorch 1.6+.')
config.freeze()
# setup automatic mixed-precision (AMP) loss scaling and op casting
amp_autocast = suppress # do nothing
loss_scaler = None
if config.AMP_OPT_LEVEL != 'O0':
if use_amp == 'apex':
model, optimizer = amp.initialize(model,
optimizer,
opt_level=config.AMP_OPT_LEVEL)
loss_scaler = ApexScaler()
if config.LOCAL_RANK == 0:
logger.info(
'Using NVIDIA APEX AMP. Training in mixed precision.')
if use_amp == 'native':
amp_autocast = torch.cuda.amp.autocast
loss_scaler = NativeScaler()
if config.LOCAL_RANK == 0:
logger.info(
'Using native Torch AMP. Training in mixed precision.')
else:
if config.LOCAL_RANK == 0:
logger.info('AMP not enabled. Training in float32.')
# put model on gpus
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False)
# try:
# model.register_comm_hook(state=None, hook=fp16_compress_hook)
# logger.info('using fp16_compress_hook!')
# except:
# logger.info("cannot register fp16_compress_hook!")
model_without_ddp = model.module
n_parameters = sum(p.numel() for p in model.parameters()
if p.requires_grad)
logger.info(f'number of params: {n_parameters}')
if hasattr(model_without_ddp, 'flops'):
flops = model_without_ddp.flops()
logger.info(f'number of GFLOPs: {flops / 1e9}')
# build learning rate scheduler
lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) \
if not config.EVAL_MODE else None
# build criterion
if config.AUG.MIXUP > 0.:
# smoothing is handled with mixup label transform
criterion = SoftTargetCrossEntropy()
elif config.MODEL.LABEL_SMOOTHING > 0.:
criterion = LabelSmoothingCrossEntropy(
smoothing=config.MODEL.LABEL_SMOOTHING)
else:
criterion = torch.nn.CrossEntropyLoss()
max_accuracy = 0.0
max_ema_accuracy = 0.0
# set auto resume
if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME:
resume_file = auto_resume_helper(config.OUTPUT)
if resume_file:
if config.MODEL.RESUME:
logger.warning(
f'auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}'
)
config.defrost()
config.MODEL.RESUME = resume_file
config.freeze()
logger.info(f'auto resuming from {resume_file}')
else:
logger.info(
f'no checkpoint found in {config.OUTPUT}, ignoring auto resume'
)
# set resume and pretrain
if config.MODEL.RESUME:
max_accuracy = load_checkpoint(config, model_without_ddp, optimizer,
lr_scheduler, loss_scaler, logger)
if data_loader_val is not None:
if config.DATA.DATASET == 'imagenet-real':
filenames = dataset_val.filenames()
filenames = [os.path.basename(item) for item in filenames]
from dataset.imagenet_real import RealLabelsImagenet
real_labels = RealLabelsImagenet(filenames, real_json='meta_data/real.json')
acc1, acc5, loss = validate_real(config, data_loader_val, model, real_labels, amp_autocast=amp_autocast)
logger.info(
f'ReaL Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
else:
acc1, acc5, loss = validate(config, data_loader_val, model, amp_autocast=amp_autocast)
logger.info(
f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
elif config.MODEL.PRETRAINED:
load_pretrained(config, model_without_ddp, logger)
if data_loader_val is not None:
acc1, acc5, loss = validate(config, data_loader_val, model, amp_autocast=amp_autocast)
logger.info(
f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
# evaluate EMA
model_ema = None
if config.TRAIN.EMA.ENABLE:
# Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
model_ema = ModelEma(model, decay=config.TRAIN.EMA.DECAY)
        logger.info('Using EMA with decay = %.8f' % config.TRAIN.EMA.DECAY)
if config.MODEL.RESUME:
load_ema_checkpoint(config, model_ema, logger)
            if config.DATA.DATASET == 'imagenet-real':
                # imagenet-real evaluation assumes a single GPU
                assert dist.get_world_size() == 1, 'imagenet-real should be tested with one GPU'
filenames = dataset_val.filenames()
filenames = [os.path.basename(item) for item in filenames]
from dataset.imagenet_real import RealLabelsImagenet
real_labels = RealLabelsImagenet(filenames, real_json='meta_data/real.json')
acc1, acc5, loss = validate_real(config, data_loader_val, model_ema.ema, real_labels,
amp_autocast=amp_autocast)
logger.info(
f'ReaL Accuracy of the ema network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
else:
acc1, acc5, loss = validate(config, data_loader_val, model_ema.ema, amp_autocast=amp_autocast)
logger.info(
f'Accuracy of the ema network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
if config.THROUGHPUT_MODE:
throughput(data_loader_val, model, logger)
if config.EVAL_MODE:
return
# train
logger.info('Start training')
start_time = time.time()
for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS):
data_loader_train.sampler.set_epoch(epoch)
train_one_epoch(config,
model,
criterion,
data_loader_train,
optimizer,
epoch,
mixup_fn,
lr_scheduler,
amp_autocast,
loss_scaler,
model_ema=model_ema)
if (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)) and config.TRAIN.OPTIMIZER.USE_ZERO:
optimizer.consolidate_state_dict(to=0)
if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)):
save_checkpoint(config,
epoch,
model_without_ddp,
max_accuracy,
optimizer,
lr_scheduler,
loss_scaler,
logger,
model_ema=model_ema)
if data_loader_val is not None and epoch % config.EVAL_FREQ == 0:
acc1, acc5, loss = validate(config, data_loader_val, model, epoch, amp_autocast=amp_autocast)
logger.info(
f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
if dist.get_rank() == 0 and acc1 > max_accuracy:
save_checkpoint(config,
epoch,
model_without_ddp,
max_accuracy,
optimizer,
lr_scheduler,
loss_scaler,
logger,
model_ema=model_ema,
best='best')
max_accuracy = max(max_accuracy, acc1)
logger.info(f'Max accuracy: {max_accuracy:.2f}%')
if config.TRAIN.EMA.ENABLE:
acc1, acc5, loss = validate(config, data_loader_val,
model_ema.ema, epoch, amp_autocast=amp_autocast)
logger.info(
f'Accuracy of the ema network on the {len(dataset_val)} test images: {acc1:.1f}%'
)
if dist.get_rank() == 0 and acc1 > max_ema_accuracy:
save_checkpoint(config,
epoch,
model_without_ddp,
max_accuracy,
optimizer,
lr_scheduler,
loss_scaler,
logger,
model_ema=model_ema,
best='ema_best')
max_ema_accuracy = max(max_ema_accuracy, acc1)
logger.info(f'Max ema accuracy: {max_ema_accuracy:.2f}%')
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
logger.info('Training time {}'.format(total_time_str))
def train_one_epoch(config,
model,
criterion,
data_loader,
optimizer,
epoch,
mixup_fn,
lr_scheduler,
amp_autocast=suppress,
loss_scaler=None,
model_ema=None):
model.train()
optimizer.zero_grad()
num_steps = len(data_loader)
batch_time = AverageMeter()
model_time = AverageMeter()
loss_meter = AverageMeter()
norm_meter = MyAverageMeter(300)
start = time.time()
end = time.time()
amp_type = torch.float16 if config.AMP_TYPE == 'float16' else torch.bfloat16
for idx, (samples, targets) in enumerate(data_loader):
iter_begin_time = time.time()
samples = samples.cuda(non_blocking=True)
targets = targets.cuda(non_blocking=True)
if mixup_fn is not None:
samples, targets = mixup_fn(samples, targets)
if not obsolete_torch_version(TORCH_VERSION,
(1, 9)) and config.AMP_OPT_LEVEL != 'O0':
with amp_autocast(dtype=amp_type):
outputs = model(samples)
else:
with amp_autocast():
outputs = model(samples)
if config.TRAIN.ACCUMULATION_STEPS > 1:
if not obsolete_torch_version(
TORCH_VERSION, (1, 9)) and config.AMP_OPT_LEVEL != 'O0':
with amp_autocast(dtype=amp_type):
loss = criterion(outputs, targets)
loss = loss / config.TRAIN.ACCUMULATION_STEPS
else:
with amp_autocast():
loss = criterion(outputs, targets)
loss = loss / config.TRAIN.ACCUMULATION_STEPS
if config.AMP_OPT_LEVEL != 'O0':
is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
grad_norm = loss_scaler(loss,
optimizer,
clip_grad=config.TRAIN.CLIP_GRAD,
parameters=model.parameters(),
create_graph=is_second_order,
update_grad=(idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0)
if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
optimizer.zero_grad()
if model_ema is not None:
model_ema.update(model)
else:
loss.backward()
if config.TRAIN.CLIP_GRAD:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), config.TRAIN.CLIP_GRAD)
else:
grad_norm = get_grad_norm(model.parameters())
if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
optimizer.step()
optimizer.zero_grad()
if model_ema is not None:
model_ema.update(model)
if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
lr_scheduler.step_update(epoch * num_steps + idx)
else:
if not obsolete_torch_version(
TORCH_VERSION, (1, 9)) and config.AMP_OPT_LEVEL != 'O0':
with amp_autocast(dtype=amp_type):
loss = criterion(outputs, targets)
else:
with amp_autocast():
loss = criterion(outputs, targets)
optimizer.zero_grad()
if config.AMP_OPT_LEVEL != 'O0':
is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
grad_norm = loss_scaler(loss,
optimizer,
clip_grad=config.TRAIN.CLIP_GRAD,
parameters=model.parameters(),
create_graph=is_second_order,
update_grad=(idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0)
if model_ema is not None:
model_ema.update(model)
else:
loss.backward()
if config.TRAIN.CLIP_GRAD:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), config.TRAIN.CLIP_GRAD)
else:
grad_norm = get_grad_norm(model.parameters())
optimizer.step()
if model_ema is not None:
model_ema.update(model)
lr_scheduler.step_update(epoch * num_steps + idx)
torch.cuda.synchronize()
loss_meter.update(loss.item(), targets.size(0))
if grad_norm is not None:
norm_meter.update(grad_norm.item())
batch_time.update(time.time() - end)
model_time.update(time.time() - iter_begin_time)
end = time.time()
if idx % config.PRINT_FREQ == 0:
lr = optimizer.param_groups[0]['lr']
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
etas = batch_time.avg * (num_steps - idx)
logger.info(
f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t'
f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f}/{norm_meter.var:.4f})\t'
f'mem {memory_used:.0f}MB')
epoch_time = time.time() - start
logger.info(
f'EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}'
)
@torch.no_grad()
def validate_real(config, data_loader, model, real_labels, amp_autocast=suppress):
# https://github.com/baaivision/EVA/blob/master/EVA-01/eva/engine_for_finetuning.py#L195
criterion = torch.nn.CrossEntropyLoss()
model.eval()
batch_time = AverageMeter()
loss_meter = AverageMeter()
acc1_meter = AverageMeter()
acc5_meter = AverageMeter()
end = time.time()
amp_type = torch.float16 if config.AMP_TYPE == 'float16' else torch.bfloat16
for idx, (images, target) in enumerate(data_loader):
images = images.cuda(non_blocking=True)
target = target.cuda(non_blocking=True)
if not obsolete_torch_version(TORCH_VERSION, (1, 9)) and config.AMP_OPT_LEVEL != 'O0':
with amp_autocast(dtype=amp_type):
output = model(images)
else:
with amp_autocast():
output = model(images)
# convert 22k to 1k to evaluate
if output.size(-1) == 21841:
convert_file = './meta_data/map22kto1k.txt'
with open(convert_file, 'r') as f:
convert_list = [int(line) for line in f.readlines()]
output = output[:, convert_list]
real_labels.add_result(output)
# measure accuracy and record loss
loss = criterion(output, target)
acc1, acc5 = accuracy(output, target, topk=(1, 5))
acc1 = reduce_tensor(acc1)
acc5 = reduce_tensor(acc5)
loss = reduce_tensor(loss)
loss_meter.update(loss.item(), target.size(0))
acc1_meter.update(acc1.item(), target.size(0))
acc5_meter.update(acc5.item(), target.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if idx % config.PRINT_FREQ == 0:
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
logger.info(f'Test: [{idx}/{len(data_loader)}]\t'
f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
f'Mem {memory_used:.0f}MB')
# real labels mode replaces topk values at the end
top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5)
print('* ReaL Acc@1 {:.3f} Acc@5 {:.3f} loss {losses:.3f}'
.format(top1a, top5a, losses=loss_meter.avg))
return top1a, top5a, loss_meter.avg
@torch.no_grad()
def validate(config, data_loader, model, epoch=None, amp_autocast=suppress):
criterion = torch.nn.CrossEntropyLoss()
model.eval()
batch_time = AverageMeter()
loss_meter = AverageMeter()
acc1_meter = AverageMeter()
acc5_meter = AverageMeter()
end = time.time()
amp_type = torch.float16 if config.AMP_TYPE == 'float16' else torch.bfloat16
for idx, (images, target) in enumerate(data_loader):
images = images.cuda(non_blocking=True)
target = target.cuda(non_blocking=True)
if not obsolete_torch_version(TORCH_VERSION, (1, 9)) and config.AMP_OPT_LEVEL != 'O0':
with amp_autocast(dtype=amp_type):
output = model(images)
else:
with amp_autocast():
output = model(images)
# convert 22k to 1k to evaluate
if output.size(-1) == 21841:
convert_file = './meta_data/map22kto1k.txt'
with open(convert_file, 'r') as f:
convert_list = [int(line) for line in f.readlines()]
output = output[:, convert_list]
if config.DATA.DATASET == 'imagenet_a':
from dataset.imagenet_a_r_indices import imagenet_a_mask
output = output[:, imagenet_a_mask]
elif config.DATA.DATASET == 'imagenet_r':
from dataset.imagenet_a_r_indices import imagenet_r_mask
output = output[:, imagenet_r_mask]
# measure accuracy and record loss
loss = criterion(output, target)
acc1, acc5 = accuracy(output, target, topk=(1, 5))
acc1 = reduce_tensor(acc1)
acc5 = reduce_tensor(acc5)
loss = reduce_tensor(loss)
loss_meter.update(loss.item(), target.size(0))
acc1_meter.update(acc1.item(), target.size(0))
acc5_meter.update(acc5.item(), target.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if idx % config.PRINT_FREQ == 0:
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
logger.info(f'Test: [{idx}/{len(data_loader)}]\t'
f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
f'Mem {memory_used:.0f}MB')
if epoch is not None:
logger.info(
f'[Epoch:{epoch}] * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}'
)
else:
logger.info(
f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
return acc1_meter.avg, acc5_meter.avg, loss_meter.avg
if __name__ == '__main__':
    args, config = parse_option()
    if config.AMP_OPT_LEVEL != 'O0':
        assert has_native_amp, 'Please update PyTorch (1.6+) to support AMP!'
# init distributed env
    if args.launcher == 'slurm':
print('\nDist init: SLURM')
rank = int(os.environ['SLURM_PROCID'])
gpu = rank % torch.cuda.device_count()
config.defrost()
config.LOCAL_RANK = gpu
config.freeze()
world_size = int(os.environ['SLURM_NTASKS'])
if 'MASTER_PORT' not in os.environ:
os.environ['MASTER_PORT'] = '29501'
node_list = os.environ['SLURM_NODELIST']
addr = subprocess.getoutput(
f'scontrol show hostname {node_list} | head -n1')
if 'MASTER_ADDR' not in os.environ:
os.environ['MASTER_ADDR'] = addr
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(gpu)
os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count())
os.environ['WORLD_SIZE'] = str(world_size)
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
rank = int(os.environ['RANK'])
world_size = int(os.environ['WORLD_SIZE'])
print(f'RANK and WORLD_SIZE in environ: {rank}/{world_size}')
else:
rank = -1
world_size = -1
torch.cuda.set_device(config.LOCAL_RANK)
torch.distributed.init_process_group(backend='nccl',
init_method='env://',
world_size=world_size,
rank=rank)
torch.distributed.barrier()
seed = config.SEED + dist.get_rank()
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
cudnn.benchmark = True
    # linearly scale the learning rate with the total batch size; may not be optimal
linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
    # gradient accumulation also needs to scale the learning rate
if config.TRAIN.ACCUMULATION_STEPS > 1:
linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS
config.defrost()
config.TRAIN.BASE_LR = linear_scaled_lr
config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
config.TRAIN.MIN_LR = linear_scaled_min_lr
    print(config.AMP_OPT_LEVEL, args.amp_opt_level)
config.freeze()
os.makedirs(config.OUTPUT, exist_ok=True)
logger = create_logger(output_dir=config.OUTPUT,
dist_rank=dist.get_rank(),
name=f'{config.MODEL.NAME}')
if dist.get_rank() == 0:
path = os.path.join(config.OUTPUT, 'config.json')
with open(path, 'w') as f:
f.write(config.dump())
logger.info(f'Full config saved to {path}')
# print config
logger.info(config.dump())
main(config)
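# Example SLURM launch (hedged; the script filename and config path are
# placeholders; --local-rank is required by argparse but is overwritten from
# SLURM_PROCID when --launcher slurm is set):
#   srun python main.py --cfg configs/intern_vit_6b_224px.yaml \
#       --local-rank 0 --launcher slurm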
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import datetime
import logging
import os
import random
import time
import warnings
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from accelerate import Accelerator, GradScalerKwargs
from accelerate.logging import get_logger
from config import get_config
from dataset import build_loader2
from ddp_hooks import fp16_compress_hook
from lr_scheduler import build_scheduler
from models import build_model
from optimizer import build_optimizer
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.utils import AverageMeter, ModelEma, accuracy
from tqdm import tqdm
from utils import load_ema_checkpoint, load_pretrained
logger = get_logger(__name__)
warnings.filterwarnings('ignore')
def parse_option():
parser = argparse.ArgumentParser(
'InternVL training and evaluation script', add_help=False)
parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file')
parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. ", default=None, nargs='+')
# easy config modification
parser.add_argument('--batch-size', type=int, help='batch size for single GPU')
parser.add_argument('--dataset', type=str, help='dataset name', default=None)
parser.add_argument('--data-path', type=str, help='path to dataset')
parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset')
parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'],
help='no: no cache, '
'full: cache all data, '
                        'part: shard the dataset into non-overlapping pieces and cache only one piece'
)
    parser.add_argument('--pretrained', help='pretrained weights from a checkpoint; may be ImageNet-22K pretrained weights')
parser.add_argument('--resume', help='resume from checkpoint')
parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH',
                        help='root of output folder; the full path is <output>/<model_name>/<tag> (default: work_dirs)'
)
parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
parser.add_argument('--throughput', action='store_true', help='Test throughput only')
parser.add_argument('--save-ckpt-num', default=1, type=int)
parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps')
    parser.add_argument('--disable-grad-scalar', action='store_true', help='disable GradScaler')
parser.add_argument(
'--logger',
type=str,
default='tensorboard',
choices=['tensorboard', 'wandb'],
help=(
'Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)'
' for experiment tracking and logging of model metrics and model checkpoints'
),
)
args, unparsed = parser.parse_known_args()
config = get_config(args)
config.defrost()
config.TRAIN.OPTIMIZER.USE_ZERO = False
config.OUTPUT += '_deepspeed'
config.DATA.IMG_ON_MEMORY = False
config.freeze()
return args, config
def seed_everything(seed, rank):
seed = seed + rank
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
cudnn.benchmark = True
def save_config(config):
path = os.path.join(config.OUTPUT, 'config.json')
with open(path, 'w') as f:
f.write(config.dump())
logger.info(f'Full config saved to {path}')
def build_criterion(config):
if config.AUG.MIXUP > 0.:
# smoothing is handled with mixup label transform
criterion = SoftTargetCrossEntropy()
elif config.MODEL.LABEL_SMOOTHING > 0.:
criterion = LabelSmoothingCrossEntropy(
smoothing=config.MODEL.LABEL_SMOOTHING)
else:
criterion = torch.nn.CrossEntropyLoss()
return criterion
def scale_learning_rate(config, num_processes):
    # linearly scale the learning rate with the total batch size; may not be optimal
linear_scaled_lr = config.TRAIN.BASE_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_min_lr = config.TRAIN.MIN_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0
    # gradient accumulation also needs to scale the learning rate
if config.TRAIN.ACCUMULATION_STEPS > 1:
linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS
config.defrost()
config.TRAIN.BASE_LR = linear_scaled_lr
config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
config.TRAIN.MIN_LR = linear_scaled_min_lr
config.freeze()
logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR))
logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR))
logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR))
def setup_autoresume(config):
if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME:
last_checkpoint = os.path.join(config.OUTPUT, 'last')
resume_file = last_checkpoint if os.path.exists(last_checkpoint) else None
if resume_file:
if config.MODEL.RESUME:
logger.warning(f'auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}')
config.defrost()
config.MODEL.RESUME = resume_file
config.freeze()
logger.info(f'auto resuming from {resume_file}')
else:
logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume')
def load_model_checkpoint(config, model, accelerator):
if config.MODEL.RESUME:
        try:
            checkpoint = torch.load(config.MODEL.RESUME)['model']
            checkpoint = {k.replace('module.', ''): v for k, v in checkpoint.items()}
            model.load_state_dict(checkpoint)
        except Exception:
            # Fall back to an accelerate-style checkpoint directory.
            accelerator.load_state(config.MODEL.RESUME)
    elif config.MODEL.PRETRAINED:
        try:
            load_pretrained(config, model, logger)
        except Exception:
            accelerator.load_state(config.MODEL.PRETRAINED)
return model
def save_checkpoint(save_dir, accelerator, epoch, max_acc, config, lr_scheduler=None):
    # Let accelerator handle the model and optimizer state for both DDP and DeepSpeed.
accelerator.save_state(save_dir)
if accelerator.is_main_process:
save_state = {
'lr_scheduler': lr_scheduler.state_dict(),
'max_acc': max_acc,
'epoch': epoch,
'config': config
}
torch.save(save_state, os.path.join(save_dir, 'additional_state.pth'))
def load_checkpoint_if_needed(accelerator, config, lr_scheduler=None):
setup_autoresume(config)
save_dir = config.MODEL.RESUME
if not save_dir:
return 0.0
accelerator.load_state(save_dir)
checkpoint = torch.load(os.path.join(save_dir, 'additional_state.pth'), map_location='cpu')
if lr_scheduler is not None:
logger.info('resuming lr_scheduler')
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
config.defrost()
config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
config.freeze()
max_acc = checkpoint.get('max_acc', 0.0)
logger.info(f"=> loaded successfully {config.MODEL.RESUME} (epoch {checkpoint['epoch']})")
return max_acc
def log_model_statistic(model_wo_ddp):
n_parameters = sum(p.numel() for p in model_wo_ddp.parameters()
if p.requires_grad)
logger.info(f'number of params: {n_parameters}')
if hasattr(model_wo_ddp, 'flops'):
flops = model_wo_ddp.flops()
logger.info(f'number of GFLOPs: {flops / 1e9}')
def train_epoch(*, model, optimizer, data_loader, scheduler, criterion, mixup_fn,
accelerator: Accelerator, epoch, config):
model.train()
num_steps = len(data_loader)
batch_time = AverageMeter()
model_time = AverageMeter()
loss_meter = AverageMeter()
end = time.time()
gradient_accumulation_steps = config.TRAIN.ACCUMULATION_STEPS
for step, (samples, targets) in enumerate(data_loader):
iter_begin_time = time.time()
if mixup_fn is not None:
samples, targets = mixup_fn(samples, targets)
with accelerator.accumulate(model):
outputs = model(samples)
loss = criterion(outputs, targets)
accelerator.backward(loss)
if accelerator.sync_gradients:
accelerator.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
optimizer.step()
optimizer.zero_grad()
accelerator.wait_for_everyone()
if (step + 1) % gradient_accumulation_steps == 0:
if scheduler is not None:
scheduler.step_update((epoch * num_steps + step) // gradient_accumulation_steps)
batch_time.update(time.time() - end)
model_time.update(time.time() - iter_begin_time)
loss_meter.update(loss.item())
end = time.time()
if accelerator.is_main_process and step % config.PRINT_FREQ == 0:
lr = optimizer.param_groups[0]['lr']
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
etas = batch_time.avg * (num_steps - step)
logger.info(
f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{step}/{num_steps}]\t'
f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.10f}\t'
f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t'
f'loss {loss_meter.val:.8f} ({loss_meter.avg:.4f})\t'
f'mem {memory_used:.0f}MB')
@torch.no_grad()
def eval_epoch(*, config, data_loader, model, accelerator: Accelerator):
model.eval()
acc1_meter = AverageMeter()
acc5_meter = AverageMeter()
    # Show the progress bar only on the main process.
    for idx, (images, target) in enumerate(tqdm(data_loader, disable=not accelerator.is_main_process)):
output = model(images)
# convert 22k to 1k to evaluate
if output.size(-1) == 21841:
convert_file = './meta_data/map22kto1k.txt'
with open(convert_file, 'r') as f:
convert_list = [int(line) for line in f.readlines()]
output = output[:, convert_list]
acc1, acc5 = accuracy(output, target, topk=(1, 5))
acc1 = accelerator.gather(acc1).mean(0)
acc5 = accelerator.gather(acc5).mean(0)
acc1_meter.update(acc1.item(), target.size(0))
acc5_meter.update(acc5.item(), target.size(0))
if (idx + 1) % config.PRINT_FREQ == 0 or idx + 1 == len(data_loader):
logger.info(f'Test: [{idx+1}/{len(data_loader)}]\t'
f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
)
return acc1_meter.avg
def evaluate(config, accelerator: Accelerator):
_, _, _, _, validate_dataloader, _, _ = build_loader2(config)
model = build_model(config)
model, validate_dataloader = accelerator.prepare(model, validate_dataloader)
model = load_model_checkpoint(config, model, accelerator)
log_model_statistic(accelerator.unwrap_model(model))
eval_epoch(config=config, data_loader=validate_dataloader, model=model, accelerator=accelerator)
def train(config, accelerator: Accelerator):
_, _, _, training_dataloader, validate_dataloader, _, mixup_fn = build_loader2(config)
model = build_model(config)
optimizer = build_optimizer(config, model)
criterion = build_criterion(config)
model, optimizer, training_dataloader, validate_dataloader = accelerator.prepare(
model, optimizer, training_dataloader, validate_dataloader)
effective_update_steps_per_epoch = len(training_dataloader) // config.TRAIN.ACCUMULATION_STEPS
lr_scheduler = build_scheduler(config, optimizer, effective_update_steps_per_epoch)
    try:
        model.register_comm_hook(state=None, hook=fp16_compress_hook)
        logger.info('using fp16_compress_hook!')
    except Exception:
        logger.info('cannot register fp16_compress_hook!')
max_acc = load_checkpoint_if_needed(accelerator, config, lr_scheduler)
    logger.info(f'Created model: {config.MODEL.TYPE}/{config.MODEL.NAME}')
logger.info(str(model))
logger.info('Effective Optimizer Steps: {}'.format(effective_update_steps_per_epoch))
logger.info('Start training')
logger.info('Max accuracy: {}'.format(max_acc))
log_model_statistic(accelerator.unwrap_model(model))
for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS):
train_epoch(model=model, optimizer=optimizer, data_loader=training_dataloader,
scheduler=lr_scheduler, criterion=criterion, mixup_fn=mixup_fn,
accelerator=accelerator, epoch=epoch, config=config)
acc = eval_epoch(config=config, data_loader=validate_dataloader, model=model,
accelerator=accelerator)
accelerator.wait_for_everyone()
if acc > max_acc:
max_acc = acc
save_checkpoint(os.path.join(config.OUTPUT, 'best'), accelerator, epoch, max_acc, config, lr_scheduler)
logger.info(f'Max Acc@1 {max_acc:.3f}')
save_checkpoint(os.path.join(config.OUTPUT, 'last'), accelerator, epoch, max_acc, config, lr_scheduler)
def main():
args, config = parse_option()
os.makedirs(config.OUTPUT, exist_ok=True)
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
filename=os.path.join(config.OUTPUT, 'run.log'),
level=logging.INFO,
)
loggers = ['tensorboard']
accelerator = Accelerator(
log_with=loggers,
project_dir=config.OUTPUT,
gradient_accumulation_steps=config.TRAIN.ACCUMULATION_STEPS,
        # When using DeepSpeed, do not comment this out,
        # even if you set the loss scale to 1.0 in the DeepSpeed config.
        kwargs_handlers=[GradScalerKwargs(enabled=not args.disable_grad_scalar)],
)
logger.info(accelerator.state, main_process_only=False)
scale_learning_rate(config, accelerator.num_processes)
seed_everything(config.SEED, accelerator.process_index)
save_config(config)
logger.info(config.dump())
    if config.EVAL_MODE:
        evaluate(config, accelerator)
else:
train(config, accelerator)
if __name__ == '__main__':
main()
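# Example launch with HF Accelerate (hedged; the filename main_accelerate.py is
# taken from a comment in the DeepSpeed variant below, and the config path is a
# placeholder):
#   accelerate launch main_accelerate.py --cfg configs/intern_vit_6b_224px.yaml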
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import argparse
import datetime
import os
import random
import subprocess
import time
import deepspeed
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from config import get_config
from dataset import build_loader
from ddp_hooks import fp16_compress_hook
from ema_deepspeed import EMADeepspeed
from logger import create_logger
from lr_scheduler import build_scheduler
from models import build_model
from optimizer import set_weight_decay_and_lr
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.utils import AverageMeter, accuracy
from utils import MyAverageMeter, load_pretrained, reduce_tensor
def parse_option():
parser = argparse.ArgumentParser(
'InternVL training and evaluation script', add_help=False)
parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file')
parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. ", default=None, nargs='+')
# easy config modification
parser.add_argument('--batch-size', type=int, help='batch size for single GPU')
parser.add_argument('--dataset', type=str, help='dataset name', default=None)
parser.add_argument('--data-path', type=str, help='path to dataset')
parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset')
parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'],
help='no: no cache, '
'full: cache all data, '
                        'part: shard the dataset into non-overlapping pieces and cache only one piece'
)
parser.add_argument('--pretrained',
                        help='pretrained weights from a checkpoint; may be ImageNet-22K pretrained weights')
parser.add_argument('--resume', help='resume from checkpoint')
parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH',
                        help='root of output folder; the full path is <output>/<model_name>/<tag> (default: work_dirs)'
)
parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
parser.add_argument('--throughput', action='store_true', help='Test throughput only')
parser.add_argument('--save-ckpt-num', default=1, type=int)
parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps')
# distributed training
parser.add_argument('--local-rank', type=int, required=True, help='local rank for DistributedDataParallel')
# deepspeed config
    parser.add_argument('--disable-grad-scalar', action='store_true', help='disable GradScaler')
parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'],
help='enable optimizer offloading')
parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'],
help='enable model offloading')
    # To use ZeRO-3, please use main_accelerate.py instead.
    # This script hits an issue similar to https://github.com/microsoft/DeepSpeed/issues/3068
    parser.add_argument('--zero-stage', type=int, default=1, choices=[1, 2], help='DeepSpeed ZeRO stage')
args, unparsed = parser.parse_known_args()
config = get_config(args)
return args, config
def seed_everything(seed, rank):
seed = seed + rank
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
cudnn.benchmark = True
def save_config(config):
path = os.path.join(config.OUTPUT, 'config.json')
with open(path, 'w') as f:
f.write(config.dump())
logger.info(f'Full config saved to {path}')
def build_criterion(config):
if config.AUG.MIXUP > 0.:
# smoothing is handled with mixup label transform
criterion = SoftTargetCrossEntropy()
elif config.MODEL.LABEL_SMOOTHING > 0.:
criterion = LabelSmoothingCrossEntropy(
smoothing=config.MODEL.LABEL_SMOOTHING)
else:
criterion = torch.nn.CrossEntropyLoss()
return criterion
def scale_learning_rate(config, num_processes):
    # linearly scale the learning rate with the total batch size; may not be optimal
linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * num_processes / 512.0
    # gradient accumulation also needs to scale the learning rate
if config.TRAIN.ACCUMULATION_STEPS > 1:
linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS
linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS
config.defrost()
config.TRAIN.BASE_LR = linear_scaled_lr
config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
config.TRAIN.MIN_LR = linear_scaled_min_lr
config.freeze()
logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR))
logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR))
logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR))
def log_model_statistic(model_wo_ddp):
n_parameters = sum(p.numel() for p in model_wo_ddp.parameters()
if p.requires_grad)
    logger.info(f'number of params: {n_parameters / 1e6:.2f} M')
if hasattr(model_wo_ddp, 'flops'):
flops = model_wo_ddp.flops()
logger.info(f'number of GFLOPs: {flops / 1e9}')
def get_parameter_groups(model, config):
skip = {}
skip_keywords = {}
if hasattr(model, 'no_weight_decay'):
skip = model.no_weight_decay()
if hasattr(model, 'no_weight_decay_keywords'):
skip_keywords = model.no_weight_decay_keywords()
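    # Parameters matched by `skip`/`skip_keywords` (typically norm and bias
    # terms) are placed into a zero-weight-decay group by set_weight_decay_and_lr.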
parameters = set_weight_decay_and_lr(
model,
config.TRAIN.WEIGHT_DECAY,
config.TRAIN.BASE_LR,
skip,
skip_keywords,
lr_layer_decay=config.TRAIN.LR_LAYER_DECAY,
lr_layer_decay_ratio=config.TRAIN.LR_LAYER_DECAY_RATIO,
freeze_backbone=config.TRAIN.OPTIMIZER.FREEZE_BACKBONE,
dcn_lr_mul=config.TRAIN.OPTIMIZER.DCN_LR_MUL,
)
return parameters
def get_optimizer_state_str(optimizer):
states = []
for param_group in optimizer.param_groups:
states.append(f'name={param_group["name"]} lr={param_group["lr"]} weight_decay={param_group["weight_decay"]}')
return '\n'.join(states)
def build_ds_config(config, args):
opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
if opt_lower == 'adamw':
optimizer = {
'type': 'AdamW',
'params': {
'lr': config.TRAIN.BASE_LR,
'eps': config.TRAIN.OPTIMIZER.EPS,
'betas': config.TRAIN.OPTIMIZER.BETAS,
'weight_decay': config.TRAIN.WEIGHT_DECAY
}
}
else:
        raise NotImplementedError(f'optimizer {opt_lower} is not supported by build_ds_config')
ds_config = {
'train_micro_batch_size_per_gpu': config.DATA.BATCH_SIZE,
'optimizer': optimizer,
'bf16': {
'enabled': True,
},
'zero_optimization': {
            'stage': args.zero_stage,
'allgather_partitions': True,
'allgather_bucket_size': 1e9,
'overlap_comm': True,
'reduce_scatter': True,
'reduce_bucket_size': 1e9,
'contiguous_gradients': True
},
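        # steps_per_print is set very high to effectively silence DeepSpeed's
        # own periodic logging; this script logs through `logger` instead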
'steps_per_print': 1e10,
'gradient_accumulation_steps': config.TRAIN.ACCUMULATION_STEPS,
'gradient_clipping': config.TRAIN.CLIP_GRAD,
}
return ds_config
@torch.no_grad()
def throughput(data_loader, model, logger):
model.eval()
for idx, (images, _) in enumerate(data_loader):
images = images.cuda(non_blocking=True)
batch_size = images.shape[0]
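        # 50 untimed forward passes warm up cuDNN autotuning and the CUDA
        # allocator so they do not distort the timed measurement below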
for i in range(50):
model(images)
torch.cuda.synchronize()
        logger.info('throughput averaged over 30 iterations')
tic1 = time.time()
for i in range(30):
model(images)
torch.cuda.synchronize()
tic2 = time.time()
logger.info(
f'batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}'
)
return
def train_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, model_ema=None):
model.train()
num_steps = len(data_loader)
batch_time = AverageMeter()
model_time = AverageMeter()
loss_meter = AverageMeter()
norm_meter = MyAverageMeter(300)
start = time.time()
end = time.time()
for idx, (samples, targets) in enumerate(data_loader):
iter_begin_time = time.time()
samples = samples.cuda(non_blocking=True)
targets = targets.cuda(non_blocking=True)
if mixup_fn is not None:
samples, targets = mixup_fn(samples, targets)
outputs = model(samples)
loss = criterion(outputs, targets)
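        # DeepSpeed engine API: backward()/step() replace loss.backward() and
        # optimizer.step(); the engine handles gradient accumulation and
        # clipping internally according to ds_config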
model.backward(loss)
model.step()
if model_ema is not None:
model_ema(model)
if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
lr_scheduler.step_update(epoch * num_steps + idx)
torch.cuda.synchronize()
loss_meter.update(loss.item(), targets.size(0))
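        # _global_grad_norm is an internal DeepSpeed attribute (not public API,
        # so it may change across versions) holding the latest global grad norm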
norm_meter.update(optimizer._global_grad_norm)
batch_time.update(time.time() - end)
model_time.update(time.time() - iter_begin_time)
end = time.time()
if idx % config.PRINT_FREQ == 0:
lr = optimizer.param_groups[0]['lr']
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
etas = batch_time.avg * (num_steps - idx)
logger.info(
f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t'
f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f}/{norm_meter.var:.4f})\t'
f'mem {memory_used:.0f}MB')
epoch_time = time.time() - start
logger.info(f'EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}')
@torch.no_grad()
def eval_epoch(config, data_loader, model, epoch=None):
criterion = torch.nn.CrossEntropyLoss()
model.eval()
batch_time = AverageMeter()
loss_meter = AverageMeter()
acc1_meter = AverageMeter()
acc5_meter = AverageMeter()
end = time.time()
for idx, (images, target) in enumerate(data_loader):
images = images.cuda(non_blocking=True)
target = target.cuda(non_blocking=True)
output = model(images)
        # map ImageNet-22K (21841-way) logits onto the 1K label space for evaluation
if output.size(-1) == 21841:
convert_file = './meta_data/map22kto1k.txt'
with open(convert_file, 'r') as f:
convert_list = [int(line) for line in f.readlines()]
output = output[:, convert_list]
# measure accuracy and record loss
loss = criterion(output, target)
acc1, acc5 = accuracy(output, target, topk=(1, 5))
acc1 = reduce_tensor(acc1)
acc5 = reduce_tensor(acc5)
loss = reduce_tensor(loss)
loss_meter.update(loss.item(), target.size(0))
acc1_meter.update(acc1.item(), target.size(0))
acc5_meter.update(acc5.item(), target.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if idx % config.PRINT_FREQ == 0:
memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
logger.info(f'Test: [{idx}/{len(data_loader)}]\t'
f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
f'Mem {memory_used:.0f}MB')
if epoch is not None:
logger.info(f'[Epoch:{epoch}] * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
else:
logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
return acc1_meter.avg, acc5_meter.avg, loss_meter.avg
def train(config, ds_config):
# -------------- build ---------------- #
_, dataset_val, _, data_loader_train, data_loader_val, _, mixup_fn = build_loader(config)
model = build_model(config)
model.cuda()
if config.MODEL.PRETRAINED:
load_pretrained(config, model, logger)
logger.info(ds_config)
model, optimizer, _, _ = deepspeed.initialize(
config=ds_config,
model=model,
model_parameters=get_parameter_groups(model, config),
dist_init_required=False,
)
try:
model.register_comm_hook(state=None, hook=fp16_compress_hook)
logger.info('using fp16_compress_hook!')
    except Exception:
logger.info('cannot register fp16_compress_hook!')
model_without_ddp = model.module
lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train))
criterion = build_criterion(config)
model_ema = None
if config.TRAIN.EMA.ENABLE:
model_ema = EMADeepspeed(model, config.TRAIN.EMA.DECAY)
# -------------- resume ---------------- #
max_accuracy = 0.0
max_accuracy_ema = 0.0
client_state = {}
if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME:
if os.path.exists(os.path.join(config.OUTPUT, 'latest')):
config.defrost()
config.MODEL.RESUME = config.OUTPUT
config.freeze()
tag = None
    elif config.MODEL.RESUME:
        tag = os.path.basename(config.MODEL.RESUME)
        config.defrost()
        config.MODEL.RESUME = os.path.dirname(config.MODEL.RESUME)
        config.freeze()
if config.MODEL.RESUME:
logger.info('loading checkpoint from {}'.format(config.MODEL.RESUME))
_, client_state = model.load_checkpoint(load_dir=config.MODEL.RESUME, tag=tag)
logger.info(f'client_state={client_state.keys()}')
lr_scheduler.load_state_dict(client_state['custom_lr_scheduler'])
max_accuracy = client_state['max_accuracy']
if model_ema is not None:
max_accuracy_ema = client_state.get('max_accuracy_ema', 0.0)
            model_ema.load_state_dict(client_state['model_ema'])
# -------------- training ---------------- #
    logger.info(f'Creating model: {config.MODEL.TYPE}/{config.MODEL.NAME}')
logger.info(str(model))
logger.info(get_optimizer_state_str(optimizer))
logger.info('Start training')
logger.info('max_accuracy: {}'.format(max_accuracy))
log_model_statistic(model_without_ddp)
start_time = time.time()
start_epoch = client_state['epoch'] + 1 if 'epoch' in client_state else config.TRAIN.START_EPOCH
for epoch in range(start_epoch, config.TRAIN.EPOCHS):
data_loader_train.sampler.set_epoch(epoch)
train_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler,
model_ema=model_ema)
if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.EPOCHS - 1:
model.save_checkpoint(
save_dir=config.OUTPUT,
tag=f'epoch{epoch}',
client_state={
'custom_lr_scheduler': lr_scheduler.state_dict(),
'max_accuracy': max_accuracy,
'epoch': epoch,
'config': config,
'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0,
'model_ema': model_ema.state_dict() if model_ema is not None else None,
}
)
if epoch % config.EVAL_FREQ == 0:
acc1, _, _ = eval_epoch(config, data_loader_val, model, epoch)
logger.info(f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%')
if acc1 > max_accuracy:
model.save_checkpoint(
save_dir=config.OUTPUT,
tag='best',
client_state={
'custom_lr_scheduler': lr_scheduler.state_dict(),
'max_accuracy': max_accuracy,
'epoch': epoch,
'config': config,
'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0,
'model_ema': model_ema.state_dict() if model_ema is not None else None,
}
)
max_accuracy = max(max_accuracy, acc1)
logger.info(f'Max accuracy: {max_accuracy:.2f}%')
if model_ema is not None:
with model_ema.activate(model):
acc1_ema, _, _ = eval_epoch(config, data_loader_val, model, epoch)
logger.info(f'[EMA] Accuracy of the network on the {len(dataset_val)} test images: {acc1_ema:.1f}%')
max_accuracy_ema = max(max_accuracy_ema, acc1_ema)
logger.info(f'[EMA] Max accuracy: {max_accuracy_ema:.2f}%')
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
logger.info('Training time {}'.format(total_time_str))
def evaluate(config):
_, _, _, _, data_loader_val, _, _ = build_loader(config)
model = build_model(config)
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False)
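    # broadcast_buffers=False skips re-broadcasting buffers (e.g. BN running
    # stats) on every forward; safe here since the model is only evaluated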
model_wo_ddp = model.module
if config.MODEL.RESUME:
try:
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
msg = model_wo_ddp.load_state_dict(checkpoint['model'], strict=False)
logger.info(msg)
        except Exception:
try:
from deepspeed.utils.zero_to_fp32 import \
get_fp32_state_dict_from_zero_checkpoint
ckpt_dir = os.path.dirname(config.MODEL.RESUME)
tag = os.path.basename(config.MODEL.RESUME)
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag)
model_wo_ddp.load_state_dict(state_dict)
            except Exception:
checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'),
map_location='cpu')
model_wo_ddp.load_state_dict(checkpoint['module'])
elif config.MODEL.PRETRAINED:
load_pretrained(config, model_wo_ddp, logger)
    if config.THROUGHPUT_MODE:
        throughput(data_loader_val, model, logger)
        return
    eval_epoch(config, data_loader_val, model)
if __name__ == '__main__':
args, config = parse_option()
# init distributed env
if 'SLURM_PROCID' in os.environ:
print('\nDist init: SLURM')
rank = int(os.environ['SLURM_PROCID'])
gpu = rank % torch.cuda.device_count()
config.defrost()
config.LOCAL_RANK = gpu
config.freeze()
world_size = int(os.environ['SLURM_NTASKS'])
if 'MASTER_PORT' not in os.environ:
os.environ['MASTER_PORT'] = '29501'
node_list = os.environ['SLURM_NODELIST']
addr = subprocess.getoutput(
f'scontrol show hostname {node_list} | head -n1')
if 'MASTER_ADDR' not in os.environ:
os.environ['MASTER_ADDR'] = addr
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(gpu)
os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count())
os.environ['WORLD_SIZE'] = str(world_size)
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
rank = int(os.environ['RANK'])
world_size = int(os.environ['WORLD_SIZE'])
print(f'RANK and WORLD_SIZE in environ: {rank}/{world_size}')
else:
rank = -1
world_size = -1
torch.cuda.set_device(config.LOCAL_RANK)
torch.distributed.init_process_group(backend='nccl',
init_method='env://',
world_size=world_size,
rank=rank)
torch.distributed.barrier()
os.makedirs(config.OUTPUT, exist_ok=True)
logger = create_logger(output_dir=config.OUTPUT,
dist_rank=dist.get_rank(),
name=f'{config.MODEL.NAME}')
logger.info(config.dump())
if dist.get_rank() == 0:
save_config(config)
scale_learning_rate(config, dist.get_world_size())
seed_everything(config.SEED, dist.get_rank())
if config.EVAL_MODE:
        evaluate(config)
else:
train(config, build_ds_config(config, args))
{
"n01440764": 0,
"n01443537": 1,
"n01484850": 2,
"n01491361": 3,
"n01494475": 4,
"n01496331": 5,
"n01498041": 6,
"n01514668": 7,
"n01514859": 8,
"n01518878": 9,
"n01530575": 10,
"n01531178": 11,
"n01532829": 12,
"n01534433": 13,
"n01537544": 14,
"n01558993": 15,
"n01560419": 16,
"n01580077": 17,
"n01582220": 18,
"n01592084": 19,
"n01601694": 20,
"n01608432": 21,
"n01614925": 22,
"n01616318": 23,
"n01622779": 24,
"n01629819": 25,
"n01630670": 26,
"n01631663": 27,
"n01632458": 28,
"n01632777": 29,
"n01641577": 30,
"n01644373": 31,
"n01644900": 32,
"n01664065": 33,
"n01665541": 34,
"n01667114": 35,
"n01667778": 36,
"n01669191": 37,
"n01675722": 38,
"n01677366": 39,
"n01682714": 40,
"n01685808": 41,
"n01687978": 42,
"n01688243": 43,
"n01689811": 44,
"n01692333": 45,
"n01693334": 46,
"n01694178": 47,
"n01695060": 48,
"n01697457": 49,
"n01698640": 50,
"n01704323": 51,
"n01728572": 52,
"n01728920": 53,
"n01729322": 54,
"n01729977": 55,
"n01734418": 56,
"n01735189": 57,
"n01737021": 58,
"n01739381": 59,
"n01740131": 60,
"n01742172": 61,
"n01744401": 62,
"n01748264": 63,
"n01749939": 64,
"n01751748": 65,
"n01753488": 66,
"n01755581": 67,
"n01756291": 68,
"n01768244": 69,
"n01770081": 70,
"n01770393": 71,
"n01773157": 72,
"n01773549": 73,
"n01773797": 74,
"n01774384": 75,
"n01774750": 76,
"n01775062": 77,
"n01776313": 78,
"n01784675": 79,
"n01795545": 80,
"n01796340": 81,
"n01797886": 82,
"n01798484": 83,
"n01806143": 84,
"n01806567": 85,
"n01807496": 86,
"n01817953": 87,
"n01818515": 88,
"n01819313": 89,
"n01820546": 90,
"n01824575": 91,
"n01828970": 92,
"n01829413": 93,
"n01833805": 94,
"n01843065": 95,
"n01843383": 96,
"n01847000": 97,
"n01855032": 98,
"n01855672": 99,
"n01860187": 100,
"n01871265": 101,
"n01872401": 102,
"n01873310": 103,
"n01877812": 104,
"n01882714": 105,
"n01883070": 106,
"n01910747": 107,
"n01914609": 108,
"n01917289": 109,
"n01924916": 110,
"n01930112": 111,
"n01943899": 112,
"n01944390": 113,
"n01945685": 114,
"n01950731": 115,
"n01955084": 116,
"n01968897": 117,
"n01978287": 118,
"n01978455": 119,
"n01980166": 120,
"n01981276": 121,
"n01983481": 122,
"n01984695": 123,
"n01985128": 124,
"n01986214": 125,
"n01990800": 126,
"n02002556": 127,
"n02002724": 128,
"n02006656": 129,
"n02007558": 130,
"n02009229": 131,
"n02009912": 132,
"n02011460": 133,
"n02012849": 134,
"n02013706": 135,
"n02017213": 136,
"n02018207": 137,
"n02018795": 138,
"n02025239": 139,
"n02027492": 140,
"n02028035": 141,
"n02033041": 142,
"n02037110": 143,
"n02051845": 144,
"n02056570": 145,
"n02058221": 146,
"n02066245": 147,
"n02071294": 148,
"n02074367": 149,
"n02077923": 150,
"n02085620": 151,
"n02085782": 152,
"n02085936": 153,
"n02086079": 154,
"n02086240": 155,
"n02086646": 156,
"n02086910": 157,
"n02087046": 158,
"n02087394": 159,
"n02088094": 160,
"n02088238": 161,
"n02088364": 162,
"n02088466": 163,
"n02088632": 164,
"n02089078": 165,
"n02089867": 166,
"n02089973": 167,
"n02090379": 168,
"n02090622": 169,
"n02090721": 170,
"n02091032": 171,
"n02091134": 172,
"n02091244": 173,
"n02091467": 174,
"n02091635": 175,
"n02091831": 176,
"n02092002": 177,
"n02092339": 178,
"n02093256": 179,
"n02093428": 180,
"n02093647": 181,
"n02093754": 182,
"n02093859": 183,
"n02093991": 184,
"n02094114": 185,
"n02094258": 186,
"n02094433": 187,
"n02095314": 188,
"n02095570": 189,
"n02095889": 190,
"n02096051": 191,
"n02096177": 192,
"n02096294": 193,
"n02096437": 194,
"n02096585": 195,
"n02097047": 196,
"n02097130": 197,
"n02097209": 198,
"n02097298": 199,
"n02097474": 200,
"n02097658": 201,
"n02098105": 202,
"n02098286": 203,
"n02098413": 204,
"n02099267": 205,
"n02099429": 206,
"n02099601": 207,
"n02099712": 208,
"n02099849": 209,
"n02100236": 210,
"n02100583": 211,
"n02100735": 212,
"n02100877": 213,
"n02101006": 214,
"n02101388": 215,
"n02101556": 216,
"n02102040": 217,
"n02102177": 218,
"n02102318": 219,
"n02102480": 220,
"n02102973": 221,
"n02104029": 222,
"n02104365": 223,
"n02105056": 224,
"n02105162": 225,
"n02105251": 226,
"n02105412": 227,
"n02105505": 228,
"n02105641": 229,
"n02105855": 230,
"n02106030": 231,
"n02106166": 232,
"n02106382": 233,
"n02106550": 234,
"n02106662": 235,
"n02107142": 236,
"n02107312": 237,
"n02107574": 238,
"n02107683": 239,
"n02107908": 240,
"n02108000": 241,
"n02108089": 242,
"n02108422": 243,
"n02108551": 244,
"n02108915": 245,
"n02109047": 246,
"n02109525": 247,
"n02109961": 248,
"n02110063": 249,
"n02110185": 250,
"n02110341": 251,
"n02110627": 252,
"n02110806": 253,
"n02110958": 254,
"n02111129": 255,
"n02111277": 256,
"n02111500": 257,
"n02111889": 258,
"n02112018": 259,
"n02112137": 260,
"n02112350": 261,
"n02112706": 262,
"n02113023": 263,
"n02113186": 264,
"n02113624": 265,
"n02113712": 266,
"n02113799": 267,
"n02113978": 268,
"n02114367": 269,
"n02114548": 270,
"n02114712": 271,
"n02114855": 272,
"n02115641": 273,
"n02115913": 274,
"n02116738": 275,
"n02117135": 276,
"n02119022": 277,
"n02119789": 278,
"n02120079": 279,
"n02120505": 280,
"n02123045": 281,
"n02123159": 282,
"n02123394": 283,
"n02123597": 284,
"n02124075": 285,
"n02125311": 286,
"n02127052": 287,
"n02128385": 288,
"n02128757": 289,
"n02128925": 290,
"n02129165": 291,
"n02129604": 292,
"n02130308": 293,
"n02132136": 294,
"n02133161": 295,
"n02134084": 296,
"n02134418": 297,
"n02137549": 298,
"n02138441": 299,
"n02165105": 300,
"n02165456": 301,
"n02167151": 302,
"n02168699": 303,
"n02169497": 304,
"n02172182": 305,
"n02174001": 306,
"n02177972": 307,
"n02190166": 308,
"n02206856": 309,
"n02219486": 310,
"n02226429": 311,
"n02229544": 312,
"n02231487": 313,
"n02233338": 314,
"n02236044": 315,
"n02256656": 316,
"n02259212": 317,
"n02264363": 318,
"n02268443": 319,
"n02268853": 320,
"n02276258": 321,
"n02277742": 322,
"n02279972": 323,
"n02280649": 324,
"n02281406": 325,
"n02281787": 326,
"n02317335": 327,
"n02319095": 328,
"n02321529": 329,
"n02325366": 330,
"n02326432": 331,
"n02328150": 332,
"n02342885": 333,
"n02346627": 334,
"n02356798": 335,
"n02361337": 336,
"n02363005": 337,
"n02364673": 338,
"n02389026": 339,
"n02391049": 340,
"n02395406": 341,
"n02396427": 342,
"n02397096": 343,
"n02398521": 344,
"n02403003": 345,
"n02408429": 346,
"n02410509": 347,
"n02412080": 348,
"n02415577": 349,
"n02417914": 350,
"n02422106": 351,
"n02422699": 352,
"n02423022": 353,
"n02437312": 354,
"n02437616": 355,
"n02441942": 356,
"n02442845": 357,
"n02443114": 358,
"n02443484": 359,
"n02444819": 360,
"n02445715": 361,
"n02447366": 362,
"n02454379": 363,
"n02457408": 364,
"n02480495": 365,
"n02480855": 366,
"n02481823": 367,
"n02483362": 368,
"n02483708": 369,
"n02484975": 370,
"n02486261": 371,
"n02486410": 372,
"n02487347": 373,
"n02488291": 374,
"n02488702": 375,
"n02489166": 376,
"n02490219": 377,
"n02492035": 378,
"n02492660": 379,
"n02493509": 380,
"n02493793": 381,
"n02494079": 382,
"n02497673": 383,
"n02500267": 384,
"n02504013": 385,
"n02504458": 386,
"n02509815": 387,
"n02510455": 388,
"n02514041": 389,
"n02526121": 390,
"n02536864": 391,
"n02606052": 392,
"n02607072": 393,
"n02640242": 394,
"n02641379": 395,
"n02643566": 396,
"n02655020": 397,
"n02666196": 398,
"n02667093": 399,
"n02669723": 400,
"n02672831": 401,
"n02676566": 402,
"n02687172": 403,
"n02690373": 404,
"n02692877": 405,
"n02699494": 406,
"n02701002": 407,
"n02704792": 408,
"n02708093": 409,
"n02727426": 410,
"n02730930": 411,
"n02747177": 412,
"n02749479": 413,
"n02769748": 414,
"n02776631": 415,
"n02777292": 416,
"n02782093": 417,
"n02783161": 418,
"n02786058": 419,
"n02787622": 420,
"n02788148": 421,
"n02790996": 422,
"n02791124": 423,
"n02791270": 424,
"n02793495": 425,
"n02794156": 426,
"n02795169": 427,
"n02797295": 428,
"n02799071": 429,
"n02802426": 430,
"n02804414": 431,
"n02804610": 432,
"n02807133": 433,
"n02808304": 434,
"n02808440": 435,
"n02814533": 436,
"n02814860": 437,
"n02815834": 438,
"n02817516": 439,
"n02823428": 440,
"n02823750": 441,
"n02825657": 442,
"n02834397": 443,
"n02835271": 444,
"n02837789": 445,
"n02840245": 446,
"n02841315": 447,
"n02843684": 448,
"n02859443": 449,
"n02860847": 450,
"n02865351": 451,
"n02869837": 452,
"n02870880": 453,
"n02871525": 454,
"n02877765": 455,
"n02879718": 456,
"n02883205": 457,
"n02892201": 458,
"n02892767": 459,
"n02894605": 460,
"n02895154": 461,
"n02906734": 462,
"n02909870": 463,
"n02910353": 464,
"n02916936": 465,
"n02917067": 466,
"n02927161": 467,
"n02930766": 468,
"n02939185": 469,
"n02948072": 470,
"n02950826": 471,
"n02951358": 472,
"n02951585": 473,
"n02963159": 474,
"n02965783": 475,
"n02966193": 476,
"n02966687": 477,
"n02971356": 478,
"n02974003": 479,
"n02977058": 480,
"n02978881": 481,
"n02979186": 482,
"n02980441": 483,
"n02981792": 484,
"n02988304": 485,
"n02992211": 486,
"n02992529": 487,
"n02999410": 488,
"n03000134": 489,
"n03000247": 490,
"n03000684": 491,
"n03014705": 492,
"n03016953": 493,
"n03017168": 494,
"n03018349": 495,
"n03026506": 496,
"n03028079": 497,
"n03032252": 498,
"n03041632": 499,
"n03042490": 500,
"n03045698": 501,
"n03047690": 502,
"n03062245": 503,
"n03063599": 504,
"n03063689": 505,
"n03065424": 506,
"n03075370": 507,
"n03085013": 508,
"n03089624": 509,
"n03095699": 510,
"n03100240": 511,
"n03109150": 512,
"n03110669": 513,
"n03124043": 514,
"n03124170": 515,
"n03125729": 516,
"n03126707": 517,
"n03127747": 518,
"n03127925": 519,
"n03131574": 520,
"n03133878": 521,
"n03134739": 522,
"n03141823": 523,
"n03146219": 524,
"n03160309": 525,
"n03179701": 526,
"n03180011": 527,
"n03187595": 528,
"n03188531": 529,
"n03196217": 530,
"n03197337": 531,
"n03201208": 532,
"n03207743": 533,
"n03207941": 534,
"n03208938": 535,
"n03216828": 536,
"n03218198": 537,
"n03220513": 538,
"n03223299": 539,
"n03240683": 540,
"n03249569": 541,
"n03250847": 542,
"n03255030": 543,
"n03259280": 544,
"n03271574": 545,
"n03272010": 546,
"n03272562": 547,
"n03290653": 548,
"n03291819": 549,
"n03297495": 550,
"n03314780": 551,
"n03325584": 552,
"n03337140": 553,
"n03344393": 554,
"n03345487": 555,
"n03347037": 556,
"n03355925": 557,
"n03372029": 558,
"n03376595": 559,
"n03379051": 560,
"n03384352": 561,
"n03388043": 562,
"n03388183": 563,
"n03388549": 564,
"n03393912": 565,
"n03394916": 566,
"n03400231": 567,
"n03404251": 568,
"n03417042": 569,
"n03424325": 570,
"n03425413": 571,
"n03443371": 572,
"n03444034": 573,
"n03445777": 574,
"n03445924": 575,
"n03447447": 576,
"n03447721": 577,
"n03450230": 578,
"n03452741": 579,
"n03457902": 580,
"n03459775": 581,
"n03461385": 582,
"n03467068": 583,
"n03476684": 584,
"n03476991": 585,
"n03478589": 586,
"n03481172": 587,
"n03482405": 588,
"n03483316": 589,
"n03485407": 590,
"n03485794": 591,
"n03492542": 592,
"n03494278": 593,
"n03495258": 594,
"n03496892": 595,
"n03498962": 596,
"n03527444": 597,
"n03529860": 598,
"n03530642": 599,
"n03532672": 600,
"n03534580": 601,
"n03535780": 602,
"n03538406": 603,
"n03544143": 604,
"n03584254": 605,
"n03584829": 606,
"n03590841": 607,
"n03594734": 608,
"n03594945": 609,
"n03595614": 610,
"n03598930": 611,
"n03599486": 612,
"n03602883": 613,
"n03617480": 614,
"n03623198": 615,
"n03627232": 616,
"n03630383": 617,
"n03633091": 618,
"n03637318": 619,
"n03642806": 620,
"n03649909": 621,
"n03657121": 622,
"n03658185": 623,
"n03661043": 624,
"n03662601": 625,
"n03666591": 626,
"n03670208": 627,
"n03673027": 628,
"n03676483": 629,
"n03680355": 630,
"n03690938": 631,
"n03691459": 632,
"n03692522": 633,
"n03697007": 634,
"n03706229": 635,
"n03709823": 636,
"n03710193": 637,
"n03710637": 638,
"n03710721": 639,
"n03717622": 640,
"n03720891": 641,
"n03721384": 642,
"n03724870": 643,
"n03729826": 644,
"n03733131": 645,
"n03733281": 646,
"n03733805": 647,
"n03742115": 648,
"n03743016": 649,
"n03759954": 650,
"n03761084": 651,
"n03763968": 652,
"n03764736": 653,
"n03769881": 654,
"n03770439": 655,
"n03770679": 656,
"n03773504": 657,
"n03775071": 658,
"n03775546": 659,
"n03776460": 660,
"n03777568": 661,
"n03777754": 662,
"n03781244": 663,
"n03782006": 664,
"n03785016": 665,
"n03786901": 666,
"n03787032": 667,
"n03788195": 668,
"n03788365": 669,
"n03791053": 670,
"n03792782": 671,
"n03792972": 672,
"n03793489": 673,
"n03794056": 674,
"n03796401": 675,
"n03803284": 676,
"n03804744": 677,
"n03814639": 678,
"n03814906": 679,
"n03825788": 680,
"n03832673": 681,
"n03837869": 682,
"n03838899": 683,
"n03840681": 684,
"n03841143": 685,
"n03843555": 686,
"n03854065": 687,
"n03857828": 688,
"n03866082": 689,
"n03868242": 690,
"n03868863": 691,
"n03871628": 692,
"n03873416": 693,
"n03874293": 694,
"n03874599": 695,
"n03876231": 696,
"n03877472": 697,
"n03877845": 698,
"n03884397": 699,
"n03887697": 700,
"n03888257": 701,
"n03888605": 702,
"n03891251": 703,
"n03891332": 704,
"n03895866": 705,
"n03899768": 706,
"n03902125": 707,
"n03903868": 708,
"n03908618": 709,
"n03908714": 710,
"n03916031": 711,
"n03920288": 712,
"n03924679": 713,
"n03929660": 714,
"n03929855": 715,
"n03930313": 716,
"n03930630": 717,
"n03933933": 718,
"n03935335": 719,
"n03937543": 720,
"n03938244": 721,
"n03942813": 722,
"n03944341": 723,
"n03947888": 724,
"n03950228": 725,
"n03954731": 726,
"n03956157": 727,
"n03958227": 728,
"n03961711": 729,
"n03967562": 730,
"n03970156": 731,
"n03976467": 732,
"n03976657": 733,
"n03977966": 734,
"n03980874": 735,
"n03982430": 736,
"n03983396": 737,
"n03991062": 738,
"n03992509": 739,
"n03995372": 740,
"n03998194": 741,
"n04004767": 742,
"n04005630": 743,
"n04008634": 744,
"n04009552": 745,
"n04019541": 746,
"n04023962": 747,
"n04026417": 748,
"n04033901": 749,
"n04033995": 750,
"n04037443": 751,
"n04039381": 752,
"n04040759": 753,
"n04041544": 754,
"n04044716": 755,
"n04049303": 756,
"n04065272": 757,
"n04067472": 758,
"n04069434": 759,
"n04070727": 760,
"n04074963": 761,
"n04081281": 762,
"n04086273": 763,
"n04090263": 764,
"n04099969": 765,
"n04111531": 766,
"n04116512": 767,
"n04118538": 768,
"n04118776": 769,
"n04120489": 770,
"n04125021": 771,
"n04127249": 772,
"n04131690": 773,
"n04133789": 774,
"n04136333": 775,
"n04141076": 776,
"n04141327": 777,
"n04141975": 778,
"n04146614": 779,
"n04147183": 780,
"n04149813": 781,
"n04152593": 782,
"n04153751": 783,
"n04154565": 784,
"n04162706": 785,
"n04179913": 786,
"n04192698": 787,
"n04200800": 788,
"n04201297": 789,
"n04204238": 790,
"n04204347": 791,
"n04208210": 792,
"n04209133": 793,
"n04209239": 794,
"n04228054": 795,
"n04229816": 796,
"n04235860": 797,
"n04238763": 798,
"n04239074": 799,
"n04243546": 800,
"n04251144": 801,
"n04252077": 802,
"n04252225": 803,
"n04254120": 804,
"n04254680": 805,
"n04254777": 806,
"n04258138": 807,
"n04259630": 808,
"n04263257": 809,
"n04264628": 810,
"n04265275": 811,
"n04266014": 812,
"n04270147": 813,
"n04273569": 814,
"n04275548": 815,
"n04277352": 816,
"n04285008": 817,
"n04286575": 818,
"n04296562": 819,
"n04310018": 820,
"n04311004": 821,
"n04311174": 822,
"n04317175": 823,
"n04325704": 824,
"n04326547": 825,
"n04328186": 826,
"n04330267": 827,
"n04332243": 828,
"n04335435": 829,
"n04336792": 830,
"n04344873": 831,
"n04346328": 832,
"n04347754": 833,
"n04350905": 834,
"n04355338": 835,
"n04355933": 836,
"n04356056": 837,
"n04357314": 838,
"n04366367": 839,
"n04367480": 840,
"n04370456": 841,
"n04371430": 842,
"n04371774": 843,
"n04372370": 844,
"n04376876": 845,
"n04380533": 846,
"n04389033": 847,
"n04392985": 848,
"n04398044": 849,
"n04399382": 850,
"n04404412": 851,
"n04409515": 852,
"n04417672": 853,
"n04418357": 854,
"n04423845": 855,
"n04428191": 856,
"n04429376": 857,
"n04435653": 858,
"n04442312": 859,
"n04443257": 860,
"n04447861": 861,
"n04456115": 862,
"n04458633": 863,
"n04461696": 864,
"n04462240": 865,
"n04465501": 866,
"n04467665": 867,
"n04476259": 868,
"n04479046": 869,
"n04482393": 870,
"n04483307": 871,
"n04485082": 872,
"n04486054": 873,
"n04487081": 874,
"n04487394": 875,
"n04493381": 876,
"n04501370": 877,
"n04505470": 878,
"n04507155": 879,
"n04509417": 880,
"n04515003": 881,
"n04517823": 882,
"n04522168": 883,
"n04523525": 884,
"n04525038": 885,
"n04525305": 886,
"n04532106": 887,
"n04532670": 888,
"n04536866": 889,
"n04540053": 890,
"n04542943": 891,
"n04548280": 892,
"n04548362": 893,
"n04550184": 894,
"n04552348": 895,
"n04553703": 896,
"n04554684": 897,
"n04557648": 898,
"n04560804": 899,
"n04562935": 900,
"n04579145": 901,
"n04579432": 902,
"n04584207": 903,
"n04589890": 904,
"n04590129": 905,
"n04591157": 906,
"n04591713": 907,
"n04592741": 908,
"n04596742": 909,
"n04597913": 910,
"n04599235": 911,
"n04604644": 912,
"n04606251": 913,
"n04612504": 914,
"n04613696": 915,
"n06359193": 916,
"n06596364": 917,
"n06785654": 918,
"n06794110": 919,
"n06874185": 920,
"n07248320": 921,
"n07565083": 922,
"n07579787": 923,
"n07583066": 924,
"n07584110": 925,
"n07590611": 926,
"n07613480": 927,
"n07614500": 928,
"n07615774": 929,
"n07684084": 930,
"n07693725": 931,
"n07695742": 932,
"n07697313": 933,
"n07697537": 934,
"n07711569": 935,
"n07714571": 936,
"n07714990": 937,
"n07715103": 938,
"n07716358": 939,
"n07716906": 940,
"n07717410": 941,
"n07717556": 942,
"n07718472": 943,
"n07718747": 944,
"n07720875": 945,
"n07730033": 946,
"n07734744": 947,
"n07742313": 948,
"n07745940": 949,
"n07747607": 950,
"n07749582": 951,
"n07753113": 952,
"n07753275": 953,
"n07753592": 954,
"n07754684": 955,
"n07760859": 956,
"n07768694": 957,
"n07802026": 958,
"n07831146": 959,
"n07836838": 960,
"n07860988": 961,
"n07871810": 962,
"n07873807": 963,
"n07875152": 964,
"n07880968": 965,
"n07892512": 966,
"n07920052": 967,
"n07930864": 968,
"n07932039": 969,
"n09193705": 970,
"n09229709": 971,
"n09246464": 972,
"n09256479": 973,
"n09288635": 974,
"n09332890": 975,
"n09399592": 976,
"n09421951": 977,
"n09428293": 978,
"n09468604": 979,
"n09472597": 980,
"n09835506": 981,
"n10148035": 982,
"n10565667": 983,
"n11879895": 984,
"n11939491": 985,
"n12057211": 986,
"n12144580": 987,
"n12267677": 988,
"n12620546": 989,
"n12768682": 990,
"n12985857": 991,
"n12998815": 992,
"n13037406": 993,
"n13040303": 994,
"n13044778": 995,
"n13052670": 996,
"n13054560": 997,
"n13133613": 998,
"n15075141": 999
}