Unverified Commit 5221a388 authored by Hu Di, committed by GitHub

[Feature] Support MMCV on IPU (#1882)

* implement runner on IPU

* adjust import

* adjust import

* add ignore for ipu on without ipu

* remove compilation cache

* remove ipu from mmcv/runner.__all__

* adjust IS_IPU and IS_MLU

* adjust by isort

* add ipuHardwareIsAvailable

* remove engine_cache

* code review 9
parent 42e7e2ee
...@@ -45,7 +45,7 @@ jobs:
       - name: Run unittests and generate coverage report
         run: |
           pip install -r requirements/test.txt
-          pytest tests/ --ignore=tests/test_runner --ignore=tests/test_optimizer.py --ignore=tests/test_cnn --ignore=tests/test_parallel.py --ignore=tests/test_ops --ignore=tests/test_load_model_zoo.py --ignore=tests/test_utils/test_logging.py --ignore=tests/test_image/test_io.py --ignore=tests/test_utils/test_registry.py --ignore=tests/test_utils/test_parrots_jit.py --ignore=tests/test_utils/test_trace.py --ignore=tests/test_utils/test_hub.py --ignore=tests/test_device/test_mlu/test_mlu_parallel.py
+          pytest tests/ --ignore=tests/test_runner --ignore=tests/test_device/test_ipu --ignore=tests/test_optimizer.py --ignore=tests/test_cnn --ignore=tests/test_parallel.py --ignore=tests/test_ops --ignore=tests/test_load_model_zoo.py --ignore=tests/test_utils/test_logging.py --ignore=tests/test_image/test_io.py --ignore=tests/test_utils/test_registry.py --ignore=tests/test_utils/test_parrots_jit.py --ignore=tests/test_utils/test_trace.py --ignore=tests/test_utils/test_hub.py --ignore=tests/test_device/test_mlu/test_mlu_parallel.py
   build_without_ops:
     runs-on: ubuntu-18.04
......
 # Copyright (c) OpenMMLab. All rights reserved.
-from . import mlu
+from . import ipu, mlu

-__all__ = ['mlu']
+__all__ = ['mlu', 'ipu']
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from .dataloader import IPUDataLoader
from .hook_wrapper import IPUFp16OptimizerHook
from .model_wrapper import ipu_model_wrapper
from .runner import IPUBaseRunner, IPUEpochBasedRunner, IPUIterBasedRunner
from .utils import cfg2options
__all__ = [
'cfg2options', 'ipu_model_wrapper', 'IPUFp16OptimizerHook',
'IPUDataLoader', 'IPUBaseRunner', 'IPUEpochBasedRunner',
'IPUIterBasedRunner'
]
# Copyright (c) OpenMMLab. All rights reserved.
from collections.abc import Mapping, Sequence
from functools import partial
import poptorch
from torch.utils.data.dataloader import default_collate
from mmcv.parallel import DataContainer
def collate(batch, samples_per_gpu=1):
"""Put each data field into a tensor/DataContainer with outer dimension
batch size.
TODO support for
:type:`~mmcv.parallel.DataContainer`. Currently, it will be ignored.
There are 3 cases.
1. cpu_only = True, e.g., meta data.
2. cpu_only = False, stack = True, e.g., images tensors.
3. cpu_only = False, stack = False, e.g., gt bboxes.
"""
if not isinstance(batch, Sequence):
raise TypeError(
f'`batch` should be a sequence, but got {type(batch)}.')
if isinstance(batch[0], DataContainer):
# TODO `DataContainer` will be supported in the future.
raise TypeError('DataContainer is not supported in ipu data loader.')
elif isinstance(batch[0], Sequence):
transposed = zip(*batch)
collated_batch = []
for samples in transposed:
if not isinstance(samples[0], DataContainer):
                # At present, we skip the processing of DataContainer,
                # which will reduce the performance of the IPU DataLoader
collated_batch.append(collate(samples, samples_per_gpu))
return collated_batch
elif isinstance(batch[0], Mapping):
collated_batch = {}
for key in batch[0]:
if not isinstance(batch[0][key], DataContainer):
                # At present, we skip the processing of DataContainer,
                # which will reduce the performance of the IPU DataLoader
                collated_batch[key] = collate([d[key] for d in batch],
                                              samples_per_gpu)
return collated_batch
else:
return default_collate(batch)
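# Illustrative sketch (editor's note, not part of the commit): how the
# `collate` above behaves on a batch of dict samples. Tensor fields fall
# through to `default_collate`, so the outer dimension becomes the batch size.
#   >>> batch = [{'img': torch.rand(3, 4)}, {'img': torch.rand(3, 4)}]
#   >>> collate(batch)['img'].shape
#   torch.Size([2, 3, 4])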
class IPUDataLoader(poptorch.DataLoader):
"""Thin wrapper of `torch.utils.data.DataLoader`.
Compared with the pytorch DataLoder, this DataLoder changes the way of
calculation of batch size and adds the AsynchronousDataAccessor to
load and release data faster in cpu mode.
If this data loader is used in a distributed execution environment, it will
ensure that each process uses a different subset of the dataset, providing
you first call ``options.randomSeed(N)`` with an integer N which is the
same across all hosts.
Args:
dataset (torch.utils.data.Dataset): The dataset to get the data from.
options (poptorch.Options): Options that will be used to compile
and run the model.
batch_size (int, optional): This is the batch size in the conventional
sense of being the size that runs through an operation in the model
at any given time.
shuffle (bool, optional): set to ``True`` to have the data reshuffled
at every epoch (default: ``False``).
num_workers (int, optional): how many subprocesses to use for data
loading. ``0`` means that the data will be loaded in the main
process. (default: ``0``)
drop_last (bool, optional): If True and the number of elements in the
dataset is not a multiple of the combined batch size then the
incomplete batch at the end will be dropped.
persistent_workers (bool, optional): Re-use workers between
iterations if True.
auto_distributed_partitioning (bool, optional): If True, partitions the
dataset for distributed execution automatically. Otherwise, it is
assumed that partitioning has been handled manually.
        mode (str, optional): If 'async', uses an
            :py:class:`~poptorch.AsynchronousDataAccessor`
            (``poptorch.DataLoaderMode.AsyncRebatched``) to access the
            dataset. If 'sync', accesses the dataset synchronously.
            Default: 'sync'.
async_options (Dict[str, Any], optional): Options to pass to
:py:class:`~poptorch.AsynchronousDataAccessor`.
        rebatched_worker_size (int, optional): When using AsyncRebatched:
            batch size of the tensors loaded by the workers.
            Defaults to the combined batch size.
            If specified, ``rebatched_worker_size`` must be less than
            or equal to the combined batch size.
kwargs (Dict[str, Any], optional): Other options to pass to PyTorch's
``DataLoader`` constructor.
"""
def __init__(self,
dataset,
options,
batch_size=1,
shuffle=False,
num_workers=0,
drop_last=True,
persistent_workers=True,
auto_distributed_partitioning=True,
mode='sync',
async_options=None,
rebatched_worker_size=None,
**kwargs):
"""Lazy init:
In many frameworks, the dataloader will be constructed before the
initialization of the ipu options, so the lazy init method is used
here, and the real initialization will not be done until the dataloader
needs to be used and the options are input.
"""
# lazy init: sometimes, we cannot get IPU options when build data
# loader
self.kwargs = {
'dataset': dataset,
'batch_size': batch_size,
'shuffle': shuffle,
'num_workers': num_workers,
'drop_last': drop_last,
'persistent_workers': persistent_workers,
'auto_distributed_partitioning': auto_distributed_partitioning,
'mode': mode,
'collate_fn': partial(collate, samples_per_gpu=batch_size),
'async_options': async_options,
'rebatched_worker_size': rebatched_worker_size,
**kwargs
}
self.dataset = dataset
self.initialized = False
if options:
self.init(options=options)
def init(self, options, **kwargs):
if not self.initialized:
kwargs = {**self.kwargs, **kwargs, 'options': options}
if kwargs['mode'] == 'sync':
kwargs['mode'] = poptorch.DataLoaderMode.Sync
elif kwargs['mode'] == 'async':
kwargs['mode'] = poptorch.DataLoaderMode.AsyncRebatched
if kwargs['async_options'] is None:
kwargs['async_options'] = {
'load_indefinitely': True,
'buffer_size': 8
}
if kwargs['rebatched_worker_size'] is None:
kwargs['rebatched_worker_size'] = 128
super().__init__(**kwargs)
self.initialized = True
return self
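# Illustrative sketch (editor's note; `my_dataset` is a hypothetical torch
# Dataset, and poptorch must be importable): lazy initialization lets the
# loader be built before the IPU options exist.
#   >>> loader = IPUDataLoader(my_dataset, None, batch_size=16, mode='async')
#   >>> opts = poptorch.Options()
#   >>> loader = loader.init(options=opts)  # real construction happens here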
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import numpy as np
import torch
from mmcv.parallel import DataContainer
# A customized None type for HierarchicalDataManager
HierarchicalDataNone = object()
class HierarchicalDataManager:
"""A class manage all the tensors in the hierarchical data.
At present, the input data structure accepted by IPU is limited,
when the input data structure of mmcv varies.
Here, an intermediate class is needed to get and update tensors
from the original data.
HierarchicalDataManager will record a hierarchical input/output data in
self._hierarchical_data. For example, we have an input data:
{'img': tensorA, 'label': tensorB, 'img_metas': [tensorC, tensorD]}
To enable IPU to use the input, HierarchicalDataManager will collect
the torch tensors from self._hierarchical_data into a tuple like:
(tensorA, tensorB, tensorC, tensorD).
Meanwhile, the return of IPU is a tuple of tensors, HierarchicalDataManager
also have a function named update_all_tensors to update tensors in
self._hierarchical_data which is the output for upper calls.
Args:
logger (:obj:`logging.Logger`): Logger used during running.
Defaults to None.
"""
def __init__(self, logger=None):
self.atomic_types = (int, str, float, np.ndarray, type(None))
self.warning = warnings.warn if logger is None else logger.warning
# enable or disable input data's shape and value check
self.quick_mode = False
self._hierarchical_data = None
def quick(self):
self.quick_mode = True
def compare_atomic_type(self, a, b):
"""Compare data, supported datatypes are numpy array and python basic
types."""
if isinstance(a, np.ndarray):
return np.all(a == b)
else:
return a == b
def record_hierarchical_data(self, data):
"""Record a hierarchical data."""
if self._hierarchical_data is not None:
if isinstance(data, torch.Tensor):
assert isinstance(self._hierarchical_data, torch.Tensor), \
'original hierarchical data is not torch.tensor'
self._hierarchical_data = data
else:
self.update_hierarchical_data(data)
else:
self._hierarchical_data = data
@property
def hierarchical_data(self):
return self._hierarchical_data
def update_hierarchical_data(self,
dataA,
dataB=HierarchicalDataNone,
strict=True,
address='data'):
"""Update dataB with dataA in-place.
Args:
dataA (list or dict or tuple): New hierarchical data.
dataB (list or dict or tuple): hierarchical data to update.
if not specified, self.hierarchical_data will be updated then.
strict (bool, optional): If true, an error will be reported
when the following conditions occur:
1. Non-torch.Tensor data changed.
2. Torch.Tensor data shape changed.
address (str): Record the address of current data to be updated.
Default: 'data'.
"""
if dataB is HierarchicalDataNone:
dataB = self.hierarchical_data
        # Update with data of the same structure but different values
        # (tensors and basic python data types)
if isinstance(dataA, (tuple, list)):
for idx, node in enumerate(dataA):
new_address = ''
if not self.quick_mode:
new_address = address + f'[{str(idx)}]'
assert isinstance(node, type(dataB[idx])),\
f'data structure changed: {new_address}'
if isinstance(node, torch.Tensor):
dataB[idx] = node
else:
self.update_hierarchical_data(
node, dataB[idx], strict, address=new_address)
elif isinstance(dataA, dict):
for k, v in dataA.items():
new_address = ''
if not self.quick_mode:
new_address = address + f'[{str(k)}]'
assert isinstance(v, type(dataB[k])),\
f'data structure changed: {new_address}'
if isinstance(v, torch.Tensor):
dataB[k] = v
else:
self.update_hierarchical_data(
v, dataB[k], strict, address=new_address)
elif isinstance(dataA, self.atomic_types):
if not self.quick_mode:
is_equal = self.compare_atomic_type(dataA, dataB)
if not is_equal:
if strict:
raise ValueError(
'all data except torch.Tensor should be same, '
f'but data({address}) is changed.')
else:
self.warning(
f'find a non-torch.Tensor data({type(dataA)}) '
f'changed, and the address is {address}')
elif isinstance(dataA, DataContainer):
if not self.quick_mode:
assert isinstance(dataB, DataContainer)
new_address = address + '.data'
self.update_hierarchical_data(
dataA.data, dataB.data, False, address=new_address)
else:
raise NotImplementedError(
f'not supported datatype:{type(dataA)}, address is {address}')
def collect_all_tensors(self, hierarchical_data=None):
"""Collect torch.Tensor data from self.hierarchical_data to a list and
return."""
# get a list of tensor from self._hierarchical_data
if hierarchical_data is None:
hierarchical_data = self._hierarchical_data
tensors = []
if isinstance(hierarchical_data, torch.Tensor):
tensors = [hierarchical_data]
else:
self._collect_tensors(hierarchical_data, tensors)
return tensors
def _collect_tensors(self, data, tensors):
if isinstance(data, (tuple, list)):
for node in data:
if isinstance(node, torch.Tensor):
tensors.append(node)
else:
self._collect_tensors(node, tensors)
elif isinstance(data, dict):
for v in data.values():
if isinstance(v, torch.Tensor):
tensors.append(v)
else:
self._collect_tensors(v, tensors)
elif isinstance(data, self.atomic_types):
pass
elif isinstance(data, DataContainer):
self._collect_tensors(data.data, tensors)
else:
raise NotImplementedError(f'not supported datatype:{type(data)}')
def update_all_tensors(self, tensors):
"""Put tensors from tuple back to self.hierarchical_data."""
if isinstance(self._hierarchical_data, torch.Tensor):
assert len(tensors) == 1
assert isinstance(tensors[0], torch.Tensor)
self._hierarchical_data = tensors[0]
else:
# convert to list if tensors is tuple
tensors = list(tensors)
self._set_tensors(self._hierarchical_data, tensors)
return self.hierarchical_data
def _set_tensors(self, data, tensors):
if isinstance(data, tuple):
data = list(data)
for idx in range(len(data)):
if isinstance(data[idx], torch.Tensor):
data[idx] = tensors.pop(0)
else:
self._set_tensors(data[idx], tensors)
data = tuple(data)
elif isinstance(data, list):
for idx in range(len(data)):
if isinstance(data[idx], torch.Tensor):
data[idx] = tensors.pop(0)
else:
self._set_tensors(data[idx], tensors)
elif isinstance(data, dict):
for k, v in data.items():
if isinstance(v, torch.Tensor):
data[k] = tensors.pop(0)
else:
self._set_tensors(v, tensors)
elif isinstance(data, self.atomic_types):
pass
elif isinstance(data, DataContainer):
self._set_tensors(data.data, tensors)
else:
raise NotImplementedError(f'not supported datatype:{type(data)}')
def clean_all_tensors(self):
"""Delete tensors from self.hierarchical_data."""
self._clean_tensors(self._hierarchical_data)
def _clean_tensors(self, data):
if isinstance(data, tuple):
data = list(data)
for idx in range(len(data)):
if isinstance(data[idx], torch.Tensor):
data[idx] = None
else:
self._clean_tensors(data[idx])
data = tuple(data)
elif isinstance(data, list):
for idx in range(len(data)):
if isinstance(data[idx], torch.Tensor):
data[idx] = None
else:
self._clean_tensors(data[idx])
elif isinstance(data, dict):
for k, v in data.items():
if isinstance(v, torch.Tensor):
data[k] = None
else:
self._clean_tensors(v)
elif isinstance(data, self.atomic_types):
pass
elif isinstance(data, DataContainer):
self._clean_tensors(data.data)
else:
raise NotImplementedError(f'not supported datatype:{type(data)}')
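# Illustrative sketch (editor's note, not part of the commit): the
# collect/update round trip described in the class docstring above.
#   >>> hdm = HierarchicalDataManager()
#   >>> hdm.record_hierarchical_data({'img': torch.rand(2), 'meta': 'x'})
#   >>> flat = hdm.collect_all_tensors()  # [tensor of shape (2,)]
#   >>> _ = hdm.update_all_tensors([flat[0] + 1])
#   >>> hdm.hierarchical_data['meta']  # non-tensor fields are untouched
#   'x'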
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.runner import HOOKS, LrUpdaterHook, OptimizerHook
from mmcv.utils import TORCH_VERSION, digit_version
def wrap_lr_updater_hook(lr_hook_class):
"""A wrapper function to wrap any subclass of LrUpdaterHook.
IPU needs extra operations to upload optimizer settings. This wrapper will
override function(_set_lr) of a subclass of LrUpdaterHook.
"""
assert issubclass(lr_hook_class, LrUpdaterHook)
class ipu_lr_hook_class(lr_hook_class):
def _set_lr(self, runner, *args, **kwargs):
super()._set_lr(runner, *args, **kwargs)
# convert torch optimizer to poptorch optimizer
runner.model.setOptimizer(runner.optimizer)
return ipu_lr_hook_class
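# Illustrative sketch (editor's note): wrapping a stock LR hook so that
# every LR change is also pushed to the poptorch model via `setOptimizer`.
#   >>> from mmcv.runner.hooks import StepLrUpdaterHook
#   >>> IPUStepLrUpdaterHook = wrap_lr_updater_hook(StepLrUpdaterHook)
#   >>> hook = IPUStepLrUpdaterHook(step=[8, 11])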
def wrap_optimizer_hook(optimizer_hook_class):
"""A wrapper function to wrap OptimizerHook.
This is an non-intrusive implementation of wrapping optimizer hook (or you
need to change every config file to use IPU optimizer hook) IPU's clip-norm
implementation is different from pytorch, so there should be an error
raised when using clip-norm.
"""
class ipu_optimizer_hook_class(OptimizerHook):
def __init__(self, **kwargs):
super().__init__(**kwargs)
if self.grad_clip is not None:
raise NotImplementedError('IPU does not support gradient clip')
return ipu_optimizer_hook_class
if (TORCH_VERSION != 'parrots'
and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
@HOOKS.register_module()
class IPUFp16OptimizerHook(OptimizerHook):
"""FP16 optimizer hook (using PyTorch's implementation).
If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
to take care of the optimization procedure.
Args:
loss_scale (float | str | dict): Scale factor configuration.
If loss_scale is a float, static loss scaling will be used with
the specified scale. If loss_scale is a string, it must be
'dynamic', then dynamic loss scaling will be used.
It can also be a dict containing arguments of GradScalar.
Defaults to 512. For Pytorch >= 1.6, mmcv uses official
implementation of GradScaler. If you use a dict version of
loss_scale to create GradScaler, please refer to:
https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
for the parameters.
Examples:
>>> loss_scale = dict(
... init_scale=65536.0,
... growth_factor=2.0,
... backoff_factor=0.5,
... growth_interval=2000
... )
>>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale)
"""
def __init__(self,
grad_clip=None,
coalesce=True,
bucket_size_mb=-1,
loss_scale=512.,
distributed=True):
assert grad_clip is None,\
'IPU mode does not support `grad_clip` currently'
            assert coalesce,\
                'only coalesced all reduce is supported in IPU mode currently'
assert bucket_size_mb == -1,\
'`bucket_size_mb` should not be set in IPU mode'
self.distributed = distributed
self._scale_update_param = None
if loss_scale == 'dynamic':
raise NotImplementedError(
'IPU mode does not support dynamic loss scale currently')
elif isinstance(loss_scale, float):
self.loss_scale = loss_scale
elif isinstance(loss_scale, dict):
raise NotImplementedError(
'IPU mode supports single scale currently')
else:
                raise ValueError(
                    f'loss_scale should be float, but got {loss_scale}')
def after_train_iter(self, runner):
pass
else:
raise RuntimeError('The IPU mode only supports torch 1.6 and above')
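# Illustrative sketch (editor's note): only a static float loss scale is
# accepted by IPUFp16OptimizerHook; 'dynamic' and dict configurations raise
# NotImplementedError, as the tests further below exercise.
#   >>> fp16_hook = IPUFp16OptimizerHook(loss_scale=512., distributed=False)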
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.runner import (HOOKS, RUNNERS, BaseRunner, EpochBasedRunner,
IterBasedRunner)
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from .dataloader import IPUDataLoader
from .hook_wrapper import (IPUFp16OptimizerHook, wrap_lr_updater_hook,
wrap_optimizer_hook)
from .model_wrapper import ipu_model_wrapper
from .utils import build_from_cfg_with_wrapper, cfg2options
class IPUBaseRunner(BaseRunner):
"""A base runner for IPU.
This runner has some extra processes for IPU which are shown below:
1. Parse options for IPU
2. wrap pytorch model for IPU
3. Raise errors while encountering illegal usage
4. Input IPU options and initialize dataloader if finding an instance
of IPUDataLoader
Args:
model (:obj:`nn.Module`): The model to run.
options_cfg (mmcv.Config, dict): Options that will be used to compile
and run the model.
modules_to_record (mmcv.Config, list): Index or name of modules which
will be recorded for output. It is necessary to specify output for
static graph of model training or inference.
ipu_model_cfg (mmcv.Config, dict): Config of model partition and
recomputing checkpoint
fp16_cfg (mmcv.Config): Config for fp16 training.
batch_processor (callable): A callable method that process a data
batch. Should be None for IPU runner
kwargs (Dict[str, Any], optional): Keyword arguments will be passed to
``base_runner.BaseRunner``.
"""
def __init__(self,
model,
options_cfg=None,
modules_to_record=None,
ipu_model_cfg=None,
fp16_cfg=None,
batch_processor=None,
**kwargs):
        assert hasattr(model, 'train_step') and batch_processor is None,\
            'only models with train_step are supported; ' \
            'batch_processor should be None'
if options_cfg is None:
options_cfg = {}
# call BaseRunner.__init__() here
super().__init__(model, **kwargs)
# process options of ipu
if IS_IPU_AVAILABLE:
self.options = cfg2options(options_cfg)
self.model = ipu_model_wrapper(
self.model,
self.options,
self.optimizer,
self.logger,
modules_to_record=modules_to_record,
ipu_model_cfg=ipu_model_cfg,
fp16_cfg=fp16_cfg)
else:
raise NotImplementedError('cpu mode on IPURunner is not supported')
def register_lr_hook(self, lr_config):
if lr_config is None:
return
assert isinstance(lr_config, dict)
assert 'policy' in lr_config
policy_type = lr_config.pop('policy')
        # If the type of policy is all in lower case,
        # e.g., 'cyclic', then its first letter will be capitalized,
        # e.g., to be 'Cyclic'.
        # This is for the convenient usage of Lr updater.
        # Since this is not applicable for `CosineAnnealingLrUpdater`,
        # the string will not be changed if it contains capital letters.
if policy_type == policy_type.lower():
policy_type = policy_type.title()
hook_type = policy_type + 'LrUpdaterHook'
lr_config['type'] = hook_type
hook = build_from_cfg_with_wrapper(lr_config, HOOKS,
wrap_lr_updater_hook)
self.register_hook(hook, priority='VERY_HIGH')
def register_optimizer_hook(self, optimizer_config):
if optimizer_config is None:
return
assert isinstance(optimizer_config, (dict, IPUFp16OptimizerHook))
if isinstance(optimizer_config, dict):
optimizer_config.setdefault('type', 'OptimizerHook')
hook = build_from_cfg_with_wrapper(optimizer_config, HOOKS,
wrap_optimizer_hook)
else:
hook = optimizer_config
self.register_hook(hook, priority='ABOVE_NORMAL')
def run(self, data_loaders, workflow, *args, **kwargs):
for i, flow in enumerate(workflow):
mode, _ = flow
# initialize IPU dataloader if not initialized
assert isinstance(data_loaders[i], IPUDataLoader),\
'IPU runner can only work with `IPUDataLoader`'
data_loaders[i].init(options=self.get_options(mode))
super().run(data_loaders, workflow, *args, **kwargs)
def get_options(self, mode):
if mode == 'train':
return self.options['training']
elif mode == 'val':
return self.options['inference']
else:
raise ValueError(f'mode should be train or val but got {mode}')
@RUNNERS.register_module()
class IPUEpochBasedRunner(IPUBaseRunner, EpochBasedRunner):
"""Epoch-based Runner for IPU.
The Inheritance order(MRO) is: IPUEpochBasedRunner -> IPUBaseRunner ->
EpochBasedRunner -> BaseRunner This runner train models epoch by epoch.
"""
pass
@RUNNERS.register_module()
class IPUIterBasedRunner(IPUBaseRunner, IterBasedRunner):
"""Iteration-based Runner for IPU.
The Inheritance order(MRO) is: IPUIterBasedRunner -> IPUBaseRunner ->
IterBasedRunner -> BaseRunner This runner train models iteration by
iteration.
"""
pass
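# Illustrative sketch (editor's note; `model`, `optimizer` and `work_dir`
# are placeholders): once registered, the IPU runners are buildable from
# config like any other runner.
#   >>> from mmcv.runner import build_runner
#   >>> cfg = dict(type='IPUIterBasedRunner', max_iters=1000,
#   ...            options_cfg=dict(train_cfg={}, eval_cfg={}))
#   >>> runner = build_runner(cfg, default_args=dict(
#   ...     model=model, optimizer=optimizer, work_dir=work_dir,
#   ...     logger=logging.getLogger()))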
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
import numpy as np
import popart
import poptorch
import torch
import torch.nn as nn
from mmcv.utils import Registry
def _options_assigner(cfg, options_node):
# set popart.options by config
# cfg: dict, python data type
# options_node: python module or function
if isinstance(cfg, dict):
for key in cfg:
_options_assigner(cfg[key], getattr(options_node, key))
elif isinstance(cfg, (int, float, str, list)):
if callable(options_node):
options_node(cfg)
else:
error_msg = f'options_node type {type(options_node)} not supported'
raise NotImplementedError(error_msg)
else:
error_msg = f'cfg type {type(cfg)} not supported'
raise NotImplementedError(error_msg)
def cfg2options(cfg):
"""Parse dictionary to ipu options.
Args:
cfg (dict): A dictionary of ipu settings.
Returns:
dict[str, poptorch.Options]: Training options and inference options
of IPU.
"""
# set ipu options for inference and training by config
train_cfg = cfg.pop('train_cfg', {})
eval_cfg = cfg.pop('eval_cfg', {})
eval_cfg['replicationFactor'] = 1 # eval mode only use one replica
eval_cfg['executionStrategy'] = 'ShardedExecution'
# overwrite default ipu cfg with specified train cfgs
training_ipu_cfg = {**cfg, **train_cfg}
# overwrite default ipu cfg with specified eval cfgs
inference_ipu_cfg = {**cfg, **eval_cfg}
ipu_options = {
'training': _cast_to_options(training_ipu_cfg),
'inference': _cast_to_options(inference_ipu_cfg)
}
    # TODO: make these settings configurable
ipu_options['training']._Popart.set('disableGradAccumulationTensorStreams',
True)
ipu_options['training']._Popart.set(
'accumulateOuterFragmentSettings.schedule',
int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized))
ipu_options['training'].Precision.enableStochasticRounding(True)
return ipu_options
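# Illustrative sketch (editor's note): a minimal config split into training
# and inference options by `cfg2options` above.
#   >>> opts = cfg2options(dict(randomSeed=888, train_cfg={}, eval_cfg={}))
#   >>> sorted(opts.keys())
#   ['inference', 'training']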
def _cast_to_options(cfg):
    # If an option cannot be directly assigned, parse it with an if
    # statement; otherwise assign it with _options_assigner
options = poptorch.Options()
if 'availableMemoryProportion' in cfg:
available_memory_proportion = cfg.pop('availableMemoryProportion')
mem_props = {}
for i, mem_prop in enumerate(available_memory_proportion):
mem_props[f'IPU{i}'] = mem_prop
options.setAvailableMemoryProportion(mem_props)
if 'executionStrategy' in cfg:
execution_strategy = cfg.pop('executionStrategy')
if execution_strategy == 'SameAsIpu':
options.setExecutionStrategy(
poptorch.PipelinedExecution(
getattr(poptorch.AutoStage, execution_strategy)))
elif execution_strategy == 'ShardedExecution':
options.setExecutionStrategy(poptorch.ShardedExecution())
else:
raise NotImplementedError(
'executionStrategy should be "SameAsIpu" or "ShardedExecution"'
f', but got {execution_strategy}')
if 'partialsType' in cfg:
partials_type = cfg.pop('partialsType')
options.Precision.setPartialsType(getattr(
torch, partials_type)) # half or float
_options_assigner(cfg, options)
return options
def model_sharding(model, split_edges):
    """Split a model in-place across multiple IPUs.

    Args:
        model (nn.Module): The target model to be split.
        split_edges (list of dict): Model layer names or layer numbers
            of the split edges. Each item of ``split_edges`` is a
            dictionary, which may contain the following key-pairs:

            - layer_to_call: PyTorch module to assign to the block.
            - user_id (optional): A user-defined identifier for the block.
            - ipu_id: The id of the IPU to run on.
Examples:
>>> split_edges = [
... dict(layer_to_call='model.conv1', ipu_id=0),
... dict(layer_to_call='model.conv3', ipu_id=1)]
>>> sharding_model = model_sharding(torch_model, split_edges)
Returns:
nn.Module: Split model.
"""
    if len(split_edges) == 0:
        return model
    assert isinstance(split_edges, list)
    split_edges_dict = {edge['layer_to_call']: edge for edge in split_edges}
    for idx, (name, module) in enumerate(model.named_modules()):
        if idx in split_edges_dict and name in split_edges_dict:
            raise ValueError(
                'The same layer is referenced twice while doing model'
                f' partition: idx is {idx} and name is {name}')
        edge = split_edges_dict.pop(name, None)
        edge = split_edges_dict.pop(idx, edge)
        if edge is not None:
            poptorch.BeginBlock(module, edge.get('user_id', name),
                                edge['ipu_id'])
    # ensure all split_edges are used
    if len(split_edges_dict) > 0:
        split_edge_names = list(split_edges_dict.keys())
        raise RuntimeError(
            f'split_edges: {split_edge_names} are not contained in the model')
    return model
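# Illustrative sketch (editor's note; 'conv1'/'conv2' are hypothetical layer
# names): splitting a model across two IPUs by layer name.
#   >>> split_edges = [dict(layer_to_call='conv1', ipu_id=0),
#   ...                dict(layer_to_call='conv2', ipu_id=1)]
#   >>> model = model_sharding(model, split_edges)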
def recomputation_checkpoint(model: nn.Module, module_names: list):
"""Annotates the output of a module to be checkpointed instead of
recomputed.
If recomputation mode is enabled, ipu will release the activations of
the middle layers to save memory. During the backward of gradient,
the activation of the middle layer will be recalculated again.
This function is used to declare the activations of some intermediate
layers that need to be saved in order to skip the recomputation of
some layers.
Args:
model (nn.Module): The target model to apply recomputation
checkpoint.
module_names (list): Layer names of module.
"""
def recompute_outputs(module, inputs, outputs):
if isinstance(outputs, tuple):
return tuple(poptorch.recomputationCheckpoint(y) for y in outputs)
else:
return poptorch.recomputationCheckpoint(outputs)
for name, module in model.named_modules():
if name in module_names:
module.register_forward_hook(recompute_outputs)
module_names.remove(name)
# check all module_names are used
assert len(module_names) == 0,\
f'recomputed nodes: {module_names} are not contained in the model'
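# Illustrative sketch (editor's note; 'bn' is a hypothetical layer name):
# keep the output of the module named 'bn' instead of recomputing it in
# the backward pass.
#   >>> recomputation_checkpoint(model, ['bn'])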
def compare_ndarray(featA, featB, rtol=1e-3, atol=1e-5):
"""Align data between two activations or weights."""
try:
np.testing.assert_allclose(featA, featB, rtol=rtol, atol=atol)
except AssertionError as e:
print(e)
def build_from_cfg_with_wrapper(cfg,
registry,
wrapper_func=None,
default_args=None):
"""Build a module from config dict and wrap module with "wrapper_func".
Args:
cfg (dict): Config dict. It should at least contain the key "type".
registry (:obj:`Registry`): The registry to search the type from.
default_args (dict, optional): Default initialization arguments.
wrapper_func (function): Used to wrap class
Returns:
object: The constructed object.
"""
if not isinstance(cfg, dict):
raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
if 'type' not in cfg:
if default_args is None or 'type' not in default_args:
raise KeyError(
'`cfg` or `default_args` must contain the key "type", '
f'but got {cfg}\n{default_args}')
if not isinstance(registry, Registry):
raise TypeError('registry must be an mmcv.Registry object, '
f'but got {type(registry)}')
if not (isinstance(default_args, dict) or default_args is None):
raise TypeError('default_args must be a dict or None, '
f'but got {type(default_args)}')
args = cfg.copy()
if default_args is not None:
for name, value in default_args.items():
args.setdefault(name, value)
obj_type = args.pop('type')
if isinstance(obj_type, str):
obj_cls = registry.get(obj_type)
if obj_cls is None:
raise KeyError(
f'{obj_type} is not in the {registry.name} registry')
elif inspect.isclass(obj_type):
obj_cls = obj_type
else:
raise TypeError(
f'type must be a str or valid type, but got {type(obj_type)}')
if wrapper_func is None:
wrapped_obj_cls = obj_cls
else:
wrapped_obj_cls = wrapper_func(obj_cls)
try:
return wrapped_obj_cls(**args)
except Exception as e:
# Normal TypeError does not print class name.
raise type(e)(f'{wrapped_obj_cls.__name__}: {e}')
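# Illustrative sketch (editor's note): building a registered hook and
# wrapping it in one call; `HOOKS` comes from mmcv.runner and
# `wrap_lr_updater_hook` from the hook_wrapper module above.
#   >>> from mmcv.runner import HOOKS
#   >>> hook = build_from_cfg_with_wrapper(
#   ...     dict(type='StepLrUpdaterHook', step=[8, 11]),
#   ...     HOOKS, wrap_lr_updater_hook)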
...@@ -40,6 +40,9 @@ from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS,
 from .priority import Priority, get_priority
 from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed
+# initialize ipu to register the ipu runner to RUNNERS
+from mmcv.device import ipu  # isort:skip # noqa
+
 __all__ = [
     'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer',
     'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
......
...@@ -63,7 +63,7 @@ def cast_tensor_type(inputs, src_type, dst_type):
     return inputs

-def auto_fp16(apply_to=None, out_fp32=False):
+def auto_fp16(apply_to=None, out_fp32=False, supported_types=(nn.Module, )):
     """Decorator to enable fp16 training automatically.

     This decorator is useful when you write custom modules and want to support
...@@ -76,7 +76,8 @@ def auto_fp16(apply_to=None, out_fp32=False):
         apply_to (Iterable, optional): The argument names to be converted.
             `None` indicates all arguments.
         out_fp32 (bool): Whether to convert the output back to fp32.
+        supported_types (tuple): Classes can be decorated by ``auto_fp16``.
+            `New in version 1.5.0.`

     Example:
         >>> import torch.nn as nn
...@@ -102,9 +103,9 @@ def auto_fp16(apply_to=None, out_fp32=False):
         def new_func(*args, **kwargs):
             # check if the module has set the attribute `fp16_enabled`, if not,
             # just fallback to the original method.
-            if not isinstance(args[0], torch.nn.Module):
+            if not isinstance(args[0], supported_types):
                 raise TypeError('@auto_fp16 can only be used to decorate the '
-                                'method of nn.Module')
+                                f'method of those classes {supported_types}')
             if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
                 return old_func(*args, **kwargs)
......
...@@ -36,7 +36,7 @@ except ImportError:
         'is_method_overridden', 'has_method'
     ]
 else:
-    from .device_type import IS_MLU_AVAILABLE
+    from .device_type import IS_IPU_AVAILABLE, IS_MLU_AVAILABLE
     from .env import collect_env
     from .hub import load_url
     from .logging import get_logger, print_log
...@@ -74,5 +74,5 @@ else:
         'assert_params_all_zeros', 'check_python_script',
         'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch',
         '_get_cuda_home', 'load_url', 'has_method', 'IS_CUDA_AVAILABLE',
-        'worker_init_fn', 'IS_MLU_AVAILABLE'
+        'worker_init_fn', 'IS_MLU_AVAILABLE', 'IS_IPU_AVAILABLE'
     ]
......
 # Copyright (c) OpenMMLab. All rights reserved.
+def is_ipu_available():
+    try:
+        import poptorch
+        return poptorch.ipuHardwareIsAvailable()
+    except ImportError:
+        return False
+
+
+IS_IPU_AVAILABLE = is_ipu_available()
+
+
 def is_mlu_available():
     try:
         import torch
......
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import numpy as np
import pytest
import torch
from mmcv.parallel.data_container import DataContainer
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from mmcv.device.ipu.hierarchical_data_manager import \
HierarchicalDataManager
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
@skip_no_ipu
def test_HierarchicalData():
# test hierarchical data
hierarchical_data_sample = {
'a': torch.rand(3, 4),
'b': np.random.rand(3, 4),
'c': DataContainer({
'a': torch.rand(3, 4),
'b': 4,
'c': 'd'
}),
'd': 123,
'e': [1, 3, torch.rand(3, 4),
np.random.rand(3, 4)],
'f': {
'a': torch.rand(3, 4),
'b': np.random.rand(3, 4),
'c': [1, 'asd']
}
}
all_tensors = []
all_tensors.append(hierarchical_data_sample['a'])
all_tensors.append(hierarchical_data_sample['c'].data['a'])
all_tensors.append(hierarchical_data_sample['e'][2])
all_tensors.append(hierarchical_data_sample['f']['a'])
all_tensors_id = [id(ele) for ele in all_tensors]
hd = HierarchicalDataManager(logging.getLogger())
hd.record_hierarchical_data(hierarchical_data_sample)
tensors = hd.collect_all_tensors()
for t in tensors:
assert id(t) in all_tensors_id
tensors[0].add_(1)
hd.update_all_tensors(tensors)
data = hd.hierarchical_data
data['c'].data['a'].sub_(1)
hd.record_hierarchical_data(data)
tensors = hd.collect_all_tensors()
for t in tensors:
assert id(t) in all_tensors_id
hd.quick()
with pytest.raises(
AssertionError,
match='original hierarchical data is not torch.tensor'):
hd.record_hierarchical_data(torch.rand(3, 4))
class AuxClass:
pass
with pytest.raises(NotImplementedError, match='not supported datatype:'):
hd.record_hierarchical_data(AuxClass())
with pytest.raises(NotImplementedError, match='not supported datatype:'):
hierarchical_data_sample['a'] = AuxClass()
hd.update_all_tensors(tensors)
with pytest.raises(NotImplementedError, match='not supported datatype:'):
hierarchical_data_sample['a'] = AuxClass()
hd.collect_all_tensors()
with pytest.raises(NotImplementedError, match='not supported datatype:'):
hierarchical_data_sample['a'] = AuxClass()
hd.clean_all_tensors()
hd = HierarchicalDataManager(logging.getLogger())
hd.record_hierarchical_data(hierarchical_data_sample)
hierarchical_data_sample['a'] = torch.rand(3, 4)
with pytest.raises(ValueError, match='all data except torch.Tensor'):
new_hierarchical_data_sample = {
**hierarchical_data_sample, 'b': np.random.rand(3, 4)
}
hd.update_hierarchical_data(new_hierarchical_data_sample)
hd.update_hierarchical_data(new_hierarchical_data_sample, strict=False)
hd.clean_all_tensors()
# test single tensor
single_tensor = torch.rand(3, 4)
hd = HierarchicalDataManager(logging.getLogger())
hd.record_hierarchical_data(single_tensor)
tensors = hd.collect_all_tensors()
assert len(tensors) == 1 and single_tensor in tensors
single_tensor_to_update = [torch.rand(3, 4)]
hd.update_all_tensors(single_tensor_to_update)
new_tensors = hd.collect_all_tensors()
assert new_tensors == single_tensor_to_update
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from torch.utils.data import Dataset
from mmcv.parallel.data_container import DataContainer
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from mmcv.device.ipu import IPUDataLoader, cfg2options
from mmcv.device.ipu.dataloader import collate
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
class ToyDataset(Dataset):
def __getitem__(self, index):
return 111
def __len__(self, ):
return 3
@skip_no_ipu
def test_ipu_dataloader():
# test lazy initialization
dataloader = IPUDataLoader(
ToyDataset(), None, batch_size=256, num_workers=1, mode='async')
options_cfg = {'train_cfg': {}, 'eval_cfg': {}}
ipu_options = cfg2options(options_cfg)
dataloader.init(ipu_options['training'])
# test normal initialization
options_cfg = {'train_cfg': {}, 'eval_cfg': {}}
ipu_options = cfg2options(options_cfg)['training']
dataloader = IPUDataLoader(
ToyDataset(), ipu_options, batch_size=256, num_workers=1, mode='async')
@skip_no_ipu
def test_ipu_collate():
with pytest.raises(TypeError, match='`batch` should be a sequence'):
collate(123)
with pytest.raises(TypeError, match='DataContainer is not supported'):
collate([DataContainer(666)])
data_list = [[1, 2, 3], [2, 3, 4], DataContainer(666)]
batch0 = {
'tensor': torch.rand(3, 4, 5),
'arr': np.random.rand(3, 4, 5, 6),
'data_list': data_list
}
batch1 = {
'tensor': torch.rand(3, 4, 5),
'arr': np.random.rand(3, 4, 5, 6),
'data_list': data_list
}
batch = [batch1, batch0]
results = collate(batch)
assert results['tensor'].shape == (2, 3, 4, 5)
assert results['arr'].shape == (2, 3, 4, 5, 6)
for data in results['data_list']:
for tensor in data:
assert not isinstance(tensor, DataContainer)
assert tensor.shape == (2, )
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os.path as osp
import pytest
import torch
import torch.nn as nn
from mmcv.runner import build_runner
from mmcv.runner.fp16_utils import auto_fp16
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from mmcv.device.ipu.hook_wrapper import IPUFp16OptimizerHook
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
# TODO Once the model training and inference interfaces
# of MMCLS and MMDET are unified,
# construct the model according to the unified standards
class ToyModel(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
self.bn = nn.BatchNorm2d(3)
self.relu = nn.ReLU6()
self.fp16_enabled = False
@auto_fp16(apply_to=('img', ))
def forward(self, img, return_loss=True, **kwargs):
x = self.conv(img)
x = self.bn(x)
x = self.relu(x)
if return_loss:
loss = ((x - kwargs['gt_label'])**2).sum()
return {
'loss': loss,
'loss_list': [loss, loss],
'loss_dict': {
'loss1': loss
}
}
return x
def _parse_losses(self, losses):
return losses['loss'], losses['loss']
def train_step(self, data, optimizer=None, **kwargs):
losses = self(**data)
loss, log_vars = self._parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
return outputs
@skip_no_ipu
def test_ipu_hook_wrapper(tmp_path):
model = ToyModel()
dummy_input = {
'data': {
'img': torch.rand((16, 3, 10, 10)),
'gt_label': torch.rand((16, 3, 10, 10))
}
}
dir_name = 'a_tmp_dir'
working_dir = osp.join(tmp_path, dir_name)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
default_args = dict(
model=model,
work_dir=working_dir,
optimizer=optimizer,
logger=logging.getLogger())
cfg = dict(type='IPUEpochBasedRunner', max_epochs=1)
dummy_runner = build_runner(cfg, default_args=default_args)
# learning policy
lr_config = dict(policy='step', step=[1, 150])
# test optimizer config
optimizer_config = dict(
grad_clip=dict(max_norm=2), detect_anomalous_params=True)
# test building ipu_lr_hook_class
dummy_runner.register_training_hooks(
lr_config=lr_config, optimizer_config=None, timer_config=None)
# test _set_lr()
output = dummy_runner.model.train_step(**dummy_input)
dummy_runner.outputs = output
dummy_runner.call_hook('before_train_epoch')
# test building ipu_optimizer_hook_class
with pytest.raises(
NotImplementedError, match='IPU does not support gradient clip'):
dummy_runner.register_training_hooks(
lr_config=None,
optimizer_config=optimizer_config,
timer_config=None)
# test fp16 optimizer hook
lr_config = dict(policy='step', step=[1, 150])
optimizer_config = dict(grad_clip=dict(max_norm=2))
dummy_runner.hooks.pop(0)
with pytest.raises(NotImplementedError, match='IPU mode does not support'):
optimizer_config = IPUFp16OptimizerHook(
loss_scale='dynamic', distributed=False)
with pytest.raises(NotImplementedError, match='IPU mode supports single'):
optimizer_config = IPUFp16OptimizerHook(
loss_scale={}, distributed=False)
with pytest.raises(ValueError, match='loss_scale should be float'):
optimizer_config = IPUFp16OptimizerHook(
loss_scale=[], distributed=False)
optimizer_config = IPUFp16OptimizerHook(loss_scale=2.0, distributed=False)
dummy_runner.register_training_hooks(
lr_config=lr_config,
optimizer_config=optimizer_config,
timer_config=None)
dummy_runner.call_hook('after_train_iter')
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import numpy as np
import pytest
import torch
import torch.nn as nn
from mmcv.runner.fp16_utils import auto_fp16
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from mmcv.device.ipu import cfg2options, ipu_model_wrapper
from mmcv.device.ipu.utils import compare_ndarray
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
class MyBN(nn.BatchNorm2d):
def forward(self, *args, **kwargs):
result = super().forward(*args, **kwargs)
return result, self.running_mean
# TODO Once the model training and inference interfaces
# of MMCLS and MMDET are unified,
# construct the model according to the unified standards
class ToyModel(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
self.bn = MyBN(3)
self.relu = nn.ReLU6()
self.fp16_enabled = False
@auto_fp16(apply_to=('img', ))
def forward(self, img, return_loss=True, **kwargs):
x = self.conv(img)
x, running_mean = self.bn(x)
x = self.relu(x)
if return_loss:
loss = ((x - kwargs['gt_label'])**2).sum()
return {
'loss': loss,
'loss_list': [loss, loss],
'loss_dict': {
'loss1': loss
}
}
return x
def _parse_losses(self, losses):
return losses['loss'], losses['loss']
def train_step(self, data, optimizer=None, **kwargs):
losses = self(**data)
loss, log_vars = self._parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
return outputs
@skip_no_ipu
def test_build_model():
for execution_strategy in \
['SameAsIpu', 'ShardedExecution', 'error_strategy']:
if execution_strategy == 'error_strategy':
def maybe_catch_error(_error):
return pytest.raises(_error)
else:
class NullContextManager:
def __enter__(self, ):
pass
def __exit__(self, exc_type, exc_value, exc_traceback):
pass
def maybe_catch_error(_error):
return NullContextManager()
with maybe_catch_error(NotImplementedError):
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy=execution_strategy,
Training=dict(gradientAccumulation=8),
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3]),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
model = ToyModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
logger = logging.getLogger()
modules_to_record = None
ipu_model_cfg = dict(
train_split_edges=[dict(layer_to_call='conv', ipu_id=0)],
train_ckpt_nodes=['bn', 'conv'])
fp16_cfg = {'loss_scale': 0.5}
ipu_model = ipu_model_wrapper(
model,
ipu_options,
optimizer,
logger,
modules_to_record=modules_to_record,
ipu_model_cfg=ipu_model_cfg,
fp16_cfg=fp16_cfg)
ipu_model.train()
ipu_model.eval()
ipu_model.train()
def run_model(ipu_options,
fp16_cfg,
modules_to_record,
ipu_model_wrapper_func,
only_eval=False):
model = ToyModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\
if not only_eval else None
logger = logging.getLogger()
ipu_model_cfg = dict(
train_split_edges=[dict(layer_to_call='conv', ipu_id=0)],
train_ckpt_nodes=['bn', 'conv'])
ipu_model = ipu_model_wrapper_func(
model,
ipu_options,
optimizer,
logger,
modules_to_record=modules_to_record,
ipu_model_cfg=ipu_model_cfg,
fp16_cfg=fp16_cfg)
def get_dummy_input(training):
if training:
return {
'data': {
'img': torch.rand((16, 3, 10, 10)),
'gt_label': torch.rand((16, 3, 10, 10))
}
}
else:
return {
'img': torch.rand((16, 3, 10, 10)),
'img_metas': {
'img': torch.rand((16, 3, 10, 10))
},
'return_loss': False
}
if not only_eval:
training = True
ipu_model.train()
for _ in range(3):
dummy_input = get_dummy_input(training)
output = ipu_model.train_step(**dummy_input)
training = False
ipu_model.eval()
for _ in range(3):
dummy_input = get_dummy_input(training)
output = ipu_model(**dummy_input)
return output, ipu_model
@skip_no_ipu
def test_run_model():
    # test that feature alignment does not support gradientAccumulation mode
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
Training=dict(gradientAccumulation=8),
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
modules_to_record = ['bn']
with pytest.raises(AssertionError, match='Feature alignment'):
run_model(ipu_options, None, modules_to_record, ipu_model_wrapper)
    # test that feature alignment does not support multi-replica mode
options_cfg = dict(
randomSeed=888,
replicationFactor=2,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
modules_to_record = ['bn']
with pytest.raises(AssertionError, match='Feature alignment'):
run_model(ipu_options, None, modules_to_record, ipu_model_wrapper)
    # test that feature alignment does not support fp16 mode
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
fp16_cfg = {
'loss_scale': 0.5,
'velocity_accum_type': 'half',
'accum_type': 'half'
}
modules_to_record = ['bn']
with pytest.raises(NotImplementedError):
run_model(ipu_options, fp16_cfg, modules_to_record, ipu_model_wrapper)
# test velocity_accum_type and accum_type
fp16_cfg = {
'loss_scale': 0.5,
'velocity_accum_type': 'float',
'accum_type': 'float'
}
run_model(ipu_options, fp16_cfg, None, ipu_model_wrapper)
# test compile and run
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
modules_to_record = ['bn']
run_model(ipu_options, None, modules_to_record, ipu_model_wrapper)
# test feature alignment
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ))
ipu_options = cfg2options(options_cfg)
modules_to_record = None
run_model(ipu_options, None, modules_to_record, ipu_model_wrapper)
# test inference mode
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
partialsType='half')
ipu_options = cfg2options(options_cfg)
fp16_cfg = {'loss_scale': 0.5}
modules_to_record = None
_, ipu_model = run_model(
ipu_options,
fp16_cfg,
modules_to_record,
ipu_model_wrapper,
only_eval=True)
with pytest.raises(RuntimeError):
ipu_model.train()
with pytest.raises(ValueError):
ipu_model.train(123)
_, ipu_model = run_model(ipu_options, None, modules_to_record,
ipu_model_wrapper)
# test NotImplementedError in __call__
ipu_model.train()
with pytest.raises(NotImplementedError):
ipu_model()
# test parse_losses
with pytest.raises(TypeError):
ipu_model._model.model._parse_losses({'loss': None})
@skip_no_ipu
def test_compare_tensor():
compare_ndarray(np.random.rand(3, 4), np.random.rand(3, 4))
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import os.path as osp
import pytest
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from mmcv.runner import build_runner
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from mmcv.device.ipu import IPUDataLoader, runner
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
# Most of the functions are inherited from EpochBasedRunner and
# IterBasedRunner, so only do incremental testing on the overridden methods.
# Compared with the base runners, the overridden functions are:
# __init__, register_lr_hook, register_optimizer_hook.
# register_lr_hook and register_optimizer_hook are tested in test_runner.py
class OldStyleModel(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
class Model(OldStyleModel):
def train_step(self):
pass
def val_step(self):
pass
class ToyModel(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
self.bn = nn.BatchNorm2d(3)
self.relu = nn.ReLU6()
self.fp16_enabled = False
def forward(self, img, return_loss=True, **kwargs):
x = self.conv(img)
x = self.bn(x)
x = self.relu(x)
if return_loss:
loss = ((x - kwargs['gt_label'])**2).sum()
return {'loss': loss, 'loss1': loss + 1}
return x
def _parse_losses(self, losses):
return losses['loss'], {'loss1': losses['loss']}
def train_step(self, data, optimizer=None, **kwargs):
losses = self(**data)
loss, log_vars = self._parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
return outputs
class ToyDataset(Dataset):
def __getitem__(self, index):
return {
'img': torch.rand((3, 10, 10)),
'gt_label': torch.rand((3, 10, 10))
}
def __len__(self, ):
return 3
@skip_no_ipu
def test_build_runner(tmp_path):
# __init__
dir_name = 'a_tmp_dir'
default_args = dict(
model=Model(),
work_dir=osp.join(tmp_path, dir_name),
logger=logging.getLogger())
cfg = dict(type='IPUEpochBasedRunner', max_epochs=1)
ipu_runner = build_runner(cfg, default_args=default_args)
assert ipu_runner._max_epochs == 1
cfg = dict(type='IPUIterBasedRunner', max_iters=1)
ipu_runner = build_runner(cfg, default_args=default_args)
assert ipu_runner._max_iters == 1
runner.IS_IPU_AVAILABLE = False
cfg = dict(type='IPUIterBasedRunner', max_iters=1)
with pytest.raises(
NotImplementedError,
match='cpu mode on IPURunner is not supported'):
ipu_runner = build_runner(cfg, default_args=default_args)
runner.IS_IPU_AVAILABLE = True
with pytest.raises(ValueError, match='Only one of'):
cfg = dict(type='IPUIterBasedRunner', max_epochs=1, max_iters=1)
ipu_runner = build_runner(cfg, default_args=default_args)
model = ToyModel()
options_cfg = {'train_cfg': {}, 'eval_cfg': {}}
dataloader = IPUDataLoader(ToyDataset(), None, num_workers=1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
cfg = dict(type='IPUIterBasedRunner', max_iters=2, options_cfg=options_cfg)
default_args = dict(
model=model,
optimizer=optimizer,
work_dir=osp.join(tmp_path, dir_name),
logger=logging.getLogger())
ipu_runner = build_runner(cfg, default_args=default_args)
ipu_runner.run([dataloader], [('train', 2)])
ipu_runner.get_options('val')
with pytest.raises(ValueError, match='mode should be train or val'):
ipu_runner.get_options('666')
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import pytest
import torch.nn as nn
import mmcv
from mmcv.utils import IS_IPU_AVAILABLE
if IS_IPU_AVAILABLE:
from poptorch.options import _IExecutionStrategy
from mmcv.device.ipu import cfg2options
from mmcv.device.ipu.utils import (build_from_cfg_with_wrapper,
model_sharding)
skip_no_ipu = pytest.mark.skipif(
not IS_IPU_AVAILABLE, reason='test case under ipu environment')
class ToyModel(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
self.bn = nn.BatchNorm2d(3)
self.relu = nn.ReLU6()
@skip_no_ipu
def test_build_from_cfg():
BACKBONES = mmcv.Registry('backbone')
@BACKBONES.register_module()
class ResNet:
def __init__(self, depth, stages=4):
self.depth = depth
self.stages = stages
@BACKBONES.register_module()
class ResNeXt:
def __init__(self, depth, stages=4):
self.depth = depth
self.stages = stages
cfg = dict(type='ResNet', depth=50)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
assert isinstance(model, ResNet)
assert model.depth == 50 and model.stages == 4
cfg = dict(type='ResNet', depth=50)
model = build_from_cfg_with_wrapper(
cfg, BACKBONES, default_args={'stages': 3})
assert isinstance(model, ResNet)
assert model.depth == 50 and model.stages == 3
cfg = dict(type='ResNeXt', depth=50, stages=3)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
assert isinstance(model, ResNeXt)
assert model.depth == 50 and model.stages == 3
cfg = dict(type=ResNet, depth=50)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
assert isinstance(model, ResNet)
assert model.depth == 50 and model.stages == 4
# type defined using default_args
cfg = dict(depth=50)
model = build_from_cfg_with_wrapper(
cfg, BACKBONES, default_args=dict(type='ResNet'))
assert isinstance(model, ResNet)
assert model.depth == 50 and model.stages == 4
cfg = dict(depth=50)
model = build_from_cfg_with_wrapper(
cfg, BACKBONES, default_args=dict(type=ResNet))
assert isinstance(model, ResNet)
assert model.depth == 50 and model.stages == 4
# not a registry
with pytest.raises(TypeError):
cfg = dict(type='VGG')
model = build_from_cfg_with_wrapper(cfg, 'BACKBONES')
# non-registered class
with pytest.raises(KeyError):
cfg = dict(type='VGG')
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
# default_args must be a dict or None
with pytest.raises(TypeError):
cfg = dict(type='ResNet', depth=50)
model = build_from_cfg_with_wrapper(cfg, BACKBONES, default_args=1)
# cfg['type'] should be a str or class
with pytest.raises(TypeError):
cfg = dict(type=1000)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
# cfg should contain the key "type"
with pytest.raises(KeyError, match='must contain the key "type"'):
cfg = dict(depth=50, stages=4)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
# cfg or default_args should contain the key "type"
with pytest.raises(KeyError, match='must contain the key "type"'):
cfg = dict(depth=50)
model = build_from_cfg_with_wrapper(
cfg, BACKBONES, default_args=dict(stages=4))
# incorrect registry type
with pytest.raises(TypeError):
cfg = dict(type='ResNet', depth=50)
model = build_from_cfg_with_wrapper(cfg, 'BACKBONES')
# incorrect default_args type
with pytest.raises(TypeError):
cfg = dict(type='ResNet', depth=50)
model = build_from_cfg_with_wrapper(cfg, BACKBONES, default_args=0)
# incorrect arguments
with pytest.raises(TypeError):
cfg = dict(type='ResNet', non_existing_arg=50)
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
# cfg not dict
with pytest.raises(TypeError):
cfg = []
model = build_from_cfg_with_wrapper(cfg, BACKBONES)
@skip_no_ipu
def test_cast_to_options():
options_cfg = dict(
randomSeed=888,
enableExecutableCaching='cache_engine',
train_cfg=dict(
executionStrategy='SameAsIpu',
Training=dict(gradientAccumulation=8),
availableMemoryProportion=[0.3, 0.3, 0.3, 0.3],
),
eval_cfg=dict(deviceIterations=1, ),
)
ipu_options = cfg2options(copy.deepcopy(options_cfg))
assert 'training' in ipu_options
assert 'inference' in ipu_options
assert ipu_options['training']._values['random_seed'] == 888
assert ipu_options['training']._values['replication_factor'] == 1
assert ipu_options['training']._values['available_memory_proportion'] == {
0: 0.3,
1: 0.3,
2: 0.3,
3: 0.3
}
assert ipu_options['training']._popart.options[
'cachePath'] == 'cache_engine'
assert isinstance(ipu_options['training']._execution_strategy,
_IExecutionStrategy)
assert ipu_options['inference']._values['device_iterations'] == 1
with pytest.raises(NotImplementedError, match='cfg type'):
_options_cfg = copy.deepcopy(options_cfg)
_options_cfg['randomSeed'] = (1, 3)
cfg2options(_options_cfg)
with pytest.raises(NotImplementedError, match='options_node type'):
_options_cfg = copy.deepcopy(options_cfg)
_options_cfg['train_cfg']['Precision'] = {'autocast_policy': 123}
cfg2options(_options_cfg)
@skip_no_ipu
def test_model_sharding():
model = ToyModel()
split_edges = [dict(layer_to_call='666', ipu_id=0)]
with pytest.raises(RuntimeError, match='split_edges:'):
model_sharding(model, split_edges)
model = ToyModel()
split_edges = [
dict(layer_to_call='conv', ipu_id=0),
dict(layer_to_call=1, ipu_id=0)
]
with pytest.raises(ValueError, match='The same layer is referenced'):
model_sharding(model, split_edges)
model = ToyModel()
split_edges = [dict(layer_to_call='conv', ipu_id=0)]
model_sharding(model, split_edges)