lfb.py

# Copyright (c) OpenMMLab. All rights reserved.
import io
import os.path as osp
import warnings

import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info

try:
    import lmdb
    lmdb_imported = True
except (ImportError, ModuleNotFoundError):
    lmdb_imported = False


class LFB:
    """Long-Term Feature Bank (LFB).

    LFB is proposed in `Long-Term Feature Banks for Detailed Video
    Understanding <https://arxiv.org/abs/1812.05038>`_

    The ROI features of videos are stored in the feature bank. The feature bank
    was generated by inferring with a lfb infer config.

    Formally, LFB is a Dict whose keys are video IDs and its values are also
    Dicts whose keys are timestamps in seconds. Example of LFB:

    .. code-block:: Python
        {
            '0f39OWEqJ24': {
                901: tensor([[ 1.2760,  1.1965,  ...,  0.0061, -0.0639],
                    [-0.6320,  0.3794,  ..., -1.2768,  0.5684],
                    [ 0.2535,  1.0049,  ...,  0.4906,  1.2555],
                    [-0.5838,  0.8549,  ..., -2.1736,  0.4162]]),
                ...
                1705: tensor([[-1.0169, -1.1293,  ...,  0.6793, -2.0540],
                    [ 1.2436, -0.4555,  ...,  0.2281, -0.8219],
                    [ 0.2815, -0.0547,  ..., -0.4199,  0.5157]]),
                ...
            },
            'xmqSaQPzL1E': {
                ...
            },
            ...
        }

    Args:
        lfb_prefix_path (str): The storage path of lfb.
        max_num_sampled_feat (int): The max number of sampled features.
            Default: 5.
        window_size (int): Window size of sampling long term feature.
            Default: 60.
        lfb_channels (int): Number of the channels of the features stored
            in LFB. Default: 2048.
        dataset_modes (tuple[str] | str): Load LFB of datasets with different
            modes, such as training, validation, testing datasets. If you don't
            do cross validation during training, just load the training dataset
            i.e. setting `dataset_modes = ('train')`.
            Default: ('train', 'val').
        device (str): Where to load lfb. Choices are 'gpu', 'cpu' and 'lmdb'.
            A 1.65GB half-precision ava lfb (including training and validation)
            occupies about 2GB GPU memory. Default: 'gpu'.
        lmdb_map_size (int): Map size of lmdb. Default: 4e9.
        construct_lmdb (bool): Whether to construct lmdb. If you have
            constructed lmdb of lfb, you can set to False to skip the
            construction. Default: True.
    """

    def __init__(self,
                 lfb_prefix_path,
                 max_num_sampled_feat=5,
                 window_size=60,
                 lfb_channels=2048,
                 dataset_modes=('train', 'val'),
                 device='gpu',
                 lmdb_map_size=4e9,
                 construct_lmdb=True):
        if not osp.exists(lfb_prefix_path):
            raise ValueError(
                f'lfb prefix path {lfb_prefix_path} does not exist!')
        self.lfb_prefix_path = lfb_prefix_path
        self.max_num_sampled_feat = max_num_sampled_feat
        self.window_size = window_size
        self.lfb_channels = lfb_channels
        if not isinstance(dataset_modes, tuple):
            assert isinstance(dataset_modes, str)
            dataset_modes = (dataset_modes, )
        self.dataset_modes = dataset_modes
        self.device = device

        rank, world_size = get_dist_info()

        # Loading LFB
        if self.device == 'gpu':
            self.load_lfb(f'cuda:{rank}')
        elif self.device == 'cpu':
            if world_size > 1:
                warnings.warn(
                    'If distributed training is used with multi-GPUs, lfb '
                    'will be loaded multiple times on RAM. In this case, '
                    "'lmdb' is recommended.", UserWarning)
            self.load_lfb('cpu')
        elif self.device == 'lmdb':
            assert lmdb_imported, (
                'Please install `lmdb` to load lfb on lmdb!')
            self.lmdb_map_size = lmdb_map_size
            self.construct_lmdb = construct_lmdb
            self.lfb_lmdb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, 'lmdb'))

            if rank == 0 and self.construct_lmdb:
                print('Constructing LFB lmdb...')
                self.load_lfb_on_lmdb()

            # Synchronizes all processes to make sure lfb lmdb exist.
            if world_size > 1:
                dist.barrier()
            self.lmdb_env = lmdb.open(self.lfb_lmdb_path, readonly=True)
        else:
            raise ValueError("Device must be 'gpu', 'cpu' or 'lmdb', ",
                             f'but get {self.device}.')

    def load_lfb(self, map_location):
        self.lfb = {}
        for dataset_mode in self.dataset_modes:
            lfb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl'))
            print(f'Loading LFB from {lfb_path}...')
            self.lfb.update(torch.load(lfb_path, map_location=map_location))
        print(f'LFB has been loaded on {map_location}.')

    def load_lfb_on_lmdb(self):
        lfb = {}
        for dataset_mode in self.dataset_modes:
            lfb_path = osp.normpath(
                osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl'))
            lfb.update(torch.load(lfb_path, map_location='cpu'))

        lmdb_env = lmdb.open(self.lfb_lmdb_path, map_size=self.lmdb_map_size)
        for key, value in lfb.items():
            txn = lmdb_env.begin(write=True)
            buff = io.BytesIO()
            torch.save(value, buff)
            buff.seek(0)
            txn.put(key.encode(), buff.read())
            txn.commit()
            buff.close()

        print(f'LFB lmdb has been constructed on {self.lfb_lmdb_path}!')

    def sample_long_term_features(self, video_id, timestamp):
        if self.device == 'lmdb':
            with self.lmdb_env.begin(write=False) as txn:
                buf = txn.get(video_id.encode())
                video_features = torch.load(io.BytesIO(buf))
        else:
            video_features = self.lfb[video_id]

        # Sample long term features.
        window_size, K = self.window_size, self.max_num_sampled_feat
        start = timestamp - (window_size // 2)
        lt_feats = torch.zeros(window_size * K, self.lfb_channels)

        for idx, sec in enumerate(range(start, start + window_size)):
            if sec in video_features:
                # `num_feat` is the number of roi features in this second.
                num_feat = len(video_features[sec])
                num_feat_sampled = min(num_feat, K)
                # Sample some roi features randomly.
                random_lfb_indices = np.random.choice(
                    range(num_feat), num_feat_sampled, replace=False)

                for k, rand_idx in enumerate(random_lfb_indices):
                    lt_feats[idx * K + k] = video_features[sec][rand_idx]

        # [window_size * max_num_sampled_feat, lfb_channels]
        return lt_feats

    def __getitem__(self, img_key):
        """Sample long term features like `lfb['0f39OWEqJ24,0902']` where `lfb`
        is a instance of class LFB."""
        video_id, timestamp = img_key.split(',')
        return self.sample_long_term_features(video_id, int(timestamp))

    def __len__(self):
        """The number of videos whose ROI features are stored in LFB."""
        return len(self.lfb)