audio_visual_dataset.py

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp

from .builder import DATASETS
from .rawframe_dataset import RawframeDataset


@DATASETS.register_module()
class AudioVisualDataset(RawframeDataset):
    """Dataset that reads both audio and visual data, supporting both rawframes
    and videos. The annotation file is same as that of the rawframe dataset,
    such as:

    .. code-block:: txt

        some/directory-1 163 1
        some/directory-2 122 1
        some/directory-3 258 2
        some/directory-4 234 2
        some/directory-5 295 3
        some/directory-6 121 3

    Args:
        ann_file (str): Path to the annotation file.
        pipeline (list[dict | callable]): A sequence of data transforms.
        audio_prefix (str): Directory of the audio files.
        kwargs (dict): Other keyword args for `RawframeDataset`. `video_prefix`
            is also allowed if pipeline is designed for videos.
    """

    def __init__(self, ann_file, pipeline, audio_prefix, **kwargs):
        self.audio_prefix = audio_prefix
        self.video_prefix = kwargs.pop('video_prefix', None)
        self.data_prefix = kwargs.get('data_prefix', None)
        super().__init__(ann_file, pipeline, **kwargs)

    def load_annotations(self):
        video_infos = []
        with open(self.ann_file, 'r') as fin:
            for line in fin:
                line_split = line.strip().split()
                video_info = {}
                idx = 0
                # idx for frame_dir
                frame_dir = line_split[idx]
                if self.audio_prefix is not None:
                    audio_path = osp.join(self.audio_prefix,
                                          frame_dir + '.npy')
                    video_info['audio_path'] = audio_path
                if self.video_prefix:
                    video_path = osp.join(self.video_prefix,
                                          frame_dir + '.mp4')
                    video_info['filename'] = video_path
                if self.data_prefix is not None:
                    frame_dir = osp.join(self.data_prefix, frame_dir)
                    video_info['frame_dir'] = frame_dir
                idx += 1
                if self.with_offset:
                    # idx for offset and total_frames
                    video_info['offset'] = int(line_split[idx])
                    video_info['total_frames'] = int(line_split[idx + 1])
                    idx += 2
                else:
                    # idx for total_frames
                    video_info['total_frames'] = int(line_split[idx])
                    idx += 1
                # idx for label[s]
                label = [int(x) for x in line_split[idx:]]
                assert len(label) != 0, f'missing label in line: {line}'
                if self.multi_class:
                    assert self.num_classes is not None
                    video_info['label'] = label
                else:
                    assert len(label) == 1
                    video_info['label'] = label[0]
                video_infos.append(video_info)
        return video_infos