ssn_dataset.py

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp
import warnings
from collections import OrderedDict

import mmcv
import numpy as np
from torch.nn.modules.utils import _pair

from ..core import softmax
from ..localization import (eval_ap, load_localize_proposal_file,
                            perform_regression, temporal_iou, temporal_nms)
from ..utils import get_root_logger
from .base import BaseDataset
from .builder import DATASETS


class SSNInstance:
    """Proposal instance of SSN.

    Args:
        start_frame (int): Index of the proposal's start frame.
        end_frame (int): Index of the proposal's end frame.
        num_video_frames (int): Total frames of the video.
        label (int | None): The category label of the proposal. Default: None.
        best_iou (float): The highest IOU with the groundtruth instance.
            Default: 0.
        overlap_self (float): Percent of the proposal's own span contained
            in a groundtruth instance. Default: 0.
    """

    def __init__(self,
                 start_frame,
                 end_frame,
                 num_video_frames,
                 label=None,
                 best_iou=0,
                 overlap_self=0):
        self.start_frame = start_frame
        self.end_frame = min(end_frame, num_video_frames)
        self.num_video_frames = num_video_frames
        self.label = label if label is not None else -1
        self.coverage = (end_frame - start_frame) / num_video_frames
        self.best_iou = best_iou
        self.overlap_self = overlap_self
        self.loc_reg = None
        self.size_reg = None
        self.regression_targets = [0., 0.]

    def compute_regression_targets(self, gt_list):
        """Compute regression targets of positive proposals.

        Args:
            gt_list (list): The list of groundtruth instances.
        """
        # Find the groundtruth instance with the highest IOU.
        ious = [
            temporal_iou(self.start_frame, self.end_frame, gt.start_frame,
                         gt.end_frame) for gt in gt_list
        ]
        best_gt = gt_list[np.argmax(ious)]

        # interval: [start_frame, end_frame)
        proposal_center = (self.start_frame + self.end_frame - 1) / 2
        gt_center = (best_gt.start_frame + best_gt.end_frame - 1) / 2
        proposal_size = self.end_frame - self.start_frame
        gt_size = best_gt.end_frame - best_gt.start_frame

        # Get regression targets:
        # (1). Localization regression target:
        #     center shift proportional to the proposal duration
        # (2). Duration/Size regression target:
        #     logarithm of the groundtruth duration over proposal duration

        self.loc_reg = (gt_center - proposal_center) / proposal_size
        self.size_reg = np.log(gt_size / proposal_size)
        self.regression_targets = ([self.loc_reg, self.size_reg]
                                   if self.loc_reg is not None else [0., 0.])


@DATASETS.register_module()
class SSNDataset(BaseDataset):
    """Proposal frame dataset for Structured Segment Networks.

    Based on proposal information, the dataset loads raw frames and applies
    specified transforms to return a dict containing the frame tensors and
    other information.

    The ann_file is a text file with multiple lines and each
    video's information takes up several lines. This file can be a normalized
    file with percent or standard file with specific frame indexes. If the file
    is a normalized file, it will be converted into a standard file first.

    Template information of a video in a standard file:
    .. code-block:: txt
        # index
        video_id
        num_frames
        fps
        num_gts
        label, start_frame, end_frame
        label, start_frame, end_frame
        ...
        num_proposals
        label, best_iou, overlap_self, start_frame, end_frame
        label, best_iou, overlap_self, start_frame, end_frame
        ...

    Example of a standard annotation file:
    .. code-block:: txt
        # 0
        video_validation_0000202
        5666
        1
        3
        8 130 185
        8 832 1136
        8 1303 1381
        5
        8 0.0620 0.0620 790 5671
        8 0.1656 0.1656 790 2619
        8 0.0833 0.0833 3945 5671
        8 0.0960 0.0960 4173 5671
        8 0.0614 0.0614 3327 5671

    Args:
        ann_file (str): Path to the annotation file.
        pipeline (list[dict | callable]): A sequence of data transforms.
        train_cfg (dict): Config for training.
        test_cfg (dict): Config for testing.
        data_prefix (str): Path to a directory where videos are held.
        test_mode (bool): Store True when building test or validation dataset.
            Default: False.
        filename_tmpl (str): Template for each filename.
            Default: 'img_{:05}.jpg'.
        start_index (int): Specify a start index for frames in consideration of
            different filename format. Default: 1.
        modality (str): Modality of data. Support 'RGB', 'Flow'.
            Default: 'RGB'.
        video_centric (bool): Whether to sample proposals just from
            this video or sample proposals randomly from the entire dataset.
            Default: True.
        reg_normalize_constants (list): Regression target normalized constants,
            including mean and standard deviation of location and duration.
        body_segments (int): Number of segments in course period.
            Default: 5.
        aug_segments (list[int]): Number of segments in starting and
            ending period. Default: (2, 2).
        aug_ratio (int | float | tuple[int | float]): The ratio of the length
            of augmentation to that of the proposal. Default: (0.5, 0.5).
        clip_len (int): Frames of each sampled output clip.
            Default: 1.
        frame_interval (int): Temporal interval of adjacent sampled frames.
            Default: 1.
        filter_gt (bool): Whether to filter videos with no annotation
            during training. Default: True.
        use_regression (bool): Whether to perform regression. Default: True.
        verbose (bool): Whether to print full information or not.
            Default: False.
    """

    def __init__(self,
                 ann_file,
                 pipeline,
                 train_cfg,
                 test_cfg,
                 data_prefix,
                 test_mode=False,
                 filename_tmpl='img_{:05d}.jpg',
                 start_index=1,
                 modality='RGB',
                 video_centric=True,
                 reg_normalize_constants=None,
                 body_segments=5,
                 aug_segments=(2, 2),
                 aug_ratio=(0.5, 0.5),
                 clip_len=1,
                 frame_interval=1,
                 filter_gt=True,
                 use_regression=True,
                 verbose=False):
        self.logger = get_root_logger()
        super().__init__(
            ann_file,
            pipeline,
            data_prefix=data_prefix,
            test_mode=test_mode,
            start_index=start_index,
            modality=modality)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.assigner = train_cfg.ssn.assigner
        self.sampler = train_cfg.ssn.sampler
        self.evaluater = test_cfg.ssn.evaluater
        self.verbose = verbose
        self.filename_tmpl = filename_tmpl

        if filter_gt or not test_mode:
            valid_inds = [
                i for i, video_info in enumerate(self.video_infos)
                if len(video_info['gts']) > 0
            ]
        self.logger.info(f'{len(valid_inds)} out of {len(self.video_infos)} '
                         f'videos are valid.')
        self.video_infos = [self.video_infos[i] for i in valid_inds]

        # construct three pools:
        # 1. Positive(Foreground)
        # 2. Background
        # 3. Incomplete
        self.positive_pool = []
        self.background_pool = []
        self.incomplete_pool = []
        self.construct_proposal_pools()

        if reg_normalize_constants is None:
            self.reg_norm_consts = self._compute_reg_normalize_constants()
        else:
            self.reg_norm_consts = reg_normalize_constants
        self.video_centric = video_centric
        self.body_segments = body_segments
        self.aug_segments = aug_segments
        self.aug_ratio = _pair(aug_ratio)
        if not mmcv.is_tuple_of(self.aug_ratio, (int, float)):
            raise TypeError(f'aug_ratio should be int, float'
                            f'or tuple of int and float, '
                            f'but got {type(aug_ratio)}')
        assert len(self.aug_ratio) == 2

        total_ratio = (
            self.sampler.positive_ratio + self.sampler.background_ratio +
            self.sampler.incomplete_ratio)
        self.positive_per_video = int(
            self.sampler.num_per_video *
            (self.sampler.positive_ratio / total_ratio))
        self.background_per_video = int(
            self.sampler.num_per_video *
            (self.sampler.background_ratio / total_ratio))
        self.incomplete_per_video = (
            self.sampler.num_per_video - self.positive_per_video -
            self.background_per_video)

        self.test_interval = self.test_cfg.ssn.sampler.test_interval
        # number of consecutive frames
        self.clip_len = clip_len
        # number of steps (sparse sampling for efficiency of io)
        self.frame_interval = frame_interval

        # test mode or not
        self.filter_gt = filter_gt
        self.use_regression = use_regression
        self.test_mode = test_mode

        # yapf: disable
        if self.verbose:
            self.logger.info(f"""
            SSNDataset: proposal file {self.proposal_file} parsed.

            There are {len(self.positive_pool) + len(self.background_pool) +
                len(self.incomplete_pool)} usable proposals from {len(self.video_infos)} videos.
            {len(self.positive_pool)} positive proposals
            {len(self.incomplete_pool)} incomplete proposals
            {len(self.background_pool)} background proposals

            Sample config:
            FG/BG/INCOMP: {self.positive_per_video}/{self.background_per_video}/{self.incomplete_per_video}  # noqa:E501
            Video Centric: {self.video_centric}

            Regression Normalization Constants:
            Location: mean {self.reg_norm_consts[0][0]:.05f} std {self.reg_norm_consts[1][0]:.05f} # noqa: E501
            Duration: mean {self.reg_norm_consts[0][1]:.05f} std {self.reg_norm_consts[1][1]:.05f} # noqa: E501
            """)
        # yapf: enable
        else:
            self.logger.info(
                f'SSNDataset: proposal file {self.proposal_file} parsed.')

    def load_annotations(self):
        """Load annotation file to get video information."""
        video_infos = []
        if 'normalized_' in self.ann_file:
            self.proposal_file = self.ann_file.replace('normalized_', '')
            if not osp.exists(self.proposal_file):
                raise Exception(f'Please refer to `$MMACTION2/tools/data` to'
                                f'denormalize {self.ann_file}.')
        else:
            self.proposal_file = self.ann_file
        proposal_infos = load_localize_proposal_file(self.proposal_file)
        # proposal_info:[video_id, num_frames, gt_list, proposal_list]
        # gt_list member: [label, start_frame, end_frame]
        # proposal_list member: [label, best_iou, overlap_self,
        #                        start_frame, end_frame]
        for proposal_info in proposal_infos:
            if self.data_prefix is not None:
                frame_dir = osp.join(self.data_prefix, proposal_info[0])
            num_frames = int(proposal_info[1])
            # gts:start, end, num_frames, class_label, tIoU=1
            gts = []
            for x in proposal_info[2]:
                if int(x[2]) > int(x[1]) and int(x[1]) < num_frames:
                    ssn_instance = SSNInstance(
                        int(x[1]),
                        int(x[2]),
                        num_frames,
                        label=int(x[0]),
                        best_iou=1.0)
                    gts.append(ssn_instance)
            # proposals:start, end, num_frames, class_label
            # tIoU=best_iou, overlap_self
            proposals = []
            for x in proposal_info[3]:
                if int(x[4]) > int(x[3]) and int(x[3]) < num_frames:
                    ssn_instance = SSNInstance(
                        int(x[3]),
                        int(x[4]),
                        num_frames,
                        label=int(x[0]),
                        best_iou=float(x[1]),
                        overlap_self=float(x[2]))
                    proposals.append(ssn_instance)
            video_infos.append(
                dict(
                    frame_dir=frame_dir,
                    video_id=proposal_info[0],
                    total_frames=num_frames,
                    gts=gts,
                    proposals=proposals))
        return video_infos

    def results_to_detections(self, results, top_k=2000, **kwargs):
        """Convert prediction results into detections.

        Args:
            results (list): Prediction results.
            top_k (int): Number of top results. Default: 2000.

        Returns:
            list: Detection results.
        """
        num_classes = results[0]['activity_scores'].shape[1] - 1
        detections = [dict() for _ in range(num_classes)]

        for idx in range(len(self)):
            video_id = self.video_infos[idx]['video_id']
            relative_proposals = results[idx]['relative_proposal_list']
            if len(relative_proposals[0].shape) == 3:
                relative_proposals = np.squeeze(relative_proposals, 0)

            activity_scores = results[idx]['activity_scores']
            completeness_scores = results[idx]['completeness_scores']
            regression_scores = results[idx]['bbox_preds']
            if regression_scores is None:
                regression_scores = np.zeros(
                    (len(relative_proposals), num_classes, 2),
                    dtype=np.float32)
            regression_scores = regression_scores.reshape((-1, num_classes, 2))

            if top_k <= 0:
                combined_scores = (
                    softmax(activity_scores[:, 1:], dim=1) *
                    np.exp(completeness_scores))
                for i in range(num_classes):
                    center_scores = regression_scores[:, i, 0][:, None]
                    duration_scores = regression_scores[:, i, 1][:, None]
                    detections[i][video_id] = np.concatenate(
                        (relative_proposals, combined_scores[:, i][:, None],
                         center_scores, duration_scores),
                        axis=1)
            else:
                combined_scores = (
                    softmax(activity_scores[:, 1:], dim=1) *
                    np.exp(completeness_scores))
                keep_idx = np.argsort(combined_scores.ravel())[-top_k:]
                for k in keep_idx:
                    class_idx = k % num_classes
                    proposal_idx = k // num_classes
                    new_item = [
                        relative_proposals[proposal_idx, 0],
                        relative_proposals[proposal_idx,
                                           1], combined_scores[proposal_idx,
                                                               class_idx],
                        regression_scores[proposal_idx, class_idx,
                                          0], regression_scores[proposal_idx,
                                                                class_idx, 1]
                    ]
                    if video_id not in detections[class_idx]:
                        detections[class_idx][video_id] = np.array([new_item])
                    else:
                        detections[class_idx][video_id] = np.vstack(
                            [detections[class_idx][video_id], new_item])

        return detections

    def evaluate(self,
                 results,
                 metrics='mAP',
                 metric_options=dict(mAP=dict(eval_dataset='thumos14')),
                 logger=None,
                 **deprecated_kwargs):
        """Evaluation in SSN proposal dataset.

        Args:
            results (list[dict]): Output results.
            metrics (str | sequence[str]): Metrics to be performed.
                Defaults: 'mAP'.
            metric_options (dict): Dict for metric options. Options are
                ``eval_dataset`` for ``mAP``.
                Default: ``dict(mAP=dict(eval_dataset='thumos14'))``.
            logger (logging.Logger | None): Logger for recording.
                Default: None.
            deprecated_kwargs (dict): Used for containing deprecated arguments.
                See 'https://github.com/open-mmlab/mmaction2/pull/286'.

        Returns:
            dict: Evaluation results for evaluation metrics.
        """
        # Protect ``metric_options`` since it uses mutable value as default
        metric_options = copy.deepcopy(metric_options)

        if deprecated_kwargs != {}:
            warnings.warn(
                'Option arguments for metrics has been changed to '
                "`metric_options`, See 'https://github.com/open-mmlab/mmaction2/pull/286' "  # noqa: E501
                'for more details')
            metric_options['mAP'] = dict(metric_options['mAP'],
                                         **deprecated_kwargs)

        if not isinstance(results, list):
            raise TypeError(f'results must be a list, but got {type(results)}')
        assert len(results) == len(self), (
            f'The length of results is not equal to the dataset len: '
            f'{len(results)} != {len(self)}')

        metrics = metrics if isinstance(metrics, (list, tuple)) else [metrics]
        allowed_metrics = ['mAP']
        for metric in metrics:
            if metric not in allowed_metrics:
                raise KeyError(f'metric {metric} is not supported')

        detections = self.results_to_detections(results, **self.evaluater)

        if self.use_regression:
            self.logger.info('Performing location regression')
            for class_idx, _ in enumerate(detections):
                detections[class_idx] = {
                    k: perform_regression(v)
                    for k, v in detections[class_idx].items()
                }
            self.logger.info('Regression finished')

        self.logger.info('Performing NMS')
        for class_idx, _ in enumerate(detections):
            detections[class_idx] = {
                k: temporal_nms(v, self.evaluater.nms)
                for k, v in detections[class_idx].items()
            }
        self.logger.info('NMS finished')

        # get gts
        all_gts = self.get_all_gts()
        for class_idx, _ in enumerate(detections):
            if class_idx not in all_gts:
                all_gts[class_idx] = dict()

        # get predictions
        plain_detections = {}
        for class_idx, _ in enumerate(detections):
            detection_list = []
            for video, dets in detections[class_idx].items():
                detection_list.extend([[video, class_idx] + x[:3]
                                       for x in dets.tolist()])
            plain_detections[class_idx] = detection_list

        eval_results = OrderedDict()
        for metric in metrics:
            if metric == 'mAP':
                eval_dataset = metric_options.setdefault('mAP', {}).setdefault(
                    'eval_dataset', 'thumos14')
                if eval_dataset == 'thumos14':
                    iou_range = np.arange(0.1, 1.0, .1)
                    ap_values = eval_ap(plain_detections, all_gts, iou_range)
                    map_ious = ap_values.mean(axis=0)
                    self.logger.info('Evaluation finished')

                    for iou, map_iou in zip(iou_range, map_ious):
                        eval_results[f'mAP@{iou:.02f}'] = map_iou

        return eval_results

    def construct_proposal_pools(self):
        """Construct positive proposal pool, incomplete proposal pool and
        background proposal pool of the entire dataset."""
        for video_info in self.video_infos:
            positives = self.get_positives(
                video_info['gts'], video_info['proposals'],
                self.assigner.positive_iou_threshold,
                self.sampler.add_gt_as_proposals)
            self.positive_pool.extend([(video_info['video_id'], proposal)
                                       for proposal in positives])

            incompletes, backgrounds = self.get_negatives(
                video_info['proposals'],
                self.assigner.incomplete_iou_threshold,
                self.assigner.background_iou_threshold,
                self.assigner.background_coverage_threshold,
                self.assigner.incomplete_overlap_threshold)
            self.incomplete_pool.extend([(video_info['video_id'], proposal)
                                         for proposal in incompletes])
            self.background_pool.extend([video_info['video_id'], proposal]
                                        for proposal in backgrounds)

    def get_all_gts(self):
        """Fetch groundtruth instances of the entire dataset."""
        gts = {}
        for video_info in self.video_infos:
            video = video_info['video_id']
            for gt in video_info['gts']:
                class_idx = gt.label - 1
                # gt_info: [relative_start, relative_end]
                gt_info = [
                    gt.start_frame / video_info['total_frames'],
                    gt.end_frame / video_info['total_frames']
                ]
                gts.setdefault(class_idx, {}).setdefault(video,
                                                         []).append(gt_info)

        return gts

    @staticmethod
    def get_positives(gts, proposals, positive_threshold, with_gt=True):
        """Get positive/foreground proposals.

        Args:
            gts (list): List of groundtruth instances(:obj:`SSNInstance`).
            proposals (list): List of proposal instances(:obj:`SSNInstance`).
            positive_threshold (float): Minimum threshold of overlap of
                positive/foreground proposals and groundtruths.
            with_gt (bool): Whether to include groundtruth instances in
                positive proposals. Default: True.

        Returns:
            list[:obj:`SSNInstance`]: (positives), positives is a list
                comprised of positive proposal instances.
        """
        positives = [
            proposal for proposal in proposals
            if proposal.best_iou > positive_threshold
        ]

        if with_gt:
            positives.extend(gts)

        for proposal in positives:
            proposal.compute_regression_targets(gts)

        return positives

    @staticmethod
    def get_negatives(proposals,
                      incomplete_iou_threshold,
                      background_iou_threshold,
                      background_coverage_threshold=0.01,
                      incomplete_overlap_threshold=0.7):
        """Get negative proposals, including incomplete proposals and
        background proposals.

        Args:
            proposals (list): List of proposal instances(:obj:`SSNInstance`).
            incomplete_iou_threshold (float): Maximum threshold of overlap
                of incomplete proposals and groundtruths.
            background_iou_threshold (float): Maximum threshold of overlap
                of background proposals and groundtruths.
            background_coverage_threshold (float): Minimum coverage
                of background proposals in video duration. Default: 0.01.
            incomplete_overlap_threshold (float): Minimum percent of incomplete
                proposals' own span contained in a groundtruth instance.
                Default: 0.7.

        Returns:
            list[:obj:`SSNInstance`]: (incompletes, backgrounds), incompletes
                and backgrounds are lists comprised of incomplete
                proposal instances and background proposal instances.
        """
        incompletes = []
        backgrounds = []

        for proposal in proposals:
            if (proposal.best_iou < incomplete_iou_threshold
                    and proposal.overlap_self > incomplete_overlap_threshold):
                incompletes.append(proposal)
            elif (proposal.best_iou < background_iou_threshold
                  and proposal.coverage > background_coverage_threshold):
                backgrounds.append(proposal)

        return incompletes, backgrounds

    def _video_centric_sampling(self, record):
        """Sample proposals from the this video instance.

        Args:
            record (dict): Information of the video instance(video_info[idx]).
                key: frame_dir, video_id, total_frames,
                gts: List of groundtruth instances(:obj:`SSNInstance`).
                proposals: List of proposal instances(:obj:`SSNInstance`).
        """
        positives = self.get_positives(record['gts'], record['proposals'],
                                       self.assigner.positive_iou_threshold,
                                       self.sampler.add_gt_as_proposals)
        incompletes, backgrounds = self.get_negatives(
            record['proposals'], self.assigner.incomplete_iou_threshold,
            self.assigner.background_iou_threshold,
            self.assigner.background_coverage_threshold,
            self.assigner.incomplete_overlap_threshold)

        def sample_video_proposals(proposal_type, video_id, video_pool,
                                   num_requested_proposals, dataset_pool):
            """This method will sample proposals from the this video pool. If
            the video pool is empty, it will fetch from the dataset pool
            (collect proposal of the entire dataset).

            Args:
                proposal_type (int): Type id of proposal.
                    Positive/Foreground: 0
                    Negative:
                        Incomplete: 1
                        Background: 2
                video_id (str): Name of the video.
                video_pool (list): Pool comprised of proposals in this video.
                num_requested_proposals (int): Number of proposals
                    to be sampled.
                dataset_pool (list): Proposals of the entire dataset.

            Returns:
                list[(str, :obj:`SSNInstance`), int]:
                    video_id (str): Name of the video.
                    :obj:`SSNInstance`: Instance of class SSNInstance.
                    proposal_type (int): Type of proposal.
            """

            if len(video_pool) == 0:
                idx = np.random.choice(
                    len(dataset_pool), num_requested_proposals, replace=False)
                return [(dataset_pool[x], proposal_type) for x in idx]

            replicate = len(video_pool) < num_requested_proposals
            idx = np.random.choice(
                len(video_pool), num_requested_proposals, replace=replicate)
            return [((video_id, video_pool[x]), proposal_type) for x in idx]

        out_proposals = []
        out_proposals.extend(
            sample_video_proposals(0, record['video_id'], positives,
                                   self.positive_per_video,
                                   self.positive_pool))
        out_proposals.extend(
            sample_video_proposals(1, record['video_id'], incompletes,
                                   self.incomplete_per_video,
                                   self.incomplete_pool))
        out_proposals.extend(
            sample_video_proposals(2, record['video_id'], backgrounds,
                                   self.background_per_video,
                                   self.background_pool))

        return out_proposals

    def _random_sampling(self):
        """Randomly sample proposals from the entire dataset."""
        out_proposals = []

        positive_idx = np.random.choice(
            len(self.positive_pool),
            self.positive_per_video,
            replace=len(self.positive_pool) < self.positive_per_video)
        out_proposals.extend([(self.positive_pool[x], 0)
                              for x in positive_idx])
        incomplete_idx = np.random.choice(
            len(self.incomplete_pool),
            self.incomplete_per_video,
            replace=len(self.incomplete_pool) < self.incomplete_per_video)
        out_proposals.extend([(self.incomplete_pool[x], 1)
                              for x in incomplete_idx])
        background_idx = np.random.choice(
            len(self.background_pool),
            self.background_per_video,
            replace=len(self.background_pool) < self.background_per_video)
        out_proposals.extend([(self.background_pool[x], 2)
                              for x in background_idx])

        return out_proposals

    def _get_stage(self, proposal, num_frames):
        """Fetch the scale factor of starting and ending stage and get the
        stage split.

        Args:
            proposal (:obj:`SSNInstance`): Proposal instance.
            num_frames (int): Total frames of the video.

        Returns:
            tuple[float, float, list]: (starting_scale_factor,
                ending_scale_factor, stage_split), starting_scale_factor is
                the ratio of the effective sampling length to augment length
                in starting stage, ending_scale_factor is the ratio of the
                effective sampling length to augment length in ending stage,
                stage_split is  ending segment id of starting, course and
                ending stage.
        """
        # proposal interval: [start_frame, end_frame)
        start_frame = proposal.start_frame
        end_frame = proposal.end_frame
        ori_clip_len = self.clip_len * self.frame_interval

        duration = end_frame - start_frame
        assert duration != 0

        valid_starting = max(0,
                             start_frame - int(duration * self.aug_ratio[0]))
        valid_ending = min(num_frames - ori_clip_len + 1,
                           end_frame - 1 + int(duration * self.aug_ratio[1]))

        valid_starting_length = start_frame - valid_starting - ori_clip_len
        valid_ending_length = (valid_ending - end_frame + 1) - ori_clip_len

        starting_scale_factor = ((valid_starting_length + ori_clip_len + 1) /
                                 (duration * self.aug_ratio[0]))
        ending_scale_factor = (valid_ending_length + ori_clip_len + 1) / (
            duration * self.aug_ratio[1])

        aug_start, aug_end = self.aug_segments
        stage_split = [
            aug_start, aug_start + self.body_segments,
            aug_start + self.body_segments + aug_end
        ]

        return starting_scale_factor, ending_scale_factor, stage_split

    def _compute_reg_normalize_constants(self):
        """Compute regression target normalized constants."""
        if self.verbose:
            self.logger.info('Compute regression target normalized constants')
        targets = []
        for video_info in self.video_infos:
            positives = self.get_positives(
                video_info['gts'], video_info['proposals'],
                self.assigner.positive_iou_threshold, False)
            for positive in positives:
                targets.append(list(positive.regression_targets))

        return np.array((np.mean(targets, axis=0), np.std(targets, axis=0)))

    def prepare_train_frames(self, idx):
        """Prepare the frames for training given the index."""
        results = copy.deepcopy(self.video_infos[idx])
        results['filename_tmpl'] = self.filename_tmpl
        results['modality'] = self.modality
        results['start_index'] = self.start_index

        if self.video_centric:
            # yapf: disable
            results['out_proposals'] = self._video_centric_sampling(self.video_infos[idx])  # noqa: E501
            # yapf: enable
        else:
            results['out_proposals'] = self._random_sampling()

        out_proposal_scale_factor = []
        out_proposal_type = []
        out_proposal_labels = []
        out_proposal_reg_targets = []

        for _, proposal in enumerate(results['out_proposals']):
            # proposal: [(video_id, SSNInstance), proposal_type]
            num_frames = proposal[0][1].num_video_frames

            (starting_scale_factor, ending_scale_factor,
             _) = self._get_stage(proposal[0][1], num_frames)

            # proposal[1]: Type id of proposal.
            # Positive/Foreground: 0
            # Negative:
            #   Incomplete: 1
            #   Background: 2

            # Positivte/Foreground proposal
            if proposal[1] == 0:
                label = proposal[0][1].label
            # Incomplete proposal
            elif proposal[1] == 1:
                label = proposal[0][1].label
            # Background proposal
            elif proposal[1] == 2:
                label = 0
            else:
                raise ValueError(f'Proposal type should be 0, 1, or 2,'
                                 f'but got {proposal[1]}')
            out_proposal_scale_factor.append(
                [starting_scale_factor, ending_scale_factor])
            if not isinstance(label, int):
                raise TypeError(f'proposal_label must be an int,'
                                f'but got {type(label)}')
            out_proposal_labels.append(label)
            out_proposal_type.append(proposal[1])

            reg_targets = proposal[0][1].regression_targets
            if proposal[1] == 0:
                # Normalize regression targets of positive proposals.
                reg_targets = ((reg_targets[0] - self.reg_norm_consts[0][0]) /
                               self.reg_norm_consts[1][0],
                               (reg_targets[1] - self.reg_norm_consts[0][1]) /
                               self.reg_norm_consts[1][1])
            out_proposal_reg_targets.append(reg_targets)

        results['reg_targets'] = np.array(
            out_proposal_reg_targets, dtype=np.float32)
        results['proposal_scale_factor'] = np.array(
            out_proposal_scale_factor, dtype=np.float32)
        results['proposal_labels'] = np.array(out_proposal_labels)
        results['proposal_type'] = np.array(out_proposal_type)

        return self.pipeline(results)

    def prepare_test_frames(self, idx):
        """Prepare the frames for testing given the index."""
        results = copy.deepcopy(self.video_infos[idx])
        results['filename_tmpl'] = self.filename_tmpl
        results['modality'] = self.modality
        results['start_index'] = self.start_index

        proposals = results['proposals']
        num_frames = results['total_frames']
        ori_clip_len = self.clip_len * self.frame_interval
        frame_ticks = np.arange(
            0, num_frames - ori_clip_len, self.test_interval, dtype=int) + 1

        num_sampled_frames = len(frame_ticks)

        if len(proposals) == 0:
            proposals.append(SSNInstance(0, num_frames - 1, num_frames))

        relative_proposal_list = []
        proposal_tick_list = []
        scale_factor_list = []

        for proposal in proposals:
            relative_proposal = (proposal.start_frame / num_frames,
                                 proposal.end_frame / num_frames)
            relative_duration = relative_proposal[1] - relative_proposal[0]
            relative_starting_duration = relative_duration * self.aug_ratio[0]
            relative_ending_duration = relative_duration * self.aug_ratio[1]
            relative_starting = (
                relative_proposal[0] - relative_starting_duration)
            relative_ending = relative_proposal[1] + relative_ending_duration

            real_relative_starting = max(0.0, relative_starting)
            real_relative_ending = min(1.0, relative_ending)

            starting_scale_factor = (
                (relative_proposal[0] - real_relative_starting) /
                relative_starting_duration)
            ending_scale_factor = (
                (real_relative_ending - relative_proposal[1]) /
                relative_ending_duration)

            proposal_ranges = (real_relative_starting, *relative_proposal,
                               real_relative_ending)
            proposal_ticks = (np.array(proposal_ranges) *
                              num_sampled_frames).astype(np.int32)

            relative_proposal_list.append(relative_proposal)
            proposal_tick_list.append(proposal_ticks)
            scale_factor_list.append(
                (starting_scale_factor, ending_scale_factor))

        results['relative_proposal_list'] = np.array(
            relative_proposal_list, dtype=np.float32)
        results['scale_factor_list'] = np.array(
            scale_factor_list, dtype=np.float32)
        results['proposal_tick_list'] = np.array(
            proposal_tick_list, dtype=np.int32)
        results['reg_norm_consts'] = self.reg_norm_consts

        return self.pipeline(results)