base.py 1.97 KB
Newer Older
Ruilong Li's avatar
Ruilong Li committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Copyright (c) Meta Platforms, Inc. and affiliates.
import math

import torch


class CachedIterDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that can cache a fetched sample and reuse it for
    several consecutive batches during training.

    Subclasses must implement :meth:`fetch_data`, :meth:`preprocess`, and
    :meth:`__len__`.  Caching amortizes expensive fetches (e.g. disk / decode
    work): in training mode the same fetched data is re-preprocessed (with
    fresh randomness) up to ``cache_n_repeat`` times before the next fetch.

    Args:
        training: If True, iterate forever over a reshuffled index
            permutation and enable the fetch cache.  If False, iterate the
            index range once, in order, with no cache reuse.
        cache_n_repeat: Number of times a fetched sample may be served from
            the cache before refetching.  ``0`` disables cache reuse.
    """

    def __init__(
        self,
        training: bool = False,
        cache_n_repeat: int = 0,
    ):
        self.training = training
        self.cache_n_repeat = cache_n_repeat

        # Last fetched sample and how many times it has been served.
        self._cache = None
        self._n_repeat = 0

    def fetch_data(self, index):
        """Fetch the data (it maybe cached for multiple batches)."""
        raise NotImplementedError

    def preprocess(self, data):
        """Process the fetched / cached data with randomness."""
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        """Yield preprocessed samples, sharding the index range per worker.

        Each DataLoader worker handles a contiguous chunk of the index range
        so no two workers produce the same sample.  In training mode this
        iterator never terminates: it reshuffles its chunk each pass.
        """
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:  # single-process data loading, return the full iterator
            iter_start = 0
            iter_end = len(self)
        else:  # in a worker process
            # split workload: ceil so the union of chunks covers every index
            per_worker = int(math.ceil(len(self) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            iter_start = worker_id * per_worker
            iter_end = min(iter_start + per_worker, len(self))
        if self.training:
            while True:
                # .tolist() yields plain Python ints, matching the eval
                # branch below (randperm alone would yield 0-dim tensors).
                perm = iter_start + torch.randperm(iter_end - iter_start)
                for index in perm.tolist():
                    yield self[index]
        else:
            for index in range(iter_start, iter_end):
                yield self[index]

    def __getitem__(self, index):
        """Return a preprocessed sample, serving from cache when allowed.

        During training, a previously fetched sample is reused while it has
        been served fewer than ``cache_n_repeat`` times; otherwise (and
        always in eval mode) fresh data is fetched.
        """
        if (
            self.training
            and (self._cache is not None)
            and (self._n_repeat < self.cache_n_repeat)
        ):
            data = self._cache
            self._n_repeat += 1
        else:
            data = self.fetch_data(index)
            self._cache = data
            self._n_repeat = 1
        return self.preprocess(data)

    @classmethod
    def collate_fn(cls, batch):
        """Collate for DataLoader(batch_size=1): unwrap the single sample."""
        return batch[0]