mnist.py 12.3 KB
Newer Older
Tian Qi Chen's avatar
Tian Qi Chen committed
1
from __future__ import print_function
2
from .vision import VisionDataset
3
import warnings
Tian Qi Chen's avatar
Tian Qi Chen committed
4
5
6
from PIL import Image
import os
import os.path
7
import numpy as np
Tian Qi Chen's avatar
Tian Qi Chen committed
8
9
import torch
import codecs
10
from .utils import download_and_extract_archive, extract_archive, makedir_exist_ok
Tian Qi Chen's avatar
Tian Qi Chen committed
11

12

13
class MNIST(VisionDataset):
    """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``MNIST/processed/training.pt``
            and  ``MNIST/processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    # Archives fetched by ``download``; the extracted file name is the URL
    # basename without the ``.gz`` suffix.
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    # File names of the processed splits inside ``processed_folder``.
    training_file = 'training.pt'
    test_file = 'test.pt'
    # Human-readable class names; the list position is the class index.
    classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
               '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']

    @property
    def train_labels(self):
        """Deprecated alias of ``targets``; emits a warning on access."""
        warnings.warn("train_labels has been renamed targets")
        return self.targets

    @property
    def test_labels(self):
        """Deprecated alias of ``targets``; emits a warning on access."""
        warnings.warn("test_labels has been renamed targets")
        return self.targets

    @property
    def train_data(self):
        """Deprecated alias of ``data``; emits a warning on access."""
        warnings.warn("train_data has been renamed data")
        return self.data

    @property
    def test_data(self):
        """Deprecated alias of ``data``; emits a warning on access."""
        warnings.warn("test_data has been renamed data")
        return self.data

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        """Load the requested split, optionally downloading it first.

        Raises:
            RuntimeError: If the processed files are not present (after the
                optional download).
        """
        super(MNIST, self).__init__(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            data_file = self.training_file
        else:
            data_file = self.test_file
        # The processed file holds the ``(images, labels)`` tuple written by ``download``.
        self.data, self.targets = torch.load(os.path.join(self.processed_folder, data_file))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], int(self.targets[index])

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of samples in the loaded split."""
        return len(self.data)

    @property
    def raw_folder(self):
        """Directory for downloaded files: ``<root>/<ClassName>/raw``."""
        return os.path.join(self.root, self.__class__.__name__, 'raw')

    @property
    def processed_folder(self):
        """Directory for processed ``.pt`` files: ``<root>/<ClassName>/processed``."""
        return os.path.join(self.root, self.__class__.__name__, 'processed')

    @property
    def class_to_idx(self):
        """Mapping from each entry of ``classes`` to its index in that list."""
        return {_class: i for i, _class in enumerate(self.classes)}

    def _check_exists(self):
        """Return True only if both the training and the test file exist."""
        return (os.path.exists(os.path.join(self.processed_folder,
                                            self.training_file)) and
                os.path.exists(os.path.join(self.processed_folder,
                                            self.test_file)))

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.urls:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url, download_root=self.raw_folder, filename=filename)

        # process and save as torch files
        print('Processing...')

        # The extracted files are expected directly inside ``raw_folder``.
        training_set = (
            read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

    def extra_repr(self):
        """Return the split description shown in the dataset's repr."""
        # Use plain truthiness, matching how ``__init__`` picks the data file,
        # so a truthy non-bool ``train`` is not reported as "Test".
        return "Split: {}".format("Train" if self.train else "Test")
156

157

158
class FashionMNIST(MNIST):
    """`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``FashionMNIST/processed/training.pt``
            and  ``FashionMNIST/processed/test.pt`` exist (the folder is named after
            the class).
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    # Same four archive names as MNIST, served from the Fashion-MNIST bucket.
    urls = [
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    ]
    # Human-readable class names; the list position is the class index.
    classes = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
               'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
182
183


hysts's avatar
hysts committed
184
185
186
187
class KMNIST(MNIST):
    """`Kuzushiji-MNIST <https://github.com/rois-codh/kmnist>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``KMNIST/processed/training.pt``
            and  ``KMNIST/processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    # Archives fetched by the inherited ``download``.
    urls = [
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz',
    ]
    # Class names; the list position is the class index.
    classes = [
        'o', 'ki', 'su', 'tsu', 'na',
        'ha', 'ma', 'ya', 're', 'wo',
    ]


209
class EMNIST(MNIST):
    """`EMNIST <https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``EMNIST/processed/training_<split>.pt``
            and  ``EMNIST/processed/test_<split>.pt`` exist.
        split (string): The dataset has 6 different splits: ``byclass``, ``bymerge``,
            ``balanced``, ``letters``, ``digits`` and ``mnist``. This argument specifies
            which one to use.
        train (bool, optional): If True, creates dataset from ``training_<split>.pt``,
            otherwise from ``test_<split>.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    # Updated URL from https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist
    url = 'https://cloudstor.aarnet.edu.au/plus/index.php/s/54h3OuGJhFLwAlQ/download'
    # Valid values for the ``split`` argument.
    splits = ('byclass', 'bymerge', 'balanced', 'letters', 'digits', 'mnist')

    def __init__(self, root, split, **kwargs):
        """Select ``split`` and defer the rest of the setup to ``MNIST.__init__``.

        Args:
            root (string): Root directory of the dataset.
            split (string): One of ``splits``.
            **kwargs: Forwarded unchanged to ``MNIST.__init__``.

        Raises:
            ValueError: If ``split`` is not one of ``splits``.
        """
        if split not in self.splits:
            raise ValueError('Split "{}" not found. Valid splits are: {}'.format(
                split, ', '.join(self.splits),
            ))
        self.split = split
        # These must be set before calling the parent constructor, which reads
        # them when checking for and loading the processed files.
        self.training_file = self._training_file(split)
        self.test_file = self._test_file(split)
        super(EMNIST, self).__init__(root, **kwargs)

    @staticmethod
    def _training_file(split):
        """Return the processed training file name for ``split``."""
        return 'training_{}.pt'.format(split)

    @staticmethod
    def _test_file(split):
        """Return the processed test file name for ``split``."""
        return 'test_{}.pt'.format(split)

    def download(self):
        """Download the EMNIST data if it doesn't exist in processed_folder already."""
        import shutil

        # Only the files of the currently selected split are checked here.
        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        print('Downloading and extracting zip archive')
        download_and_extract_archive(self.url, download_root=self.raw_folder, filename="emnist.zip",
                                     remove_finished=True)
        # The extracted archive is expected to contain a ``gzip`` directory
        # holding one ``.gz`` file per split and subset.
        gzip_folder = os.path.join(self.raw_folder, 'gzip')
        for gzip_file in os.listdir(gzip_folder):
            if gzip_file.endswith('.gz'):
                extract_archive(os.path.join(gzip_folder, gzip_file), gzip_folder)

        # process and save as torch files
        # Every split is processed, not only ``self.split``.
        for split in self.splits:
            print('Processing ' + split)
            training_set = (
                read_image_file(os.path.join(gzip_folder, 'emnist-{}-train-images-idx3-ubyte'.format(split))),
                read_label_file(os.path.join(gzip_folder, 'emnist-{}-train-labels-idx1-ubyte'.format(split)))
            )
            test_set = (
                read_image_file(os.path.join(gzip_folder, 'emnist-{}-test-images-idx3-ubyte'.format(split))),
                read_label_file(os.path.join(gzip_folder, 'emnist-{}-test-labels-idx1-ubyte'.format(split)))
            )
            with open(os.path.join(self.processed_folder, self._training_file(split)), 'wb') as f:
                torch.save(training_set, f)
            with open(os.path.join(self.processed_folder, self._test_file(split)), 'wb') as f:
                torch.save(test_set, f)
        # The extracted raw files are no longer needed once every split is saved.
        shutil.rmtree(gzip_folder)

        print('Done!')


def get_int(b):
    """Interpret the byte string ``b`` as an unsigned integer, most significant byte first."""
    # Hex-encoding keeps the byte order, so parsing the digits in base 16
    # yields the big-endian value.
    hex_digits = codecs.encode(b, 'hex')
    return int(hex_digits, 16)
Tian Qi Chen's avatar
Tian Qi Chen committed
291

292

Tian Qi Chen's avatar
Tian Qi Chen committed
293
294
295
296
297
def read_label_file(path):
    """Read a label file (magic number 2049) into a 1-D int64 tensor.

    The file starts with two 4-byte integers decoded by ``get_int``: the
    magic number and the number of labels. Every following byte is one label.

    Args:
        path (string): Path of the uncompressed label file.

    Returns:
        Tensor: Labels as a ``long`` tensor with ``length`` elements.

    Raises:
        AssertionError: If the magic number is not 2049.
    """
    with open(path, 'rb') as f:
        data = f.read()
        assert get_int(data[:4]) == 2049
        length = get_int(data[4:8])
        # Skip the 8-byte header; the remainder is one uint8 per label.
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        return torch.from_numpy(parsed).view(length).long()
Tian Qi Chen's avatar
Tian Qi Chen committed
300

301

Tian Qi Chen's avatar
Tian Qi Chen committed
302
303
304
305
306
307
308
def read_image_file(path):
    """Read an image file (magic number 2051) into a uint8 tensor.

    The file starts with four 4-byte integers decoded by ``get_int``: the
    magic number, the number of images, the row count and the column count.
    Every following byte is one pixel.

    Args:
        path (string): Path of the uncompressed image file.

    Returns:
        Tensor: Pixels as a uint8 tensor of shape ``(length, num_rows, num_cols)``.

    Raises:
        AssertionError: If the magic number is not 2051.
    """
    with open(path, 'rb') as f:
        data = f.read()
        assert get_int(data[:4]) == 2051
        length = get_int(data[4:8])
        num_rows = get_int(data[8:12])
        num_cols = get_int(data[12:16])
        # np.frombuffer over ``bytes`` yields a read-only view of immutable
        # memory. Copy it so the returned tensor owns writable storage
        # instead of aliasing the file contents.
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16).copy()
        return torch.from_numpy(parsed).view(length, num_rows, num_cols)