from __future__ import print_function
import torch.utils.data as data
from PIL import Image
import os
import os.path
import gzip
import numpy as np
import torch
import codecs
import hashlib
from .utils import download_url, makedir_exist_ok


class MNIST(data.Dataset):
    """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.

    Args:
        root (string): Root directory of the dataset, where ``MNIST/processed/mnist-training.pt``
            and ``MNIST/processed/mnist-test.pt`` exist.
        train (bool, optional): If True, creates the dataset from ``mnist-training.pt``,
            otherwise from ``mnist-test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in the root directory. If the dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]

    md5s = {
        't10k-images-idx3-ubyte.gz': '9fb629c4189551a2d022fa330f9573f3',
        't10k-labels-idx1-ubyte.gz': 'ec29112dd5afa0611ce80d1b7f02629c',
        'train-images-idx3-ubyte.gz': 'f68b3c2dcbeaaa9fbdd348bbdeb94873',
        'train-labels-idx1-ubyte.gz': 'd53e105ee54ea40749a09fcbcd1e9432',
    }

    training_file = 'mnist-training.pt'
    test_file = 'mnist-test.pt'
    classes = ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
               '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found. '
                               'You can use download=True to download it')

        def load_data(filename):
            loaded_data = torch.load(
                os.path.join(self.processed_folder, filename))
            if len(loaded_data) == 2:
                # legacy format: (data, labels)
                return loaded_data
            else:
                # tagged format: (class name, data, labels)
                clsname, data, labels = loaded_data
                if clsname != type(self).__name__:
                    raise RuntimeError("Expected {} data but found {}"
                                       .format(type(self).__name__, clsname))
                return data, labels

        if self.train:
            self.train_data, self.train_labels = load_data(self.training_file)
        else:
            self.test_data, self.test_labels = load_data(self.test_file)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.train_data) if self.train else len(self.test_data)

    @property
    def raw_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'raw')

    @property
    def processed_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'processed')

    @property
    def class_to_idx(self):
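        """Map each class name to its label index, e.g. ``'0 - zero' -> 0``."""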
        return {_class: i for i, _class in enumerate(self.classes)}

    def _check_exists(self):
        return os.path.exists(os.path.join(self.processed_folder, self.training_file)) and \
            os.path.exists(os.path.join(self.processed_folder, self.test_file))

    @staticmethod
    def extract_gzip(gzip_path, remove_finished=False):
        print('Extracting {}'.format(gzip_path))
        with open(gzip_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(gzip_path) as zip_f:
            out_f.write(zip_f.read())
        if remove_finished:
            os.unlink(gzip_path)

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.urls:
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.raw_folder, filename)
            download_url(url, root=self.raw_folder, filename=filename, md5=self.md5s[filename])
            self.extract_gzip(gzip_path=file_path, remove_finished=True)

        # process and save as torch files
        print('Processing...')

        training_set = (
            type(self).__name__,
            read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            type(self).__name__,
            read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        tmp = 'train' if self.train is True else 'test'
        fmt_str += '    Split: {}\n'.format(tmp)
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str


class FashionMNIST(MNIST):
    """`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.

    Args:
        root (string): Root directory of the dataset, where ``FashionMNIST/processed/fashion-mnist-training.pt``
            and ``FashionMNIST/processed/fashion-mnist-test.pt`` exist.
        train (bool, optional): If True, creates the dataset from ``fashion-mnist-training.pt``,
            otherwise from ``fashion-mnist-test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in the root directory. If the dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    training_file = 'fashion-mnist-training.pt'
    test_file = 'fashion-mnist-test.pt'

    urls = [
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    ]

    md5s = {
        't10k-images-idx3-ubyte.gz': 'bef4ecab320f06d8554ea6380940ec79',
        't10k-labels-idx1-ubyte.gz': 'bb300cfdad3c16e7a12a480ee83cd310',
        'train-images-idx3-ubyte.gz': '8d4fb7e6c68d591d4c3dfef9ec88bf0d',
        'train-labels-idx1-ubyte.gz': '25c81989df183df01b3e8a0aad5dffbe',
    }


class EMNIST(MNIST):
    """`EMNIST <https://www.nist.gov/itl/iad/image-group/emnist-dataset/>`_ Dataset.

    Args:
        root (string): Root directory of the dataset, where ``EMNIST/processed/training_<split>.pt``
            and ``EMNIST/processed/test_<split>.pt`` exist.
        split (string): The dataset has 6 different splits: ``byclass``, ``bymerge``,
            ``balanced``, ``letters``, ``digits`` and ``mnist``. This argument specifies
            which one to use.
        train (bool, optional): If True, creates the dataset from the training file,
            otherwise from the test file.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in the root directory. If the dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    url = 'http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip'
    splits = ('byclass', 'bymerge', 'balanced', 'letters', 'digits', 'mnist')

    def __init__(self, root, split, **kwargs):
        if split not in self.splits:
            raise ValueError('Split "{}" not found. Valid splits are: {}'.format(
                split, ', '.join(self.splits),
            ))
        self.split = split
        self.training_file = self._training_file(split)
        self.test_file = self._test_file(split)
        super(EMNIST, self).__init__(root, **kwargs)

    @staticmethod
    def _training_file(split):
        return 'training_{}.pt'.format(split)

    @staticmethod
    def _test_file(split):
        return 'test_{}.pt'.format(split)

    def download(self):
        """Download the EMNIST data if it doesn't exist in processed_folder already."""
        import shutil
        import zipfile

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        filename = self.url.rpartition('/')[2]
        file_path = os.path.join(self.raw_folder, filename)
        download_url(self.url, root=self.raw_folder, filename=filename, md5=None)

        print('Extracting zip archive')
        with zipfile.ZipFile(file_path) as zip_f:
            zip_f.extractall(self.raw_folder)
        os.unlink(file_path)
        gzip_folder = os.path.join(self.raw_folder, 'gzip')
        for gzip_file in os.listdir(gzip_folder):
            if gzip_file.endswith('.gz'):
                self.extract_gzip(gzip_path=os.path.join(gzip_folder, gzip_file))

        # process and save as torch files
        for split in self.splits:
            print('Processing ' + split)
            training_set = (
                read_image_file(os.path.join(gzip_folder, 'emnist-{}-train-images-idx3-ubyte'.format(split))),
                read_label_file(os.path.join(gzip_folder, 'emnist-{}-train-labels-idx1-ubyte'.format(split)))
            )
            test_set = (
                read_image_file(os.path.join(gzip_folder, 'emnist-{}-test-images-idx3-ubyte'.format(split))),
                read_label_file(os.path.join(gzip_folder, 'emnist-{}-test-labels-idx1-ubyte'.format(split)))
            )
            with open(os.path.join(self.processed_folder, self._training_file(split)), 'wb') as f:
                torch.save(training_set, f)
            with open(os.path.join(self.processed_folder, self._test_file(split)), 'wb') as f:
                torch.save(test_set, f)
        shutil.rmtree(gzip_folder)

        print('Done!')


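# The idx files store header fields as 4-byte big-endian integers; ``get_int``
# decodes one such field, e.g. ``get_int(b'\x00\x00\x08\x03') == 2051`` (the
# magic number of an idx3 image file).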
def get_int(b):
    return int(codecs.encode(b, 'hex'), 16)


def read_label_file(path):
    with open(path, 'rb') as f:
        data = f.read()
        assert get_int(data[:4]) == 2049
        length = get_int(data[4:8])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        return torch.from_numpy(parsed).view(length).long()


def read_image_file(path):
    with open(path, 'rb') as f:
        data = f.read()
        assert get_int(data[:4]) == 2051
        length = get_int(data[4:8])
        num_rows = get_int(data[8:12])
        num_cols = get_int(data[12:16])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
        return torch.from_numpy(parsed).view(length, num_rows, num_cols)
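

# A hedged smoke test: if the raw MNIST training files have already been
# downloaded and extracted (the location below is an assumption based on the
# folder properties above), parse them with the idx readers and print the
# resulting tensor shapes. Because of the relative import at the top, run it
# as ``python -m torchvision.datasets.mnist``.
if __name__ == '__main__':
    _raw_dir = os.path.join(os.path.expanduser('~/data'), 'MNIST', 'raw')
    _images_path = os.path.join(_raw_dir, 'train-images-idx3-ubyte')
    _labels_path = os.path.join(_raw_dir, 'train-labels-idx1-ubyte')
    if os.path.exists(_images_path) and os.path.exists(_labels_path):
        print(read_image_file(_images_path).shape)  # e.g. torch.Size([60000, 28, 28])
        print(read_label_file(_labels_path).shape)  # e.g. torch.Size([60000])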