# mnist.py -- MNIST and Fashion-MNIST dataset loaders.
from __future__ import print_function
import torch.utils.data as data
from PIL import Image
import os
import os.path
import errno
import torch
import codecs

class MNIST(data.Dataset):
    """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and  ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that  takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.

    Raises:
        RuntimeError: If the processed files are not found under ``root``
            (after the optional download step).
    """
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    training_file = 'training.pt'
    test_file = 'test.pt'

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        # Only the requested split is loaded; the attribute names depend on it.
        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of samples in the loaded split."""
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)

    def _check_exists(self):
        """Return True when both processed files exist under ``root``."""
        processed_dir = os.path.join(self.root, self.processed_folder)
        return os.path.exists(os.path.join(processed_dir, self.training_file)) and \
            os.path.exists(os.path.join(processed_dir, self.test_file))

    @staticmethod
    def _makedirs(path):
        """Create ``path`` (and parents), ignoring the error if it already exists."""
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already.

        Each archive in ``urls`` is fetched into ``raw_folder``, gunzipped next
        to itself, and the ``.gz`` file is removed. The four decoded files are
        then parsed and saved as ``training_file`` / ``test_file`` inside
        ``processed_folder``.
        """
        from six.moves import urllib
        import gzip

        if self._check_exists():
            return

        raw_dir = os.path.join(self.root, self.raw_folder)
        processed_dir = os.path.join(self.root, self.processed_folder)

        # Create each directory independently: an already-existing raw folder
        # must not prevent the processed folder from being created.
        self._makedirs(raw_dir)
        self._makedirs(processed_dir)

        # download files
        for url in self.urls:
            print('Downloading ' + url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_dir, filename)

            response = urllib.request.urlopen(url)
            try:
                with open(file_path, 'wb') as f:
                    f.write(response.read())
            finally:
                # The response object is not a context manager on Python 2.
                response.close()

            # Strip only the trailing '.gz'; a '.gz' elsewhere in the path
            # (e.g. inside ``root``) must be left untouched.
            if file_path.endswith('.gz'):
                unpacked_path = file_path[:-len('.gz')]
            else:
                unpacked_path = file_path
            with open(unpacked_path, 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

class FashionMNIST(MNIST):
    """`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.

    Subclass of ``MNIST`` that overrides only the download ``urls``; every
    other attribute and method is inherited unchanged.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and  ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that  takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """
    urls = [
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    ]


def get_int(b):
    """Return the integer encoded by the byte string ``b``.

    The bytes are rendered as hexadecimal digits and parsed in base 16, so
    the first byte is the most significant one (big-endian).
    """
    hex_digits = codecs.encode(b, 'hex')
    value = int(hex_digits, 16)
    return value

def parse_byte(b):
    """Return ``b`` as an integer byte value.

    A ``str`` argument is converted with ``ord``; anything else is returned
    unchanged (indexing ``bytes`` already yields an int on Python 3).
    """
    return ord(b) if isinstance(b, str) else b

def read_label_file(path):
    """Read a label file and return its labels as a ``torch.LongTensor``.

    The file is expected to start with two big-endian 32-bit integers, the
    magic number ``2049`` and the number of labels, followed by one byte per
    label.

    Args:
        path (string): Path to the uncompressed label file.

    Returns:
        torch.LongTensor: 1-D tensor holding one entry per label.

    Raises:
        ValueError: If the file is too short to contain the 8-byte header.
        AssertionError: If the magic number is not 2049 or the number of
            label bytes differs from the count declared in the header.
    """
    import struct  # local import so this function does not rely on file-level imports

    with open(path, 'rb') as f:
        raw = f.read()

    if len(raw) < 8:
        raise ValueError('label file header is incomplete: ' + path)

    magic, length = struct.unpack('>II', raw[:8])
    assert magic == 2049, 'unexpected magic number %d in label file' % magic

    # bytearray iterates as ints on both Python 2 and Python 3, so no
    # per-byte ord() dispatch is needed.
    labels = list(bytearray(raw[8:]))
    assert len(labels) == length, \
        'label file declares %d labels but contains %d' % (length, len(labels))
    return torch.LongTensor(labels)

def read_image_file(path):
    """Read an image file and return its pixels as a ``torch.ByteTensor``.

    The file is expected to start with four big-endian 32-bit integers, the
    magic number ``2051``, the image count, the row count and the column
    count, followed by one byte per pixel. Bytes beyond the declared pixel
    data are ignored.

    Args:
        path (string): Path to the uncompressed image file.

    Returns:
        torch.ByteTensor: Tensor of shape ``(length, num_rows, num_cols)``
        as declared by the header (``(N, 28, 28)`` for MNIST).

    Raises:
        ValueError: If the file is too short to contain the 16-byte header.
        AssertionError: If the magic number is not 2051.
        IndexError: If the file holds fewer pixel bytes than the header
            declares.
    """
    import struct  # local import so this function does not rely on file-level imports

    with open(path, 'rb') as f:
        raw = f.read()

    if len(raw) < 16:
        raise ValueError('image file header is incomplete: ' + path)

    magic, length, num_rows, num_cols = struct.unpack('>IIII', raw[:16])
    assert magic == 2051, 'unexpected magic number %d in image file' % magic

    num_pixels = length * num_rows * num_cols
    # bytearray iterates as ints on both Python 2 and Python 3; one flat
    # conversion replaces building nested per-pixel Python lists.
    pixels = bytearray(raw[16:16 + num_pixels])
    if len(pixels) != num_pixels:
        raise IndexError('image file is truncated: expected %d pixel bytes, found %d'
                         % (num_pixels, len(pixels)))

    # Use the dimensions from the header instead of assuming 28x28 images.
    return torch.ByteTensor(list(pixels)).view(length, num_rows, num_cols)