mnist.py 6.17 KB
Newer Older
Tian Qi Chen's avatar
Tian Qi Chen committed
1
2
3
4
5
6
7
8
9
from __future__ import print_function
import torch.utils.data as data
from PIL import Image
import os
import os.path
import errno
import torch
import codecs

10

Tian Qi Chen's avatar
Tian Qi Chen committed
11
class MNIST(data.Dataset):
    """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.

    Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.

    Raises:
        RuntimeError: If the processed files are not found under ``root``
            (after the optional download step).
    """
    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    training_file = 'training.pt'
    test_file = 'test.pt'

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        # Only the split selected by ``train`` is loaded into memory.
        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of samples in the split selected by ``train``."""
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)

    def _check_exists(self):
        """Return True when both processed files exist under ``root``."""
        processed_dir = os.path.join(self.root, self.processed_folder)
        return os.path.exists(os.path.join(processed_dir, self.training_file)) and \
            os.path.exists(os.path.join(processed_dir, self.test_file))

    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""
        if self._check_exists():
            return

        # Imported lazily so that the early return above does not need them.
        from six.moves import urllib
        import gzip

        raw_dir = os.path.join(self.root, self.raw_folder)
        processed_dir = os.path.join(self.root, self.processed_folder)

        # Create each directory in its own try block: with a shared block, an
        # already-existing raw folder would skip creating the processed one.
        for folder in (raw_dir, processed_dir):
            try:
                os.makedirs(folder)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        # download files
        for url in self.urls:
            print('Downloading ' + url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(raw_dir, filename)

            response = urllib.request.urlopen(url)
            try:
                with open(file_path, 'wb') as f:
                    f.write(response.read())
            finally:
                response.close()

            # Strip only the trailing '.gz'; a plain str.replace would also
            # rewrite any '.gz' that happens to appear in a directory name.
            if file_path.endswith('.gz'):
                unzipped_path = file_path[:-len('.gz')]
            else:
                unzipped_path = file_path
            with open(unzipped_path, 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')

140

Tian Qi Chen's avatar
Tian Qi Chen committed
141
142
143
def get_int(b):
    """Interpret the byte string ``b`` as a big-endian unsigned integer.

    The bytes are hex-encoded and the resulting digits parsed in base 16.
    """
    hex_digits = codecs.encode(b, 'hex')
    return int(hex_digits, 16)

144

Tian Qi Chen's avatar
Tian Qi Chen committed
145
146
147
148
149
def parse_byte(b):
    """Return ``b`` as an integer.

    A ``str`` argument is converted with ``ord``; any other value is
    returned unchanged.
    """
    return ord(b) if isinstance(b, str) else b

150

Tian Qi Chen's avatar
Tian Qi Chen committed
151
152
153
154
155
156
157
158
159
def read_label_file(path):
    """Read a label file and return its labels as a ``torch.LongTensor``.

    The first four bytes must decode to 2049 and the next four give the
    number of labels; every remaining byte is one label.

    Args:
        path (string): Path of the uncompressed label file.

    Returns:
        torch.LongTensor: One entry per label.

    Raises:
        AssertionError: If the leading number is not 2049 or the number of
            label bytes differs from the count stored in the header.
    """
    with open(path, 'rb') as label_file:
        raw = label_file.read()

    magic = get_int(raw[:4])
    assert magic == 2049

    declared_count = get_int(raw[4:8])
    labels = list(map(parse_byte, raw[8:]))
    assert len(labels) == declared_count

    return torch.LongTensor(labels)

160

Tian Qi Chen's avatar
Tian Qi Chen committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def read_image_file(path):
    """Read an image file and return its pixels as a ``torch.ByteTensor``.

    The file starts with four big-endian 32-bit unsigned integers: the
    number 2051, the image count, the row count and the column count.
    The pixel bytes follow, one byte per pixel; bytes beyond the size
    announced by the header are ignored.

    Args:
        path (string): Path of the uncompressed image file.

    Returns:
        torch.ByteTensor: Tensor of shape ``(count, rows, cols)`` using the
        dimensions stored in the header.

    Raises:
        AssertionError: If the header is incomplete or its leading number
            is not 2051.
        IndexError: If the file holds fewer pixel bytes than the header
            announces.
    """
    import struct

    header_size = 16
    with open(path, 'rb') as f:
        data = f.read()

    header = data[:header_size]
    assert len(header) == header_size, 'incomplete image file header'
    magic, length, num_rows, num_cols = struct.unpack('>IIII', header)
    assert magic == 2051

    expected = length * num_rows * num_cols
    pixels = data[header_size:header_size + expected]
    if len(pixels) != expected:
        raise IndexError('image file ends after %d of %d pixel bytes'
                         % (len(pixels), expected))

    # bytearray yields integers on both Python 2 and 3, so the whole pixel
    # payload is converted in one pass instead of a per-pixel Python loop.
    flat = torch.ByteTensor(list(bytearray(pixels)))
    # Use the header dimensions rather than assuming 28x28 images.
    return flat.view(length, num_rows, num_cols)