#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""tf.data.Dataset interface to the MNIST dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import shutil
import tempfile

import numpy as np
from six.moves import urllib
import tensorflow as tf


def read32(bytestream):
  """Read 4 bytes from bytestream as an unsigned 32-bit big-endian integer.

  Args:
    bytestream: Object exposing `read(n)` that returns a bytes-like object.

  Returns:
    The decoded value as a numpy unsigned 32-bit scalar.

  Raises:
    ValueError: If fewer than 4 bytes could be read (e.g. truncated file).
  """
  raw = bytestream.read(4)
  # Fail with a clear message instead of an opaque IndexError (empty read)
  # or numpy buffer-size error (1-3 bytes) when the stream is truncated.
  if len(raw) != 4:
    raise ValueError(
        'Expected 4 bytes for a 32-bit integer, got %d' % len(raw))
  dt = np.dtype(np.uint32).newbyteorder('>')
  return np.frombuffer(raw, dtype=dt)[0]


def check_image_file_header(filename):
  """Validate that filename corresponds to images for the MNIST dataset.

  Reads four consecutive 32-bit header fields with `read32` (magic number,
  image count, rows, cols) and checks the magic number and image dimensions.

  Args:
    filename: Path of the image file to open with `tf.gfile.Open`.

  Raises:
    ValueError: If the magic number is not 2051 or the images are not 28x28.
  """
  with tf.gfile.Open(filename, 'rb') as f:
    magic = read32(f)
    # Check the magic number first so a file of the wrong type is reported
    # as such, rather than failing later while reading the other fields.
    if magic != 2051:
      raise ValueError('Invalid magic number %d in MNIST file %s' % (magic,
                                                                     f.name))
    read32(f)  # num_images, unused
    rows = read32(f)
    cols = read32(f)
    if rows != 28 or cols != 28:
      raise ValueError(
          'Invalid MNIST file %s: Expected 28x28 images, found %dx%d' %
          (f.name, rows, cols))


def check_labels_file_header(filename):
  """Validate that filename corresponds to labels for the MNIST dataset.

  Reads two consecutive 32-bit header fields with `read32` (magic number and
  item count) and checks the magic number.

  Args:
    filename: Path of the labels file to open with `tf.gfile.Open`.

  Raises:
    ValueError: If the magic number is not 2049.
  """
  with tf.gfile.Open(filename, 'rb') as f:
    magic = read32(f)
    # Check the magic number as soon as it is read so a file of the wrong
    # type is reported as such.
    if magic != 2049:
      raise ValueError('Invalid magic number %d in MNIST file %s' % (magic,
                                                                     f.name))
    read32(f)  # num_items, unused


def download(directory, filename):
  """Download (and unzip) a file from the MNIST dataset if not already done.

  Args:
    directory: Directory in which the unzipped file is stored; created if it
      does not exist.
    filename: Name of the MNIST file without the `.gz` suffix.

  Returns:
    Path of the unzipped file, i.e. `os.path.join(directory, filename)`.
  """
  filepath = os.path.join(directory, filename)
  if tf.gfile.Exists(filepath):
    return filepath
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)
  # CVDF mirror of http://yann.lecun.com/exdb/mnist/
  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
  # Stage the archive in a local temp file: urlretrieve and gzip need a local
  # path, while `directory` is accessed through tf.gfile throughout.
  fd, zipped_filepath = tempfile.mkstemp(suffix='.gz')
  os.close(fd)  # mkstemp returns an open descriptor; only the path is needed.
  try:
    print('Downloading %s to %s' % (url, zipped_filepath))
    urllib.request.urlretrieve(url, zipped_filepath)
    with gzip.open(zipped_filepath, 'rb') as f_in, \
        tf.gfile.Open(filepath, 'wb') as f_out:
      shutil.copyfileobj(f_in, f_out)
  except BaseException:
    # Do not leave a truncated file behind: the existence check above would
    # treat it as a finished download on the next call.
    if tf.gfile.Exists(filepath):
      tf.gfile.Remove(filepath)
    raise
  finally:
    os.remove(zipped_filepath)
  return filepath


def dataset(directory, images_file, labels_file):
  """Download and parse MNIST dataset.

  Args:
    directory: Directory passed to `download` for both files.
    images_file: Name of the MNIST images file (without `.gz`).
    labels_file: Name of the MNIST labels file (without `.gz`).

  Returns:
    A `tf.data.Dataset` of `(image, label)` pairs, where `image` is a float32
    tensor of shape [784] scaled to [0.0, 1.0] and `label` is an int32 scalar.

  Raises:
    ValueError: If either file fails its header check.
  """

  images_file = download(directory, images_file)
  labels_file = download(directory, labels_file)

  check_image_file_header(images_file)
  check_labels_file_header(labels_file)

  def decode_image(image):
    # Normalize from [0, 255] to [0.0, 1.0]
    image = tf.decode_raw(image, tf.uint8)
    image = tf.cast(image, tf.float32)
    image = tf.reshape(image, [784])
    return image / 255.0

  def decode_label(label):
    label = tf.decode_raw(label, tf.uint8)  # tf.string -> [tf.uint8]
    label = tf.reshape(label, [])  # label is a scalar
    return tf.to_int32(label)

  # Each image record is 28 * 28 bytes following a 16-byte header; each label
  # record is a single byte following an 8-byte header.
  images = tf.data.FixedLengthRecordDataset(
      images_file, 28 * 28, header_bytes=16).map(decode_image)
  labels = tf.data.FixedLengthRecordDataset(
      labels_file, 1, header_bytes=8).map(decode_label)
  return tf.data.Dataset.zip((images, labels))


def train(directory):
  """Return the MNIST training split as a tf.data.Dataset.

  Args:
    directory: Directory forwarded to `dataset` for the data files.
  """
  images_name = 'train-images-idx3-ubyte'
  labels_name = 'train-labels-idx1-ubyte'
  return dataset(directory, images_name, labels_name)


def test(directory):
  """Return the MNIST test split as a tf.data.Dataset.

  Args:
    directory: Directory forwarded to `dataset` for the data files.
  """
  images_name = 't10k-images-idx3-ubyte'
  labels_name = 't10k-labels-idx1-ubyte'
  return dataset(directory, images_name, labels_name)