#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""tf.data.Dataset interface to the MNIST dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import shutil
import tempfile

import numpy as np
from six.moves import urllib
import tensorflow as tf


def read32(bytestream):
  """Read 4 bytes from bytestream as an unsigned 32-bit integer.

  Args:
    bytestream: Binary file-like object whose `read(n)` returns bytes.

  Returns:
    The next four bytes of the stream interpreted as a big-endian
    numpy uint32 scalar.

  Raises:
    ValueError: If fewer than four bytes could be read from the stream.
  """
  raw = bytestream.read(4)
  # A short read means the file is truncated; fail with a clear message
  # instead of an IndexError / buffer-size error from numpy.
  if len(raw) != 4:
    raise ValueError(
        'Expected 4 bytes for a 32-bit integer, got %d' % len(raw))
  dt = np.dtype(np.uint32).newbyteorder('>')
  return np.frombuffer(raw, dtype=dt)[0]


def check_image_file_header(filename):
  """Validate that filename corresponds to images for the MNIST dataset.

  Args:
    filename: Path of the file to inspect; opened in binary mode through
      tf.gfile.

  Raises:
    ValueError: If the magic number is not 2051, or if the declared image
      size is not 28x28.
  """
  with tf.gfile.Open(filename, 'rb') as f:
    # Header fields, in order: magic number, image count (unused), rows,
    # columns. All four are consumed before any validation happens.
    magic, unused_num_images, rows, cols = [read32(f) for _ in range(4)]
    if magic != 2051:
      message = 'Invalid magic number %d in MNIST file %s' % (magic, f.name)
      raise ValueError(message)
    if not (rows == 28 and cols == 28):
      message = ('Invalid MNIST file %s: Expected 28x28 images, found %dx%d' %
                 (f.name, rows, cols))
      raise ValueError(message)


def check_labels_file_header(filename):
  """Validate that filename corresponds to labels for the MNIST dataset.

  Args:
    filename: Path of the file to inspect; opened in binary mode through
      tf.gfile.

  Raises:
    ValueError: If the magic number is not 2049.
  """
  with tf.gfile.Open(filename, 'rb') as f:
    # Header fields, in order: magic number, item count (unused).
    magic, unused_num_items = read32(f), read32(f)
    if magic != 2049:
      message = 'Invalid magic number %d in MNIST file %s' % (magic, f.name)
      raise ValueError(message)


def download(directory, filename):
  """Download (and unzip) a file from the MNIST dataset if not already done.

  Args:
    directory: Directory in which the decompressed file is stored. It is
      created if it does not exist.
    filename: Name of the MNIST file without the '.gz' suffix.

  Returns:
    Path of the decompressed file inside `directory`.
  """
  filepath = os.path.join(directory, filename)
  if tf.gfile.Exists(filepath):
    return filepath
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)
  # CVDF mirror of http://yann.lecun.com/exdb/mnist/
  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
  fd, zipped_filepath = tempfile.mkstemp(suffix='.gz')
  # mkstemp hands back an already-open descriptor; only the path is needed,
  # so close it right away instead of leaking it.
  os.close(fd)
  try:
    print('Downloading %s to %s' % (url, zipped_filepath))
    urllib.request.urlretrieve(url, zipped_filepath)
    with gzip.open(zipped_filepath, 'rb') as f_in, \
        tf.gfile.Open(filepath, 'wb') as f_out:
      shutil.copyfileobj(f_in, f_out)
  except Exception:
    # A partially written output would be mistaken for a finished download
    # by the Exists() check above on the next call, so discard it.
    if tf.gfile.Exists(filepath):
      tf.gfile.Remove(filepath)
    raise
  finally:
    # Always drop the temporary archive, whether or not the download worked.
    os.remove(zipped_filepath)
  return filepath


def dataset(directory, images_file, labels_file):
  """Download and parse MNIST dataset.

  Args:
    directory: Directory where the MNIST files are stored or downloaded to.
    images_file: Name of the images file inside `directory`.
    labels_file: Name of the labels file inside `directory`.

  Returns:
    A tf.data.Dataset of (image, label) pairs, where image is a float32
    tensor of shape [784] scaled by 1/255 and label is a scalar int32.
  """
  images_path = download(directory, images_file)
  labels_path = download(directory, labels_file)

  check_image_file_header(images_path)
  check_labels_file_header(labels_path)

  def decode_image(image):
    # Normalize from [0, 255] to [0.0, 1.0]
    pixels = tf.cast(tf.decode_raw(image, tf.uint8), tf.float32)
    return tf.reshape(pixels, [784]) / 255.0

  def decode_label(label):
    raw = tf.decode_raw(label, tf.uint8)  # tf.string -> [tf.uint8]
    return tf.to_int32(tf.reshape(raw, []))  # label is a scalar

  # Images: 28 * 28 byte records after a 16-byte header.
  image_records = tf.data.FixedLengthRecordDataset(
      images_path, 28 * 28, header_bytes=16)
  images = image_records.map(decode_image)
  # Labels: 1-byte records after an 8-byte header.
  label_records = tf.data.FixedLengthRecordDataset(
      labels_path, 1, header_bytes=8)
  labels = label_records.map(decode_label)
  return tf.data.Dataset.zip((images, labels))


def train(directory):
  """tf.data.Dataset object for MNIST training data.

  Args:
    directory: Directory where the MNIST files are stored or downloaded to.

  Returns:
    The result of `dataset` for the training images and labels files.
  """
  images_name = 'train-images-idx3-ubyte'
  labels_name = 'train-labels-idx1-ubyte'
  return dataset(directory, images_name, labels_name)


def test(directory):
  """tf.data.Dataset object for MNIST test data.

  Args:
    directory: Directory where the MNIST files are stored or downloaded to.

  Returns:
    The result of `dataset` for the t10k images and labels files.
  """
  images_name = 't10k-images-idx3-ubyte'
  labels_name = 't10k-labels-idx1-ubyte'
  return dataset(directory, images_name, labels_name)