# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides data for the ImageNet ILSVRC 2012 Dataset plus some bounding boxes. Some images have one or more bounding boxes associated with the label of the image. See details here: http://image-net.org/download-bboxes ImageNet is based upon WordNet 3.0. To uniquely identify a synset, we use "WordNet ID" (wnid), which is a concatenation of POS ( i.e. part of speech ) and SYNSET OFFSET of WordNet. For more information, please refer to the WordNet documentation[http://wordnet.princeton.edu/wordnet/documentation/]. "There are bounding boxes for over 3000 popular synsets available. For each synset, there are on average 150 images with bounding boxes." WARNING: Don't use for object detection, in this case all the bounding boxes of the image belong to just one class. To read about optimizations that can be applied to the input preprocessing stage, see: https://www.tensorflow.org/performance/performance_guide#input_pipeline_optimization. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from six.moves import urllib import tensorflow as tf import imagenet_dataset_utils slim = tf.contrib.slim # TODO(nsilberman): Add tfrecord file type once the script is updated. _FILE_PATTERN = '%s-*' _SPLITS_TO_SIZES = { 'train': 1281167, 'validation': 50000, } _ITEMS_TO_DESCRIPTIONS = { 'image': 'A color image of varying height and width.', 'label': 'The label id of the image, integer between 0 and 999', 'label_text': 'The text of the label.', 'object/bbox': 'A list of bounding boxes.', 'object/label': 'A list of labels, one per each object.', } _NUM_CLASSES = 1001 def create_readable_names_for_imagenet_labels(): """Create a dict mapping label id to human readable string. Returns: labels_to_names: dictionary where keys are integers from to 1000 and values are human-readable names. We retrieve a synset file, which contains a list of valid synset labels used by ILSVRC competition. There is one synset one per line, eg. # n01440764 # n01443537 We also retrieve a synset_to_human_file, which contains a mapping from synsets to human-readable names for every synset in Imagenet. These are stored in a tsv format, as follows: # n02119247 black fox # n02119359 silver fox We assign each synset (in alphabetical order) an integer, starting from 1 (since 0 is reserved for the background class). Code is based on https://github.com/tensorflow/models/blob/master/research/slim/datasets/build_imagenet_data.py """ # pylint: disable=g-line-too-long base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/slim/datasets/' # pylint: enable=g-line-too-long synset_url = '{}/imagenet_lsvrc_2015_synsets.txt'.format(base_url) synset_to_human_url = '{}/imagenet_metadata.txt'.format(base_url) filename, _ = urllib.request.urlretrieve(synset_url) synset_list = [s.strip() for s in open(filename).readlines()] num_synsets_in_ilsvrc = len(synset_list) assert num_synsets_in_ilsvrc == 1000 filename, _ = urllib.request.urlretrieve(synset_to_human_url) synset_to_human_list = open(filename).readlines() num_synsets_in_all_imagenet = len(synset_to_human_list) assert num_synsets_in_all_imagenet == 21842 synset_to_human = {} for s in synset_to_human_list: parts = s.strip().split('\t') assert len(parts) == 2 synset = parts[0] human = parts[1] synset_to_human[synset] = human label_index = 1 labels_to_names = {0: 'background'} for synset in synset_list: name = synset_to_human[synset] labels_to_names[label_index] = name label_index += 1 return labels_to_names def get_split(split_name, dataset_dir, file_pattern=None, reader=None): """Gets a dataset tuple with instructions for reading ImageNet. Args: split_name: A train/test split name. dataset_dir: The base directory of the dataset sources. file_pattern: The file pattern to use when matching the dataset sources. It is assumed that the pattern contains a '%s' string so that the split name can be inserted. reader: The TensorFlow reader type. Returns: A `Dataset` namedtuple. Raises: ValueError: if `split_name` is not a valid train/test split. """ if split_name not in _SPLITS_TO_SIZES: raise ValueError('split name %s was not recognized.' % split_name) if not file_pattern: file_pattern = _FILE_PATTERN file_pattern = os.path.join(dataset_dir, file_pattern % split_name) # Allowing None in the signature so that dataset_factory can use the default. if reader is None: reader = tf.TFRecordReader keys_to_features = { 'image/encoded': tf.FixedLenFeature( (), tf.string, default_value=''), 'image/format': tf.FixedLenFeature( (), tf.string, default_value='jpeg'), 'image/class/label': tf.FixedLenFeature( [], dtype=tf.int64, default_value=-1), 'image/class/text': tf.FixedLenFeature( [], dtype=tf.string, default_value=''), 'image/object/bbox/xmin': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/ymin': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/xmax': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/ymax': tf.VarLenFeature( dtype=tf.float32), 'image/object/class/label': tf.VarLenFeature( dtype=tf.int64), } items_to_handlers = { 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), 'label': slim.tfexample_decoder.Tensor('image/class/label'), 'label_text': slim.tfexample_decoder.Tensor('image/class/text'), 'object/bbox': slim.tfexample_decoder.BoundingBox( ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), 'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'), } decoder = slim.tfexample_decoder.TFExampleDecoder( keys_to_features, items_to_handlers) labels_to_names = None if imagenet_dataset_utils.has_labels(dataset_dir): labels_to_names = imagenet_dataset_utils.read_label_file(dataset_dir) else: labels_to_names = create_readable_names_for_imagenet_labels() imagenet_dataset_utils.write_label_file(labels_to_names, dataset_dir) return slim.dataset.Dataset( data_sources=file_pattern, reader=reader, decoder=decoder, num_samples=_SPLITS_TO_SIZES[split_name], items_to_descriptions=_ITEMS_TO_DESCRIPTIONS, num_classes=_NUM_CLASSES, labels_to_names=labels_to_names)