"git@developer.sourcefind.cn:change/sglang.git" did not exist on "18317ddc13bc403749fe9f99ef5726796f855b0e"
Unverified commit 67c403fc authored by pkulzc, committed by GitHub

Add VisualWakeWords Dataset to Slim dataset_factory (#6661)

* Merged commit includes the following changes:
244869387  by Sergio Guadarrama:

    This CL adds a script to generate Visual WakeWords dataset annotation files and TFRecords from the COCO dataset.

--
244866660  by Sergio Guadarrama:

    Add VisualWakeWords Dataset to Slim dataset_factory to train MobileNets on it.

--
244836000  by Sergio Guadarrama:

    n/a

--
244104396  by Sergio Guadarrama:

    Add an option to enable/disable image cropping in inception_preprocessing.

--
242040128  by Sergio Guadarrama:

    Internal change

241793677  by Sergio Guadarrama:

    Internal change

241073081  by Sergio Guadarrama:

    Internal change

240131189  by Sergio Guadarrama:

    Internal change

PiperOrigin-RevId: 244869387

* Merged commit includes the following changes:
245431876  by Sergio Guadarrama:

    Internal cleanup

--

PiperOrigin-RevId: 245431876

* Merged commit includes the following changes:
245454983  by Sergio Guadarrama:

    Internal Cleanup

--

PiperOrigin-RevId: 245454983
parent 4a1fba0b
...@@ -13,6 +13,7 @@ py_library( ...@@ -13,6 +13,7 @@ py_library(
name = "dataset_utils", name = "dataset_utils",
srcs = ["datasets/dataset_utils.py"], srcs = ["datasets/dataset_utils.py"],
deps = [ deps = [
"//third_party/py/six",
# "//tensorflow", # "//tensorflow",
], ],
) )
...@@ -34,6 +35,7 @@ sh_binary( ...@@ -34,6 +35,7 @@ sh_binary(
py_binary( py_binary(
name = "build_imagenet_data", name = "build_imagenet_data",
srcs = ["datasets/build_imagenet_data.py"], srcs = ["datasets/build_imagenet_data.py"],
python_version = "PY2",
deps = [ deps = [
# "//numpy", # "//numpy",
# "//tensorflow", # "//tensorflow",
...@@ -72,6 +74,7 @@ py_library( ...@@ -72,6 +74,7 @@ py_library(
py_binary( py_binary(
name = "download_and_convert_data", name = "download_and_convert_data",
srcs = ["download_and_convert_data.py"], srcs = ["download_and_convert_data.py"],
python_version = "PY2",
deps = [ deps = [
":download_and_convert_cifar10", ":download_and_convert_cifar10",
":download_and_convert_flowers", ":download_and_convert_flowers",
...@@ -80,6 +83,31 @@ py_binary( ...@@ -80,6 +83,31 @@ py_binary(
], ],
) )
sh_binary(
name = "download_mscoco",
srcs = ["datasets/download_mscoco.sh"],
)
py_binary(
name = "build_visualwakewords_data",
srcs = ["datasets/build_visualwakewords_data.py"],
deps = [
":build_visualwakewords_data_lib",
# "//tensorflow",
],
)
py_library(
name = "build_visualwakewords_data_lib",
srcs = ["datasets/build_visualwakewords_data_lib.py"],
deps = [
":dataset_utils",
"//third_party/py/PIL:pil",
"//third_party/py/contextlib2",
# "//tensorflow",
],
)
py_library( py_library(
name = "cifar10", name = "cifar10",
srcs = ["datasets/cifar10.py"], srcs = ["datasets/cifar10.py"],
...@@ -116,6 +144,15 @@ py_library( ...@@ -116,6 +144,15 @@ py_library(
], ],
) )
py_library(
name = "visualwakewords",
srcs = ["datasets/visualwakewords.py"],
deps = [
":dataset_utils",
# "//tensorflow",
],
)
py_library( py_library(
name = "dataset_factory", name = "dataset_factory",
srcs = ["datasets/dataset_factory.py"], srcs = ["datasets/dataset_factory.py"],
...@@ -124,6 +161,7 @@ py_library( ...@@ -124,6 +161,7 @@ py_library(
":flowers", ":flowers",
":imagenet", ":imagenet",
":mnist", ":mnist",
":visualwakewords",
], ],
) )
...@@ -138,6 +176,7 @@ py_library( ...@@ -138,6 +176,7 @@ py_library(
py_test( py_test(
name = "model_deploy_test", name = "model_deploy_test",
srcs = ["deployment/model_deploy_test.py"], srcs = ["deployment/model_deploy_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":model_deploy", ":model_deploy",
...@@ -227,6 +266,7 @@ py_test( ...@@ -227,6 +266,7 @@ py_test(
name = "alexnet_test", name = "alexnet_test",
size = "medium", size = "medium",
srcs = ["nets/alexnet_test.py"], srcs = ["nets/alexnet_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":alexnet", ":alexnet",
...@@ -254,6 +294,7 @@ py_library( ...@@ -254,6 +294,7 @@ py_library(
py_test( py_test(
name = "cyclegan_test", name = "cyclegan_test",
srcs = ["nets/cyclegan_test.py"], srcs = ["nets/cyclegan_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -273,6 +314,7 @@ py_library( ...@@ -273,6 +314,7 @@ py_library(
py_test( py_test(
name = "dcgan_test", name = "dcgan_test",
srcs = ["nets/dcgan_test.py"], srcs = ["nets/dcgan_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -296,6 +338,7 @@ py_test( ...@@ -296,6 +338,7 @@ py_test(
name = "i3d_test", name = "i3d_test",
size = "large", size = "large",
srcs = ["nets/i3d_test.py"], srcs = ["nets/i3d_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -388,6 +431,7 @@ py_test( ...@@ -388,6 +431,7 @@ py_test(
name = "inception_v1_test", name = "inception_v1_test",
size = "large", size = "large",
srcs = ["nets/inception_v1_test.py"], srcs = ["nets/inception_v1_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -401,6 +445,7 @@ py_test( ...@@ -401,6 +445,7 @@ py_test(
name = "inception_v2_test", name = "inception_v2_test",
size = "large", size = "large",
srcs = ["nets/inception_v2_test.py"], srcs = ["nets/inception_v2_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -414,6 +459,7 @@ py_test( ...@@ -414,6 +459,7 @@ py_test(
name = "inception_v3_test", name = "inception_v3_test",
size = "large", size = "large",
srcs = ["nets/inception_v3_test.py"], srcs = ["nets/inception_v3_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -427,6 +473,7 @@ py_test( ...@@ -427,6 +473,7 @@ py_test(
name = "inception_v4_test", name = "inception_v4_test",
size = "large", size = "large",
srcs = ["nets/inception_v4_test.py"], srcs = ["nets/inception_v4_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -439,6 +486,7 @@ py_test( ...@@ -439,6 +486,7 @@ py_test(
name = "inception_resnet_v2_test", name = "inception_resnet_v2_test",
size = "large", size = "large",
srcs = ["nets/inception_resnet_v2_test.py"], srcs = ["nets/inception_resnet_v2_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -476,6 +524,7 @@ py_library( ...@@ -476,6 +524,7 @@ py_library(
py_test( py_test(
name = "mobilenet_v2_test", name = "mobilenet_v2_test",
srcs = ["nets/mobilenet/mobilenet_v2_test.py"], srcs = ["nets/mobilenet/mobilenet_v2_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":mobilenet", ":mobilenet",
...@@ -495,6 +544,7 @@ py_test( ...@@ -495,6 +544,7 @@ py_test(
name = "mobilenet_v1_test", name = "mobilenet_v1_test",
size = "large", size = "large",
srcs = ["nets/mobilenet_v1_test.py"], srcs = ["nets/mobilenet_v1_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -507,6 +557,7 @@ py_test( ...@@ -507,6 +557,7 @@ py_test(
py_binary( py_binary(
name = "mobilenet_v1_train", name = "mobilenet_v1_train",
srcs = ["nets/mobilenet_v1_train.py"], srcs = ["nets/mobilenet_v1_train.py"],
python_version = "PY2",
deps = [ deps = [
":dataset_factory", ":dataset_factory",
":mobilenet_v1", ":mobilenet_v1",
...@@ -518,6 +569,7 @@ py_binary( ...@@ -518,6 +569,7 @@ py_binary(
py_binary( py_binary(
name = "mobilenet_v1_eval", name = "mobilenet_v1_eval",
srcs = ["nets/mobilenet_v1_eval.py"], srcs = ["nets/mobilenet_v1_eval.py"],
python_version = "PY2",
deps = [ deps = [
":dataset_factory", ":dataset_factory",
":mobilenet_v1", ":mobilenet_v1",
...@@ -549,6 +601,7 @@ py_test( ...@@ -549,6 +601,7 @@ py_test(
name = "nasnet_utils_test", name = "nasnet_utils_test",
size = "medium", size = "medium",
srcs = ["nets/nasnet/nasnet_utils_test.py"], srcs = ["nets/nasnet/nasnet_utils_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":nasnet_utils", ":nasnet_utils",
...@@ -560,6 +613,7 @@ py_test( ...@@ -560,6 +613,7 @@ py_test(
name = "nasnet_test", name = "nasnet_test",
size = "large", size = "large",
srcs = ["nets/nasnet/nasnet_test.py"], srcs = ["nets/nasnet/nasnet_test.py"],
python_version = "PY2",
shard_count = 10, shard_count = 10,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -583,6 +637,7 @@ py_test( ...@@ -583,6 +637,7 @@ py_test(
name = "pnasnet_test", name = "pnasnet_test",
size = "large", size = "large",
srcs = ["nets/nasnet/pnasnet_test.py"], srcs = ["nets/nasnet/pnasnet_test.py"],
python_version = "PY2",
shard_count = 4, shard_count = 4,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -604,6 +659,7 @@ py_test( ...@@ -604,6 +659,7 @@ py_test(
name = "overfeat_test", name = "overfeat_test",
size = "medium", size = "medium",
srcs = ["nets/overfeat_test.py"], srcs = ["nets/overfeat_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":overfeat", ":overfeat",
...@@ -623,6 +679,7 @@ py_library( ...@@ -623,6 +679,7 @@ py_library(
py_test( py_test(
name = "pix2pix_test", name = "pix2pix_test",
srcs = ["nets/pix2pix_test.py"], srcs = ["nets/pix2pix_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":pix2pix", ":pix2pix",
...@@ -653,6 +710,7 @@ py_test( ...@@ -653,6 +710,7 @@ py_test(
name = "resnet_v1_test", name = "resnet_v1_test",
size = "medium", size = "medium",
srcs = ["nets/resnet_v1_test.py"], srcs = ["nets/resnet_v1_test.py"],
python_version = "PY2",
shard_count = 2, shard_count = 2,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -677,6 +735,7 @@ py_test( ...@@ -677,6 +735,7 @@ py_test(
name = "resnet_v2_test", name = "resnet_v2_test",
size = "medium", size = "medium",
srcs = ["nets/resnet_v2_test.py"], srcs = ["nets/resnet_v2_test.py"],
python_version = "PY2",
shard_count = 2, shard_count = 2,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -701,6 +760,7 @@ py_test( ...@@ -701,6 +760,7 @@ py_test(
name = "s3dg_test", name = "s3dg_test",
size = "large", size = "large",
srcs = ["nets/s3dg_test.py"], srcs = ["nets/s3dg_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -722,6 +782,7 @@ py_test( ...@@ -722,6 +782,7 @@ py_test(
name = "vgg_test", name = "vgg_test",
size = "medium", size = "medium",
srcs = ["nets/vgg_test.py"], srcs = ["nets/vgg_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
":vgg", ":vgg",
...@@ -742,6 +803,7 @@ py_test( ...@@ -742,6 +803,7 @@ py_test(
name = "nets_factory_test", name = "nets_factory_test",
size = "large", size = "large",
srcs = ["nets/nets_factory_test.py"], srcs = ["nets/nets_factory_test.py"],
python_version = "PY2",
shard_count = 3, shard_count = 3,
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
deps = [ deps = [
...@@ -767,6 +829,7 @@ py_binary( ...@@ -767,6 +829,7 @@ py_binary(
srcs = ["train_image_classifier.py"], srcs = ["train_image_classifier.py"],
# WARNING: not supported in bazel; will be commented out by copybara. # WARNING: not supported in bazel; will be commented out by copybara.
# paropts = ["--compress"], # paropts = ["--compress"],
python_version = "PY2",
deps = [ deps = [
":train_image_classifier_lib", ":train_image_classifier_lib",
], ],
...@@ -786,6 +849,7 @@ py_library( ...@@ -786,6 +849,7 @@ py_library(
py_binary( py_binary(
name = "eval_image_classifier", name = "eval_image_classifier",
srcs = ["eval_image_classifier.py"], srcs = ["eval_image_classifier.py"],
python_version = "PY2",
deps = [ deps = [
":eval_image_classifier_lib", ":eval_image_classifier_lib",
], ],
...@@ -796,6 +860,7 @@ py_binary( ...@@ -796,6 +860,7 @@ py_binary(
srcs = ["export_inference_graph.py"], srcs = ["export_inference_graph.py"],
# WARNING: not supported in bazel; will be commented out by copybara. # WARNING: not supported in bazel; will be commented out by copybara.
# paropts = ["--compress"], # paropts = ["--compress"],
python_version = "PY2",
deps = [":export_inference_graph_lib"], deps = [":export_inference_graph_lib"],
) )
...@@ -814,6 +879,7 @@ py_test( ...@@ -814,6 +879,7 @@ py_test(
name = "export_inference_graph_test", name = "export_inference_graph_test",
size = "medium", size = "medium",
srcs = ["export_inference_graph_test.py"], srcs = ["export_inference_graph_test.py"],
python_version = "PY2",
srcs_version = "PY2AND3", srcs_version = "PY2AND3",
tags = [ tags = [
"manual", "manual",
......
...@@ -96,6 +96,7 @@ Flowers|2500 | 2500 | 5 | Various sizes (source: Flickr)
[Cifar10](https://www.cs.toronto.edu/~kriz/cifar.html) | 60k| 10k | 10 |32x32 color
[MNIST](http://yann.lecun.com/exdb/mnist/)| 60k | 10k | 10 | 28x28 gray
[ImageNet](http://www.image-net.org/challenges/LSVRC/2012/)|1.2M| 50k | 1000 | Various sizes
VisualWakeWords|82783 | 40504 | 2 | Various sizes (source: MS COCO)

## Downloading and converting to TFRecord format
...@@ -135,6 +136,9 @@ However, for ImageNet, you have to follow the instructions
[here](https://github.com/tensorflow/models/blob/master/research/inception/README.md#getting-started).
Note that you first have to sign up for an account at image-net.org.
Also, the download can take several hours, and could use up to 500GB.
For the Visual WakeWords dataset, download the MSCOCO dataset with the script
[here](https://github.com/tensorflow/models/blob/master/research/slim/datasets/download_mscoco.sh)
and build the TFRecords following the instructions
[here](https://github.com/tensorflow/models/blob/master/research/slim/datasets/build_visualwakewords_data.py).

## Creating a TF-Slim Dataset Descriptor.
...@@ -148,6 +152,7 @@ for
[Cifar10](https://github.com/tensorflow/models/blob/master/research/slim/datasets/cifar10.py),
[ImageNet](https://github.com/tensorflow/models/blob/master/research/slim/datasets/imagenet.py),
[Flowers](https://github.com/tensorflow/models/blob/master/research/slim/datasets/flowers.py),
[VisualWakeWords](https://github.com/tensorflow/models/blob/master/research/slim/datasets/visualwakewords.py),
and
[MNIST](https://github.com/tensorflow/models/blob/master/research/slim/datasets/mnist.py).
An example of how to load data using a TF-Slim dataset descriptor using a
......
...@@ -314,7 +314,7 @@ def _process_image(filename, coder):
    width: integer, image width in pixels.
  """
  # Read the image file.
-  image_data = tf.gfile.FastGFile(filename, 'r').read()
+  image_data = tf.gfile.GFile(filename, 'r').read()
  # Clean the dirty data.
  if _is_png(filename):
...@@ -497,8 +497,9 @@ def _find_image_files(data_dir, labels_file):
    labels: list of integer; each integer identifies the ground truth.
  """
  print('Determining list of input files and labels from %s.' % data_dir)
-  challenge_synsets = [l.strip() for l in
-                       tf.gfile.FastGFile(labels_file, 'r').readlines()]
+  challenge_synsets = [
+      l.strip() for l in tf.gfile.GFile(labels_file, 'r').readlines()
+  ]
  labels = []
  filenames = []
...@@ -621,7 +622,7 @@ def _build_synset_lookup(imagenet_metadata_file):
    Dictionary of synset to human labels, such as:
      'n02119022' --> 'red fox, Vulpes vulpes'
  """
-  lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines()
+  lines = tf.gfile.GFile(imagenet_metadata_file, 'r').readlines()
  synset_to_human = {}
  for l in lines:
    if l:
...@@ -655,7 +656,7 @@ def _build_bounding_box_lookup(bounding_box_file):
    Dictionary mapping image file names to a list of bounding boxes. This list
    contains 0+ bounding boxes.
  """
-  lines = tf.gfile.FastGFile(bounding_box_file, 'r').readlines()
+  lines = tf.gfile.GFile(bounding_box_file, 'r').readlines()
  images_to_bboxes = {}
  num_bbox = 0
  num_image = 0
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Build Visual WakeWords Dataset with images and labels for person/not-person.
This script generates the Visual WakeWords dataset annotations from
the raw COCO dataset and converts them to TFRecord.
Visual WakeWords Dataset derives from the COCO dataset to design tiny models
classifying two classes, such as person/not-person. The COCO annotations
are filtered to two classes: foreground_class_of_interest and background
(e.g., person and not-person). Bounding boxes for small objects
with area less than 5% of the image area are filtered out.
The resulting annotations file has the following fields, where
the image and categories fields are the same as in the COCO dataset, while the annotation
field corresponds to the foreground_class_of_interest/background class and
bounding boxes for the foreground_class_of_interest class.
images{"id", "width", "height", "file_name", "license", "flickr_url",
"coco_url", "date_captured",}
annotations{
"image_id", object[{"category_id", "area", "bbox" : [x,y,width,height],}]
"count",
"label"
}
categories[{
"id", "name", "supercategory",
}]
The TFRecord file contains the following features:
{ image/height, image/width, image/source_id, image/encoded,
image/class/label_text, image/class/label,
image/object/class/text,
image/object/bbox/ymin, image/object/bbox/xmin, image/object/bbox/ymax,
image/object/bbox/xmax, image/object/area
image/filename, image/format, image/key/sha256}
For classification models, you need the image/encoded and image/class/label.
Please note that this tool creates sharded output files.
Example usage:
Add folder tensorflow/models/research/slim to your PYTHONPATH,
and from this folder, run the following commands:
bash download_mscoco.sh path-to-mscoco-dataset
TRAIN_IMAGE_DIR="path-to-mscoco-dataset/train2014"
VAL_IMAGE_DIR="path-to-mscoco-dataset/val2014"
TRAIN_ANNOTATIONS_FILE="path-to-mscoco-dataset/annotations/instances_train2014.json"
VAL_ANNOTATIONS_FILE="path-to-mscoco-dataset/annotations/instances_val2014.json"
python datasets/build_visualwakewords_data.py --logtostderr \
--train_image_dir="${TRAIN_IMAGE_DIR}" \
--val_image_dir="${VAL_IMAGE_DIR}" \
--train_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--val_annotations_file="${VAL_ANNOTATIONS_FILE}" \
--output_dir="${OUTPUT_DIR}" \
--small_object_area_threshold=0.005 \
--foreground_class_of_interest='person'
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
from datasets import build_visualwakewords_data_lib
flags = tf.app.flags
tf.flags.DEFINE_string('train_image_dir', '', 'Training image directory.')
tf.flags.DEFINE_string('val_image_dir', '', 'Validation image directory.')
tf.flags.DEFINE_string('train_annotations_file', '',
'Training annotations JSON file.')
tf.flags.DEFINE_string('val_annotations_file', '',
'Validation annotations JSON file.')
tf.flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.')
tf.flags.DEFINE_float(
'small_object_area_threshold', 0.005,
'Threshold of fraction of image area below which small'
'objects are filtered')
tf.flags.DEFINE_string(
'foreground_class_of_interest', 'person',
'Build a binary classifier based on the presence or absence'
'of this object in the scene (default is person/not-person)')
FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO)
def main(unused_argv):
# Path to COCO dataset images and annotations
assert FLAGS.train_image_dir, '`train_image_dir` missing.'
assert FLAGS.val_image_dir, '`val_image_dir` missing.'
assert FLAGS.train_annotations_file, '`train_annotations_file` missing.'
assert FLAGS.val_annotations_file, '`val_annotations_file` missing.'
visualwakewords_annotations_train = os.path.join(
FLAGS.output_dir, 'instances_visualwakewords_train2014.json')
visualwakewords_annotations_val = os.path.join(
FLAGS.output_dir, 'instances_visualwakewords_val2014.json')
visualwakewords_labels_filename = os.path.join(FLAGS.output_dir,
'labels.txt')
small_object_area_threshold = FLAGS.small_object_area_threshold
foreground_class_of_interest = FLAGS.foreground_class_of_interest
# Create the Visual WakeWords annotations from COCO annotations
if not tf.gfile.IsDirectory(FLAGS.output_dir):
tf.gfile.MakeDirs(FLAGS.output_dir)
build_visualwakewords_data_lib.create_visual_wakeword_annotations(
FLAGS.train_annotations_file, visualwakewords_annotations_train,
small_object_area_threshold, foreground_class_of_interest,
visualwakewords_labels_filename)
build_visualwakewords_data_lib.create_visual_wakeword_annotations(
FLAGS.val_annotations_file, visualwakewords_annotations_val,
small_object_area_threshold, foreground_class_of_interest,
visualwakewords_labels_filename)
# Create the TF Records for Visual WakeWords Dataset
if not tf.gfile.IsDirectory(FLAGS.output_dir):
tf.gfile.MakeDirs(FLAGS.output_dir)
train_output_path = os.path.join(FLAGS.output_dir, 'train.record')
val_output_path = os.path.join(FLAGS.output_dir, 'val.record')
build_visualwakewords_data_lib.create_tf_record_for_visualwakewords_dataset(
visualwakewords_annotations_train,
FLAGS.train_image_dir,
train_output_path,
num_shards=100)
build_visualwakewords_data_lib.create_tf_record_for_visualwakewords_dataset(
visualwakewords_annotations_val,
FLAGS.val_image_dir,
val_output_path,
num_shards=10)
if __name__ == '__main__':
tf.app.run()
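For orientation (illustrative, not part of this commit): with the default flags the script writes `instances_visualwakewords_train2014.json`, `instances_visualwakewords_val2014.json`, `labels.txt`, and sharded TFRecords into `--output_dir`. A minimal sketch of the shard names it produces, reusing the format string from `dataset_utils.open_sharded_output_tfrecords` further below:

```python
from __future__ import print_function
import os

# Shard names follow the '{}-{:05d}-of-{:05d}' pattern used by
# dataset_utils.open_sharded_output_tfrecords.
output_dir = '/tmp/'  # default --output_dir; a placeholder here
for base, num_shards in [('train.record', 100), ('val.record', 10)]:
  base_path = os.path.join(output_dir, base)
  shards = ['{}-{:05d}-of-{:05d}'.format(base_path, idx, num_shards)
            for idx in range(num_shards)]
  print(shards[0], '...', shards[-1])
```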
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Generate Visual Wakewords Dataset.
Helper functions to generate the Visual WakeWords dataset. It filters raw
COCO annotations file to Visual WakeWords Dataset annotations.
The resulting annotations and COCO images are then
converted to TF records.
See build_visualwakewords_data.py for the sample usage.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import hashlib
import io
import json
import os
import contextlib2
import PIL.Image
import tensorflow as tf
from datasets import dataset_utils
tf.logging.set_verbosity(tf.logging.INFO)
def create_visual_wakeword_annotations(annotations_file,
visualwakewords_annotations_path,
small_object_area_threshold,
foreground_class_of_interest,
visualwakewords_labels_filename):
"""Generate visual wakewords annotations file.
Loads COCO annotation json files and filters to person/not-person
class (or user-specified class) to generate visual wakewords annotations file.
Each image is assigned a label 1 or 0. The label 1 is assigned as long
as it has at least one foreground_class_of_interest (e.g. person)
bounding box greater than 5% of the image area.
Args:
annotations_file: JSON file containing COCO bounding box annotations
visualwakewords_annotations_path: output path to annotations file
small_object_area_threshold: threshold on fraction of image area below which
small object bounding boxes are filtered
foreground_class_of_interest: category from COCO dataset that is filtered by
the visual wakewords dataset
visualwakewords_labels_filename: The filename to write the visual wakewords
label file
"""
# default object of interest is person
foreground_class_of_interest_id = 1
with tf.gfile.GFile(annotations_file, 'r') as fid:
groundtruth_data = json.load(fid)
images = groundtruth_data['images']
# Create category index
category_index = {}
for category in groundtruth_data['categories']:
if category['name'] == foreground_class_of_interest:
foreground_class_of_interest_id = category['id']
category_index[category['id']] = category
# Create annotations index
annotations_index = {}
annotations_index_filtered = {}
if 'annotations' in groundtruth_data:
tf.logging.info(
'Found groundtruth annotations. Building annotations index.')
for annotation in groundtruth_data['annotations']:
image_id = annotation['image_id']
if image_id not in annotations_index:
annotations_index[image_id] = []
annotations_index_filtered[image_id] = []
annotations_index[image_id].append(annotation)
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in annotations_index:
missing_annotation_count += 1
annotations_index[image_id] = []
annotations_index_filtered[image_id] = []
tf.logging.info('%d images are missing annotations.',
missing_annotation_count)
# Create filtered annotations index
for idx, image in enumerate(images):
if idx % 100 == 0:
tf.logging.info('On image %d of %d', idx, len(images))
annotations_list = annotations_index[image['id']]
annotations_list_filtered = _filter_annotations_list(
annotations_list, image, small_object_area_threshold,
foreground_class_of_interest_id)
annotations_index_filtered[image['id']].append(annotations_list_filtered)
# Output Visual WakeWords annotations and labels
labels_to_class_names = {0: 'background', 1: foreground_class_of_interest}
with open(visualwakewords_labels_filename, 'w') as fp:
for label in labels_to_class_names:
fp.write(str(label) + ':' + str(labels_to_class_names[label]) + '\n')
with open(visualwakewords_annotations_path, 'w') as fp:
json.dump(
{
'images': images,
'annotations': annotations_index_filtered,
'categories': category_index
}, fp)
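Illustrative sketch (not part of this commit): the labels file written above maps integer labels to class names and can be read back with the existing `dataset_utils.read_label_file`, assuming the default `--output_dir` of `/tmp/`:

```python
# Read back the labels file written by create_visual_wakeword_annotations.
from datasets import dataset_utils

labels_to_class_names = dataset_utils.read_label_file('/tmp/', 'labels.txt')
print(labels_to_class_names)
# With the default foreground class this prints {0: 'background', 1: 'person'}.
```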
def _filter_annotations_list(annotations_list, image,
small_object_area_threshold,
foreground_class_of_interest_id):
"""Filters COCO annotations_list to visual wakewords annotations_list.
Each image is assigned a label 1 or 0. The label 1 is assigned as long
as it has at least one foreground_class_of_interest (e.g. person)
bounding box greater than 5% of the image area.
Args:
annotations_list: list of dicts with keys: [ u'id', u'image_id',
u'category_id', u'segmentation', u'area', u'bbox' : [x,y,width,height],
u'iscrowd']. Notice that bounding box coordinates in the official COCO
dataset are given as [x, y, width, height] tuples using absolute
coordinates where x, y represent the top-left (0-indexed) corner.
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
small_object_area_threshold: threshold on fraction of image area below which
small objects are filtered
foreground_class_of_interest_id: category of COCO dataset which visual
wakewords filters
Returns:
filtered_annotations_list: list of dicts with keys: [ u'image_id',
u'label', u'category_id', u'count',
u'object':[{"category_id", "area", "bbox" : [x,y,width,height],}]
"""
category_ids = []
area = []
flag_small_object = []
num_ann = 0
image_height = image['height']
image_width = image['width']
image_area = image_height * image_width
bbox = []
# count of filtered object
count = 0
for object_annotations in annotations_list:
(x, y, width, height) = tuple(object_annotations['bbox'])
category_id = int(object_annotations['category_id'])
category_ids.append(category_id)
obj_area = object_annotations['area']
normalized_object_area = obj_area / image_area
# Filter small object bounding boxes
if category_id == foreground_class_of_interest_id:
if normalized_object_area < small_object_area_threshold:
flag_small_object.append(True)
else:
flag_small_object.append(False)
bbox.append({
u'bbox': [x, y, width, height],
u'area': obj_area,
u'category_id': category_id
})
count = count + 1
area.append(obj_area)
num_ann = num_ann + 1
# Filtered annotations_list with two classes corresponding to
# foreground_class_of_interest_id (e.g. person) and
# background (e.g. not-person)
if (foreground_class_of_interest_id in category_ids) and (
False in flag_small_object):
return {
u'image_id': image['id'],
u'label': 1,
u'object': bbox,
u'count': count
}
else:
return {u'image_id': image['id'], u'label': 0, u'object': [], u'count': 0}
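Illustrative sketch (not part of this commit) of how `_filter_annotations_list` assigns the image-level label; all values below are made up:

```python
from datasets.build_visualwakewords_data_lib import _filter_annotations_list

image = {'id': 42, 'height': 100, 'width': 100}  # image area = 10000
annotations_list = [
    # Person box covering 20% of the image: kept, so the image gets label 1.
    {'category_id': 1, 'area': 2000.0, 'bbox': [10, 10, 40, 50]},
    # Person box covering 0.2% of the image: flagged as a small object.
    {'category_id': 1, 'area': 20.0, 'bbox': [0, 0, 4, 5]},
]
filtered = _filter_annotations_list(
    annotations_list, image,
    small_object_area_threshold=0.005,
    foreground_class_of_interest_id=1)
# filtered == {'image_id': 42, 'label': 1, 'count': 1,
#              'object': [{'bbox': [10, 10, 40, 50], 'area': 2000.0,
#                          'category_id': 1}]}
```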
def create_tf_record_for_visualwakewords_dataset(annotations_file, image_dir,
output_path, num_shards):
"""Loads Visual WakeWords annotations/images and converts to tf.Record format.
Args:
annotations_file: JSON file containing bounding box annotations.
image_dir: Directory containing the image files.
output_path: Path to output tf.Record file.
num_shards: number of output file shards.
"""
with contextlib2.ExitStack() as tf_record_close_stack, \
tf.gfile.GFile(annotations_file, 'r') as fid:
output_tfrecords = dataset_utils.open_sharded_output_tfrecords(
tf_record_close_stack, output_path, num_shards)
groundtruth_data = json.load(fid)
images = groundtruth_data['images']
category_index = {}
for category in groundtruth_data['categories'].values():
# if not background class
if category['id'] != 0:
category_index[category['id']] = category
annotations_index = {}
if 'annotations' in groundtruth_data:
tf.logging.info(
'Found groundtruth annotations. Building annotations index.')
for annotation in groundtruth_data['annotations'].values():
image_id = annotation[0]['image_id']
if image_id not in annotations_index:
annotations_index[image_id] = []
annotations_index[image_id].append(annotation[0])
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in annotations_index:
missing_annotation_count += 1
annotations_index[image_id] = []
tf.logging.info('%d images are missing annotations.',
missing_annotation_count)
total_num_annotations_skipped = 0
for idx, image in enumerate(images):
if idx % 100 == 0:
tf.logging.info('On image %d of %d', idx, len(images))
annotations_list = annotations_index[image['id']]
_, tf_example, num_annotations_skipped = _create_tf_example(
image, annotations_list[0], image_dir)
total_num_annotations_skipped += num_annotations_skipped
shard_idx = idx % num_shards
output_tfrecords[shard_idx].write(tf_example.SerializeToString())
tf.logging.info('Finished writing, skipped %d annotations.',
total_num_annotations_skipped)
def _create_tf_example(image, annotations_list, image_dir):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
annotations_list:
list of dicts with keys: [u'image_id', u'bbox', u'label',
object[{"category_id", "area", "bbox" : [x,y,width,height],}]]. Notice
that bounding box coordinates in the COCO dataset are given as [x, y,
width, height] tuples using absolute coordinates where x, y represent
the top-left (0-indexed) corner. This function converts to the format
that can be used by the Tensorflow Object Detection API (which is [ymin,
xmin, ymax, xmax] with coordinates normalized relative to image size).
image_dir: directory containing the image files.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG
"""
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
full_path = os.path.join(image_dir, filename)
with tf.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
key = hashlib.sha256(encoded_jpg).hexdigest()
xmin = []
xmax = []
ymin = []
ymax = []
category_ids = []
area = []
num_annotations_skipped = 0
label = annotations_list['label']
for object_annotations in annotations_list['object']:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
xmin.append(float(x) / image_width)
xmax.append(float(x + width) / image_width)
ymin.append(float(y) / image_height)
ymax.append(float(y + height) / image_height)
category_id = int(object_annotations['category_id'])
category_ids.append(category_id)
area.append(object_annotations['area'])
feature_dict = {
'image/height':
dataset_utils.int64_feature(image_height),
'image/width':
dataset_utils.int64_feature(image_width),
'image/filename':
dataset_utils.bytes_feature(filename.encode('utf8')),
'image/source_id':
dataset_utils.bytes_feature(str(image_id).encode('utf8')),
'image/key/sha256':
dataset_utils.bytes_feature(key.encode('utf8')),
'image/encoded':
dataset_utils.bytes_feature(encoded_jpg),
'image/format':
dataset_utils.bytes_feature('jpeg'.encode('utf8')),
'image/class/label':
dataset_utils.int64_feature(label),
'image/object/bbox/xmin':
dataset_utils.float_list_feature(xmin),
'image/object/bbox/xmax':
dataset_utils.float_list_feature(xmax),
'image/object/bbox/ymin':
dataset_utils.float_list_feature(ymin),
'image/object/bbox/ymax':
dataset_utils.float_list_feature(ymax),
'image/object/class/label':
dataset_utils.int64_feature(label),
'image/object/area':
dataset_utils.float_list_feature(area),
}
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return key, example, num_annotations_skipped
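Illustrative sketch (not part of this commit): one way to spot-check a generated shard, assuming TF 1.x (matching the `tf.python_io` and `tf.gfile` APIs used above) and a placeholder shard path:

```python
from __future__ import print_function
import tensorflow as tf

record_path = '/tmp/train.record-00000-of-00100'  # placeholder shard path
for serialized in tf.python_io.tf_record_iterator(record_path):
  example = tf.train.Example()
  example.ParseFromString(serialized)
  features = example.features.feature
  print('label:', features['image/class/label'].int64_list.value[0])
  print('source_id:', features['image/source_id'].bytes_list.value[0])
  break
```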
...@@ -22,12 +22,14 @@ from datasets import cifar10
from datasets import flowers
from datasets import imagenet
from datasets import mnist
from datasets import visualwakewords

datasets_map = {
    'cifar10': cifar10,
    'flowers': flowers,
    'imagenet': imagenet,
    'mnist': mnist,
    'visualwakewords': visualwakewords,
}
......
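Illustrative sketch (not part of this commit): with the factory entry above, the new dataset is selected by name through the existing `dataset_factory.get_dataset` call; the `dataset_dir` below is a placeholder for wherever the TFRecords and `labels.txt` were written:

```python
from __future__ import print_function
from datasets import dataset_factory

dataset = dataset_factory.get_dataset(
    'visualwakewords', 'train', '/tmp/visualwakewords')
print(dataset.num_samples, dataset.num_classes)  # 82783 2
```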
...@@ -41,6 +41,30 @@ def int64_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
def bytes_list_feature(values):
"""Returns a TF-Feature of list of bytes.
Args:
values: A string or list of strings.
Returns:
A TF-Feature.
"""
return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
def float_list_feature(values):
"""Returns a TF-Feature of list of floats.
Args:
values: A float or list of floats.
Returns:
A TF-Feature.
"""
return tf.train.Feature(float_list=tf.train.FloatList(value=values))
def bytes_feature(values):
  """Returns a TF-Feature of bytes.
...@@ -148,3 +172,28 @@ def read_label_file(dataset_dir, filename=LABELS_FILENAME):
    index = line.index(':')
    labels_to_class_names[int(line[:index])] = line[index+1:]
  return labels_to_class_names
def open_sharded_output_tfrecords(exit_stack, base_path, num_shards):
"""Opens all TFRecord shards for writing and adds them to an exit stack.
Args:
exit_stack: A context2.ExitStack used to automatically closed the TFRecords
opened in this function.
base_path: The base path for all shards
num_shards: The number of shards
Returns:
The list of opened TFRecords. Position k in the list corresponds to shard k.
"""
tf_record_output_filenames = [
'{}-{:05d}-of-{:05d}'.format(base_path, idx, num_shards)
for idx in range(num_shards)
]
tfrecords = [
exit_stack.enter_context(tf.python_io.TFRecordWriter(file_name))
for file_name in tf_record_output_filenames
]
return tfrecords
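Illustrative usage (not part of this commit) of `open_sharded_output_tfrecords`, mirroring how `build_visualwakewords_data_lib` uses it above:

```python
import contextlib2
import tensorflow as tf

from datasets import dataset_utils

with contextlib2.ExitStack() as tf_record_close_stack:
  writers = dataset_utils.open_sharded_output_tfrecords(
      tf_record_close_stack, '/tmp/example.record', num_shards=2)
  empty_example = tf.train.Example()  # empty Example, for illustration only
  for idx in range(10):
    writers[idx % 2].write(empty_example.SerializeToString())
# Both shard writers are closed when the ExitStack exits.
```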
...@@ -136,7 +136,7 @@ def _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir):
            sys.stdout.flush()

            # Read the filename:
-            image_data = tf.gfile.FastGFile(filenames[i], 'rb').read()
+            image_data = tf.gfile.GFile(filenames[i], 'rb').read()
            height, width = image_reader.read_image_dims(sess, image_data)

            class_name = os.path.basename(os.path.dirname(filenames[i]))
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Script to download the COCO dataset. See
# http://cocodataset.org/#overview for an overview of the dataset.
#
# usage:
# bash datasets/download_mscoco.sh path-to-COCO-dataset
#
set -e
if [ -z "$1" ]; then
echo "usage download_mscoco.sh [data dir]"
exit
fi
if [ "$(uname)" == "Darwin" ]; then
UNZIP="tar -xf"
else
UNZIP="unzip -nq"
fi
# Create the output directories.
OUTPUT_DIR="${1%/}"
SCRATCH_DIR="${OUTPUT_DIR}/raw-data"
mkdir -p "${OUTPUT_DIR}"
mkdir -p "${SCRATCH_DIR}"
CURRENT_DIR=$(pwd)
# Helper function to download and unpack a .zip file.
function download_and_unzip() {
local BASE_URL=${1}
local FILENAME=${2}
if [ ! -f ${FILENAME} ]; then
echo "Downloading ${FILENAME} to $(pwd)"
wget -nd -c "${BASE_URL}/${FILENAME}"
else
echo "Skipping download of ${FILENAME}"
fi
echo "Unzipping ${FILENAME}"
${UNZIP} ${FILENAME}
}
cd ${SCRATCH_DIR}
# Download the images.
BASE_IMAGE_URL="http://images.cocodataset.org/zips"
TRAIN_IMAGE_FILE="train2014.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE}
TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2014"
VAL_IMAGE_FILE="val2014.zip"
download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE}
VAL_IMAGE_DIR="${SCRATCH_DIR}/val2014"
# Download the annotations.
BASE_INSTANCES_URL="http://images.cocodataset.org/annotations"
INSTANCES_FILE="annotations_trainval2014.zip"
download_and_unzip ${BASE_INSTANCES_URL} ${INSTANCES_FILE}
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides data for Visual WakeWords Dataset with images+labels.
Visual WakeWords Dataset derives from the COCO dataset to design tiny models
classifying two classes, such as person/not-person. The COCO annotations
are filtered to two classes: person and not-person (or another user-defined
category). Bounding boxes for small objects with area less than 5% of the image
area are filtered out.
See build_visualwakewords_data.py which generates the Visual WakeWords dataset
annotations from the raw COCO dataset and converts them to TFRecord.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
from datasets import dataset_utils
slim = tf.contrib.slim
_FILE_PATTERN = '%s.record-*'
_SPLITS_TO_SIZES = {
'train': 82783,
'validation': 40504,
}
_ITEMS_TO_DESCRIPTIONS = {
'image': 'A color image of varying height and width.',
'label': 'The label id of the image, an integer in {0, 1}',
'object/bbox': 'A list of bounding boxes.',
'object/label': 'A list of labels, all objects belong to the same class.',
}
_NUM_CLASSES = 2
# labels file
LABELS_FILENAME = 'labels.txt'
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
"""Gets a dataset tuple with instructions for reading ImageNet.
Args:
split_name: A train/test split name.
dataset_dir: The base directory of the dataset sources.
file_pattern: The file pattern to use when matching the dataset sources. It
is assumed that the pattern contains a '%s' string so that the split name
can be inserted.
reader: The TensorFlow reader type.
Returns:
A `Dataset` namedtuple.
Raises:
ValueError: if `split_name` is not a valid train/test split.
"""
if split_name not in _SPLITS_TO_SIZES:
raise ValueError('split name %s was not recognized.' % split_name)
if not file_pattern:
file_pattern = _FILE_PATTERN
file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
# Allowing None in the signature so that dataset_factory can use the default.
if reader is None:
reader = tf.TFRecordReader
keys_to_features = {
'image/encoded':
tf.FixedLenFeature((), tf.string, default_value=''),
'image/format':
tf.FixedLenFeature((), tf.string, default_value='jpeg'),
'image/class/label':
tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
'image/object/bbox/xmin':
tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin':
tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax':
tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax':
tf.VarLenFeature(dtype=tf.float32),
'image/object/class/label':
tf.VarLenFeature(dtype=tf.int64),
}
items_to_handlers = {
'image':
slim.tfexample_decoder.Image('image/encoded', 'image/format'),
'label':
slim.tfexample_decoder.Tensor('image/class/label'),
'object/bbox':
slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
'image/object/bbox/'),
'object/label':
slim.tfexample_decoder.Tensor('image/object/class/label'),
}
decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
items_to_handlers)
labels_to_names = None
labels_file = os.path.join(dataset_dir, LABELS_FILENAME)
if tf.gfile.Exists(labels_file):
labels_to_names = dataset_utils.read_label_file(dataset_dir)
return slim.dataset.Dataset(
data_sources=file_pattern,
reader=reader,
decoder=decoder,
num_samples=_SPLITS_TO_SIZES[split_name],
items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
num_classes=_NUM_CLASSES,
labels_to_names=labels_to_names)
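Illustrative sketch (not part of this commit): reading images and labels through this descriptor with TF-Slim's `DatasetDataProvider`, assuming TF 1.x with `tf.contrib.slim` and a placeholder `dataset_dir`:

```python
import tensorflow as tf

from datasets import visualwakewords

slim = tf.contrib.slim

dataset = visualwakewords.get_split('train', '/tmp/visualwakewords')
provider = slim.dataset_data_provider.DatasetDataProvider(
    dataset, num_readers=4, shuffle=True)
image, label = provider.get(['image', 'label'])
```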
...@@ -171,6 +171,7 @@ def expanded_conv(input_tensor,
                  project_activation_fn=tf.identity,
                  split_projection=1,
                  split_expansion=1,
                  split_divisible_by=8,
                  expansion_transform=None,
                  depthwise_location='expansion',
                  depthwise_channel_multiplier=1,
...@@ -202,6 +203,7 @@ def expanded_conv(input_tensor,
    split_expansion: how many ways to split expansion op
      (that is conv bottleneck->expansion) ops will keep depth divisible
      by this value.
    split_divisible_by: make sure every split group is divisible by this number.
    expansion_transform: Optional function that takes expansion
      as a single input and returns output.
    depthwise_location: where to put depthwise covnvolutions supported
...@@ -268,6 +270,7 @@ def expanded_conv(input_tensor,
          inner_size,
          num_ways=split_expansion,
          scope='expand',
          divisible_by=split_divisible_by,
          stride=1,
          normalizer_fn=normalizer_fn)
      net = tf.identity(net, 'expansion_output')
...@@ -292,6 +295,7 @@ def expanded_conv(input_tensor,
          num_ways=split_projection,
          stride=1,
          scope='project',
          divisible_by=split_divisible_by,
          normalizer_fn=normalizer_fn,
          activation_fn=project_activation_fn)
      if endpoints is not None:
......
...@@ -110,7 +110,7 @@ _Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])
def op(opfunc, multiplier_func=depth_multiplier, **params):
-  multiplier = params.pop('multiplier_transorm', multiplier_func)
+  multiplier = params.pop('multiplier_transform', multiplier_func)
  return _Op(opfunc, params=params, multiplier_func=multiplier)
......
...@@ -105,8 +105,7 @@ def mobilenet(input_tensor,
    input_tensor: The input tensor
    num_classes: number of classes
    depth_multiplier: The multiplier applied to scale number of
-      channels in each layer. Note: this is called depth multiplier in the
-      paper but the name is kept for consistency with slim's model builder.
+      channels in each layer.
    scope: Scope of the operator
    conv_defs: Allows to override default conv def.
    finegrain_classification_mode: When set to True, the model
......
...@@ -153,10 +153,14 @@ def distorted_bounding_box_crop(image,
    return cropped_image, distort_bbox


-def preprocess_for_train(image, height, width, bbox,
+def preprocess_for_train(image,
+                         height,
+                         width,
+                         bbox,
                         fast_mode=True,
                         scope=None,
-                         add_image_summaries=True):
+                         add_image_summaries=True,
+                         random_crop=True):
  """Distort one image for training a network.

  Distorting images provides a useful technique for augmenting the data
...@@ -180,6 +184,8 @@ def preprocess_for_train(image, height, width, bbox,
      bi-cubic resizing, random_hue or random_contrast).
    scope: Optional scope for name_scope.
    add_image_summaries: Enable image summaries.
+    random_crop: Enable random cropping of images during preprocessing for
+      training.

  Returns:
    3-D float Tensor of distorted image used for training with range [-1, 1].
  """
...@@ -197,15 +203,18 @@ def preprocess_for_train(image, height, width, bbox,
    if add_image_summaries:
      tf.summary.image('image_with_bounding_boxes', image_with_box)

-    distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox)
-    # Restore the shape since the dynamic slice based upon the bbox_size loses
-    # the third dimension.
-    distorted_image.set_shape([None, None, 3])
-    image_with_distorted_box = tf.image.draw_bounding_boxes(
-        tf.expand_dims(image, 0), distorted_bbox)
-    if add_image_summaries:
-      tf.summary.image('images_with_distorted_bounding_box',
-                       image_with_distorted_box)
+    if not random_crop:
+      distorted_image = image
+    else:
+      distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox)
+      # Restore the shape since the dynamic slice based upon the bbox_size loses
+      # the third dimension.
+      distorted_image.set_shape([None, None, 3])
+      image_with_distorted_box = tf.image.draw_bounding_boxes(
+          tf.expand_dims(image, 0), distorted_bbox)
+      if add_image_summaries:
+        tf.summary.image('images_with_distorted_bounding_box',
+                         image_with_distorted_box)

    # This resizing operation may distort the images because the aspect
    # ratio is not respected. We select a resize method in a round robin
...@@ -220,7 +229,7 @@ def preprocess_for_train(image, height, width, bbox,
        num_cases=num_resize_cases)

    if add_image_summaries:
-      tf.summary.image('cropped_resized_image',
+      tf.summary.image(('cropped_' if random_crop else '') + 'resized_image',
                       tf.expand_dims(distorted_image, 0))

    # Randomly flip the image horizontally.
...@@ -241,8 +250,12 @@ def preprocess_for_train(image, height, width, bbox,
    return distorted_image


-def preprocess_for_eval(image, height, width,
-                        central_fraction=0.875, scope=None):
+def preprocess_for_eval(image,
+                        height,
+                        width,
+                        central_fraction=0.875,
+                        scope=None,
+                        central_crop=True):
  """Prepare one image for evaluation.

  If height and width are specified it would output an image with that size by
...@@ -260,6 +273,8 @@ def preprocess_for_eval(image, height, width,
    width: integer
    central_fraction: Optional Float, fraction of the image to crop.
    scope: Optional scope for name_scope.
+    central_crop: Enable central cropping of images during preprocessing for
+      evaluation.

  Returns:
    3-D float Tensor of prepared image.
  """
...@@ -268,7 +283,7 @@ def preprocess_for_eval(image, height, width,
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    # Crop the central region of the image with an area containing 87.5% of
    # the original image.
-    if central_fraction:
+    if central_crop and central_fraction:
      image = tf.image.central_crop(image, central_fraction=central_fraction)

    if height and width:
...@@ -282,11 +297,14 @@ def preprocess_for_eval(image, height, width,
    return image


-def preprocess_image(image, height, width,
+def preprocess_image(image,
+                     height,
+                     width,
                     is_training=False,
                     bbox=None,
                     fast_mode=True,
-                     add_image_summaries=True):
+                     add_image_summaries=True,
+                     crop_image=True):
  """Pre-process one image for training or evaluation.

  Args:
...@@ -304,6 +322,8 @@ def preprocess_image(image, height, width,
      [ymin, xmin, ymax, xmax].
    fast_mode: Optional boolean, if True avoids slower transformations.
    add_image_summaries: Enable image summaries.
+    crop_image: Whether to enable cropping of images during preprocessing for
+      both training and evaluation.

  Returns:
    3-D float Tensor containing an appropriately scaled image
...@@ -312,7 +332,13 @@ def preprocess_image(image, height, width,
    ValueError: if user does not provide bounding box
  """
  if is_training:
-    return preprocess_for_train(image, height, width, bbox, fast_mode,
-                                add_image_summaries=add_image_summaries)
+    return preprocess_for_train(
+        image,
+        height,
+        width,
+        bbox,
+        fast_mode,
+        add_image_summaries=add_image_summaries,
+        random_crop=crop_image)
  else:
-    return preprocess_for_eval(image, height, width)
+    return preprocess_for_eval(image, height, width, central_crop=crop_image)
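Illustrative sketch (not part of this commit) of the new cropping switch: `crop_image=False` disables random cropping during training and the central crop during evaluation, as wired through `preprocess_image` above; the input tensor below is just a placeholder:

```python
import tensorflow as tf

from preprocessing import inception_preprocessing

raw_image = tf.zeros([300, 300, 3], dtype=tf.uint8)  # stand-in for a decoded JPEG
processed = inception_preprocessing.preprocess_image(
    raw_image, height=224, width=224, is_training=False, crop_image=False)
```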