Unverified Commit 0cceabfc authored by Yiming Shi, committed by GitHub

Merge branch 'master' into move_to_keraslayers_fasterrcnn_fpn_keras_feature_extractor

parents 17821c0d 39ee0ac9
......@@ -59,6 +59,14 @@ class ComputeBleuTest(tf.test.TestCase):
tokenized = compute_bleu.bleu_tokenize(s)
self.assertEqual(["Test0", ",", "1", "two", ",", "3"], tokenized)
def test_bleu_list(self):
ref = ["test 1 two 3", "more tests!"]
hyp = ["test 1 two 3", "More tests!"]
uncased_score = compute_bleu.bleu_on_list(ref, hyp, False)
cased_score = compute_bleu.bleu_on_list(ref, hyp, True)
self.assertEqual(uncased_score, 100)
self.assertLess(cased_score, 100)
if __name__ == "__main__":
tf.test.main()
......@@ -23,7 +23,7 @@ import random
import tarfile
# pylint: disable=g-bad-import-order
from absl import app as absl_app
from absl import app
from absl import flags
from absl import logging
import six
......@@ -436,4 +436,4 @@ if __name__ == "__main__":
logging.set_verbosity(logging.INFO)
define_data_download_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
app.run(main)
......@@ -43,6 +43,7 @@ class EmbeddingSharedWeights(tf.keras.layers.Layer):
self.shared_weights = self.add_weight(
"weights",
shape=[self.vocab_size, self.hidden_size],
dtype=tf.float32,
initializer=tf.random_normal_initializer(
mean=0., stddev=self.hidden_size**-0.5))
super(EmbeddingSharedWeights, self).build(input_shape)
......
......@@ -18,9 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
K = tf.keras.backend
class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
......@@ -66,72 +64,3 @@ class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
'hidden_size': self.hidden_size,
'warmup_steps': self.warmup_steps,
}
class LearningRateFn(object):
"""Creates learning rate function."""
def __init__(self, learning_rate, hidden_size, warmup_steps):
self.learning_rate = learning_rate
self.hidden_size = hidden_size
self.warmup_steps = float(warmup_steps)
def __call__(self, global_step):
"""Calculate learning rate with linear warmup and rsqrt decay."""
step = float(global_step)
learning_rate = self.learning_rate
learning_rate *= (self.hidden_size ** -0.5)
# Apply linear warmup
learning_rate *= np.minimum(1.0, step / self.warmup_steps)
# Apply rsqrt decay
learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps))
return learning_rate
class LearningRateScheduler(tf.keras.callbacks.Callback):
"""Keras callback to schedule learning rate.
TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in
official/resnet/keras/keras_common.py.
"""
def __init__(self, schedule, init_steps=None, verbose=False):
super(LearningRateScheduler, self).__init__()
self.schedule = schedule
self.verbose = verbose
if init_steps is None:
init_steps = 0.0
self.steps = float(init_steps) # Total steps during training.
def on_epoch_begin(self, epoch, logs=None):
if not hasattr(self.model.optimizer, 'lr'):
raise ValueError('Optimizer must have a "lr" attribute.')
if not hasattr(self.model.optimizer, 'iterations'):
raise ValueError('Optimizer must have a "iterations" attribute.')
def on_train_batch_begin(self, batch, logs=None):
"""Adjusts learning rate for each train batch."""
if self.verbose > 0:
iterations = K.get_value(self.model.optimizer.iterations)
print('Original iteration %d' % iterations)
self.steps += 1.0
try: # new API
lr = float(K.get_value(self.model.optimizer.lr))
lr = self.schedule(self.steps, lr)
except TypeError: # Support for old API for backward compatibility
lr = self.schedule(self.steps)
if not isinstance(lr, (float, np.float32, np.float64)):
raise ValueError('The output of the "schedule" function '
'should be float.')
K.set_value(self.model.optimizer.lr, lr)
K.set_value(self.model.optimizer.iterations, self.steps)
if self.verbose > 0:
print('Batch %05d Step %05d: LearningRateScheduler setting learning '
'rate to %s.' % (batch + 1, self.steps, lr))
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
logs['lr'] = K.get_value(self.model.optimizer.lr)
logs['steps'] = self.steps
......@@ -23,8 +23,8 @@ from __future__ import print_function
import tensorflow as tf
from official.nlp.modeling.layers import position_embedding
from official.nlp.modeling.ops import beam_search
from official.nlp.transformer import attention_layer
from official.nlp.transformer import beam_search
from official.nlp.transformer import embedding_layer
from official.nlp.transformer import ffn_layer
from official.nlp.transformer import metrics
......@@ -52,7 +52,6 @@ def create_model(params, is_train):
logits = tf.keras.layers.Lambda(lambda x: x, name="logits",
dtype=tf.float32)(logits)
model = tf.keras.Model([inputs, targets], logits)
# TODO(reedwm): Can we do this loss in float16 instead of float32?
loss = metrics.transformer_loss(
logits, targets, label_smoothing, vocab_size)
model.add_loss(loss)
......@@ -238,7 +237,6 @@ class Transformer(tf.keras.Model):
decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
max_decode_length, dtype=self.params["dtype"])
# TODO(b/139770046): Refactor code with better naming of i.
def symbols_to_logits_fn(ids, i, cache):
"""Generate logits for next potential IDs.
......
......@@ -241,14 +241,13 @@ class TransformerTask(object):
if params["use_ctl"]:
train_ds_iterator = iter(train_ds)
callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)
callbacks = self._create_callbacks(flags_obj.model_dir, params)
# Only TimeHistory callback is supported for CTL
if params["use_ctl"]:
callbacks = [cb for cb in callbacks
if isinstance(cb, keras_utils.TimeHistory)]
# TODO(b/139418525): Refactor the custom training loop logic.
@tf.function
def train_steps(iterator, steps):
"""Training steps function for TPU runs.
......@@ -408,14 +407,9 @@ class TransformerTask(object):
for i in range(length):
translate.translate_from_input(val_outputs[i], subtokenizer)
def _create_callbacks(self, cur_log_dir, init_steps, params):
def _create_callbacks(self, cur_log_dir, params):
"""Creates a list of callbacks."""
sfunc = optimizer.LearningRateFn(params["learning_rate"],
params["hidden_size"],
params["learning_rate_warmup_steps"])
scheduler_callback = optimizer.LearningRateScheduler(sfunc, init_steps)
callbacks = misc.get_callbacks()
callbacks.append(scheduler_callback)
if params["enable_checkpointing"]:
ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
callbacks.append(
......@@ -427,8 +421,6 @@ class TransformerTask(object):
"""Loads model weights when it is provided."""
if init_weight_path:
logging.info("Load weights: {}".format(init_weight_path))
# TODO(b/139414977): Having the same variable restoring method for both
# TPU and GPU.
if self.use_tpu:
checkpoint = tf.train.Checkpoint(
model=model, optimizer=self._create_optimizer())
......@@ -445,7 +437,7 @@ class TransformerTask(object):
params["learning_rate"], params["hidden_size"],
params["learning_rate_warmup_steps"])
opt = tf.keras.optimizers.Adam(
lr_schedule if self.use_tpu else params["learning_rate"],
lr_schedule,
params["optimizer_adam_beta1"],
params["optimizer_adam_beta2"],
epsilon=params["optimizer_adam_epsilon"])
......
......@@ -181,7 +181,7 @@ def translate_file(model,
raise ValueError("File output is a directory, will not save outputs to "
"file.")
logging.info("Writing to file %s", output_file)
with tf.compat.v1.gfile.Open(output_file, "w") as f:
with tf.io.gfile.GFile(output_file, "w") as f:
for i in sorted_keys:
f.write("%s\n" % translations[i])
......
......@@ -67,7 +67,7 @@ def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
# Calculate smoothing cross entropy
with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
confidence = 1.0 - smoothing
low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
soft_targets = tf.one_hot(
tf.cast(labels, tf.int32),
depth=vocab_size,
......@@ -79,11 +79,11 @@ def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
# Calculate the best (lowest) possible value of cross entropy, and
# subtract from the cross entropy loss.
normalizing_constant = -(
confidence * tf.log(confidence) + tf.to_float(vocab_size - 1) *
low_confidence * tf.log(low_confidence + 1e-20))
confidence * tf.log(confidence) + tf.cast(vocab_size - 1, tf.float32)
* low_confidence * tf.log(low_confidence + 1e-20))
xentropy -= normalizing_constant
weights = tf.to_float(tf.not_equal(labels, 0))
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
return xentropy * weights, weights
......@@ -142,24 +142,24 @@ def padded_accuracy(logits, labels):
"""Percentage of times that predictions matches labels on non-0s."""
with tf.variable_scope("padded_accuracy", values=[logits, labels]):
logits, labels = _pad_tensors_to_same_length(logits, labels)
weights = tf.to_float(tf.not_equal(labels, 0))
outputs = tf.to_int32(tf.argmax(logits, axis=-1))
padded_labels = tf.to_int32(labels)
return tf.to_float(tf.equal(outputs, padded_labels)), weights
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
padded_labels = tf.cast(labels, tf.int32)
return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights
def padded_accuracy_topk(logits, labels, k):
"""Percentage of times that top-k predictions matches labels on non-0s."""
with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
logits, labels = _pad_tensors_to_same_length(logits, labels)
weights = tf.to_float(tf.not_equal(labels, 0))
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
effective_k = tf.minimum(k, tf.shape(logits)[-1])
_, outputs = tf.nn.top_k(logits, k=effective_k)
outputs = tf.to_int32(outputs)
padded_labels = tf.to_int32(labels)
outputs = tf.cast(outputs, tf.int32)
padded_labels = tf.cast(labels, tf.int32)
padded_labels = tf.expand_dims(padded_labels, axis=-1)
padded_labels += tf.zeros_like(outputs) # Pad to same shape.
same = tf.to_float(tf.equal(outputs, padded_labels))
same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
same_topk = tf.reduce_sum(same, axis=-1)
return same_topk, weights
......@@ -172,10 +172,11 @@ def padded_sequence_accuracy(logits, labels):
"""Percentage of times that predictions matches labels everywhere (non-0)."""
with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
logits, labels = _pad_tensors_to_same_length(logits, labels)
weights = tf.to_float(tf.not_equal(labels, 0))
outputs = tf.to_int32(tf.argmax(logits, axis=-1))
padded_labels = tf.to_int32(labels)
not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
padded_labels = tf.cast(labels, tf.int32)
not_correct = (tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) *
weights)
axis = list(range(1, len(outputs.get_shape())))
correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
return correct_seq, tf.constant(1.0)
......@@ -201,7 +202,7 @@ def bleu_score(logits, labels):
Returns:
bleu: int, approx bleu score
"""
predictions = tf.to_int32(tf.argmax(logits, axis=-1))
predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
# TODO: Look into removing use of py_func
bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
return bleu, tf.constant(1.0)
......@@ -306,7 +307,7 @@ def rouge_2_fscore(logits, labels):
Returns:
rouge2_fscore: approx rouge-2 f1 score.
"""
predictions = tf.to_int32(tf.argmax(logits, axis=-1))
predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
# TODO: Look into removing use of py_func
rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
return rouge_2_f_score, tf.constant(1.0)
......@@ -383,7 +384,7 @@ def rouge_l_fscore(predictions, labels):
Returns:
rouge_l_fscore: approx rouge-l f1 score.
"""
outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
outputs = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
tf.float32)
return rouge_l_f_score, tf.constant(1.0)
......
......@@ -14,32 +14,14 @@
# ==============================================================================
"""Keras layers of XLNet model in TF 2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import copy
import numpy as np
import tensorflow as tf
from official.nlp.xlnet import data_utils
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
return tf.keras.activations.gelu(x, approximate=True)
def rel_shift(x, klen=-1):
......@@ -55,7 +37,7 @@ def rel_shift(x, klen=-1):
def _get_initializer(flags):
"""Get variable intializer."""
"""Get variable initializer."""
if flags.init_method == 'uniform':
initializer = tf.keras.initializers.RandomUniform(
minval=-flags.init_range, maxval=flags.init_range)
......
......@@ -45,6 +45,9 @@ def _get_requirements():
os.path.join(os.path.dirname(__file__), '../requirements.txt'), 'r') as f:
for line in f:
package_name = line.strip()
# Skip empty lines or comments starting with "#".
if not package_name or package_name[0] == '#':
continue
if package_name.startswith('-e '):
dependency_links_tmp.append(package_name[3:].strip())
else:
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Legacy Models
The **r1** folder contains legacy model implementations developed
using TensorFlow 1.x.
**Note: We will remove this r1 folder from the master branch in June 2020.**
After removal, you will still be able to access the legacy models
in previous releases
(e.g., [v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0)).
| Model | Description | Reference |
| ----- | ----------- | --------- |
| [Gradient Boosted Trees](boosted_trees) | A gradient boosted trees model to classify the Higgs boson process in the HIGGS dataset | [Link](https://en.wikipedia.org/wiki/Gradient_boosting) |
| [MNIST](mnist) | A basic model to classify digits from the MNIST dataset | [Link](http://yann.lecun.com/exdb/mnist/) |
| [NCF](ncf) | NCF Estimator implementation | [arXiv:1708.05031](https://arxiv.org/abs/1708.05031) |
| [ResNet](resnet) | A deep residual network for image recognition | [arXiv:1512.03385](https://arxiv.org/abs/1512.03385) |
| [Transformer](transformer) | A transformer model to translate the WMT English to German dataset | [arXiv:1706.03762](https://arxiv.org/abs/1706.03762) |
| [Wide & Deep Learning](wide_deep) | A model that combines a wide linear model and deep neural network for recommender systems | [arXiv:1606.07792](https://arxiv.org/abs/1606.07792) |
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Classifying Higgs boson processes in the HIGGS Data Set
## Overview
The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) contains 11 million samples with 28 features each. The task is binary classification: distinguish a signal process that produces Higgs bosons from a background process that does not.
We use the Gradient Boosted Trees algorithm to distinguish the two classes.
---
The code sample uses the high-level `tf.estimator.Estimator` and `tf.data.Dataset` APIs. These APIs are great for fast iteration and for quickly adapting models to your own datasets without major code overhauls. They let you move from single-worker training to distributed training, and make it easy to export model binaries for prediction. Here, for further simplicity and faster execution, we use the utility function `tf.contrib.estimator.boosted_trees_classifier_train_in_memory`, which is especially effective when the input is provided as an in-memory dataset such as a numpy array.
An input function for the `Estimator` typically uses the `tf.data.Dataset` API, which can handle streaming, batching, transformation, and shuffling. However, the `boosted_trees_classifier_train_in_memory()` utility function requires that the entire data be provided as a single batch (i.e., without using the `batch()` API). Thus this example simply uses `Dataset.from_tensors()` to convert the numpy arrays into structured tensors, and `Dataset.zip()` to put features and labels together.
For more details on `Dataset`, [read the guide](https://www.tensorflow.org/guide/datasets).
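As a minimal sketch of this in-memory pattern (the feature name `f0` and the array shapes are illustrative; the real implementation lives in `train_higgs.py`):

```python
import numpy as np
import tensorflow as tf


def make_in_memory_input_fn(features_np, label_np):
  """Wraps numpy arrays as a single-batch tf.data.Dataset."""
  def input_fn():
    # from_tensors() keeps each array as one element (one big batch), which
    # is what boosted_trees_classifier_train_in_memory expects; note the
    # deliberate absence of any batch() call.
    features = tf.data.Dataset.from_tensors({"f0": features_np})
    labels = tf.data.Dataset.from_tensors(label_np)
    return tf.data.Dataset.zip((features, labels))
  return input_fn


# Example usage with random stand-in data:
input_fn = make_in_memory_input_fn(
    np.random.rand(100, 28).astype(np.float32),
    np.random.randint(0, 2, size=(100, 1)))
```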
## Running the code
First make sure you've [added the models folder to your Python path](/official/#running-the-models); otherwise you may encounter an error like `ImportError: No module named official.boosted_trees`.
### Setup
The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) that this sample uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). We have provided a script that downloads and cleans the necessary files.
```
python data_download.py
```
This will download the data and store the processed file under the directory designated by `--data_dir` (defaults to `/tmp/higgs_data/`). The directory can also be a network storage path that TensorFlow supports (such as Google Cloud Storage, `gs://<bucket>/<path>/`).
The file downloaded to the local temporary folder is about 2.8 GB, and the processed file is about 0.8 GB, so make sure there is enough storage to handle them.
### Training
This example uses about 3 GB of RAM during training.
You can run the code locally as follows:
```
python train_higgs.py
```
The model is saved to `/tmp/higgs_model` by default, which can be changed using the `--model_dir` flag.
Note that the model_dir is cleaned up every time before training starts.
Model parameters can be adjusted by flags such as `--n_trees`, `--max_depth`, and `--learning_rate`. Check out the code for details.
When trained with the default parameters, the final accuracy will be around 74% and the loss around 0.516 over the eval set.
By default, the first 1 million of the 11 million examples are used for training, and the last 1 million examples are used for evaluation.
The training/evaluation data can be selected as index ranges by the flags `--train_start`, `--train_count`, `--eval_start`, and `--eval_count`.
### TensorBoard
Run TensorBoard to inspect the details about the graph and training progression.
```
tensorboard --logdir=/tmp/higgs_model # set logdir as --model_dir set during training.
```
## Inference with SavedModel
You can export the model into TensorFlow [SavedModel](https://www.tensorflow.org/guide/saved_model) format by using the argument `--export_dir`:
```
python train_higgs.py --export_dir /tmp/higgs_boosted_trees_saved_model
```
After the model finishes training, use [`saved_model_cli`](https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel) to inspect and execute the SavedModel.
Try the following commands to inspect the SavedModel:
**Replace `${TIMESTAMP}` with the folder produced (e.g. 1524249124)**
```
# List possible tag_sets. Only one metagraph is saved, so there will be one option.
saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/
# Show SignatureDefs for tag_set=serve. SignatureDefs define the outputs to show.
saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
--tag_set serve --all
```
### Inference
Let's use the model to classify two examples.
Note that this model exports a SavedModel with a custom parsing module that accepts csv lines as features. (Each line is an example with 28 columns; be careful not to add a label column, unlike in the training data.)
```
saved_model_cli run --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
--tag_set serve --signature_def="predict" \
--input_exprs='inputs=["0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,1.138904,-1.578198,-1.046985,0.0,0.657930,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678", "1.595839,-0.607811,0.007075,1.818450,-0.111906,0.847550,-0.566437,1.581239,2.173076,0.755421,0.643110,1.426367,0.0,0.921661,-1.190432,-1.615589,0.0,0.651114,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818"]'
```
This will print out the predicted classes and class probabilities, something like:
```
Result for output key class_ids:
[[1]
[0]]
Result for output key classes:
[['1']
['0']]
Result for output key logistic:
[[0.6440273 ]
[0.10902369]]
Result for output key logits:
[[ 0.59288704]
[-2.1007526 ]]
Result for output key probabilities:
[[0.3559727 0.6440273]
[0.8909763 0.1090237]]
```
Note that the "predict" signature_def gives different (more detailed) results than "classification" or "serving_default".
## Additional Links
If you are interested in distributed training, take a look at [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
You can also [train models on Cloud ML Engine](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction), which provides [hyperparameter tuning](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#hyperparameter_tuning) to maximize your model's results and enables [deploying your model for prediction](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#deploy_a_model_to_support_prediction).
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads the UCI HIGGS Dataset and prepares train data.
The details on the dataset are in https://archive.ics.uci.edu/ml/datasets/HIGGS
It takes a while, as it needs to download 2.8 GB over the network, process the
data, and then store it into the specified location as a compressed numpy file.
Usage:
$ python data_download.py --data_dir=/tmp/higgs_data
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import os
import tempfile
# pylint: disable=g-bad-import-order
import numpy as np
import pandas as pd
from six.moves import urllib
from absl import app as absl_app
from absl import flags
import tensorflow as tf
from official.utils.flags import core as flags_core
URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
INPUT_FILE = "HIGGS.csv.gz"
NPZ_FILE = "HIGGS.csv.gz.npz" # numpy compressed file to contain "data" array.
def _download_higgs_data_and_save_npz(data_dir):
"""Download higgs data and store as a numpy compressed file."""
input_url = URL_ROOT + "/" + INPUT_FILE
np_filename = os.path.join(data_dir, NPZ_FILE)
if tf.gfile.Exists(np_filename):
raise ValueError("data_dir already has the processed data file: {}".format(
np_filename))
if not tf.gfile.Exists(data_dir):
tf.gfile.MkDir(data_dir)
# 2.8 GB to download.
try:
tf.logging.info("Data downloading...")
temp_filename, _ = urllib.request.urlretrieve(input_url)
# Reading and parsing 11 million csv lines takes 2~3 minutes.
tf.logging.info("Data processing... taking multiple minutes...")
with gzip.open(temp_filename, "rb") as csv_file:
data = pd.read_csv(
csv_file,
dtype=np.float32,
names=["c%02d" % i for i in range(29)] # label + 28 features.
).as_matrix()  # Note: .as_matrix() was removed in pandas 1.0; use .to_numpy() on newer pandas.
finally:
tf.gfile.Remove(temp_filename)
# Writing to temporary location then copy to the data_dir (0.8 GB).
f = tempfile.NamedTemporaryFile()
np.savez_compressed(f, data=data)
tf.gfile.Copy(f.name, np_filename)
tf.logging.info("Data saved to: {}".format(np_filename))
def main(unused_argv):
if not tf.gfile.Exists(FLAGS.data_dir):
tf.gfile.MkDir(FLAGS.data_dir)
_download_higgs_data_and_save_npz(FLAGS.data_dir)
def define_data_download_flags():
"""Add flags specifying data download arguments."""
flags.DEFINE_string(
name="data_dir", default="/tmp/higgs_data",
help=flags_core.help_wrap(
"Directory to download higgs dataset and store training/eval data."))
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_data_download_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A script that builds boosted trees over higgs data.
If you haven't, please run data_download.py beforehand to prepare the data.
For some more details on this example, please refer to README.md as well.
Note that the model_dir is cleaned up before starting the training.
Usage:
$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
--model_dir=/tmp/higgs_model
Note that BoostedTreesClassifier has been available since TensorFlow 1.8.0,
so you need a recent enough version of TensorFlow to use this example.
The training data is by default the first million examples out of 11M examples,
and eval data is by default the last million examples.
They are controlled by --train_start, --train_count, --eval_start, --eval_count.
e.g. to train over the first 10 million examples instead of 1 million:
$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
--model_dir=/tmp/higgs_model --train_count=10000000
Training history and metrics can be inspected using TensorBoard.
Set --logdir to the --model_dir used during training
(or the default /tmp/higgs_model).
$ tensorboard --logdir=/tmp/higgs_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app as absl_app
from absl import flags
import numpy as np
import tensorflow.compat.v1 as tf
from official.r1.utils.logs import logger
from official.utils.flags import core as flags_core
from official.utils.flags._conventions import help_wrap
NPZ_FILE = "HIGGS.csv.gz.npz" # numpy compressed file containing "data" array
def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
"""Reads higgs data from csv and returns train and eval data.
Args:
data_dir: A string, the directory of higgs dataset.
train_start: An integer, the start index of train examples within the data.
train_count: An integer, the number of train examples within the data.
eval_start: An integer, the start index of eval examples within the data.
eval_count: An integer, the number of eval examples within the data.
Returns:
Numpy array of train data and eval data.
"""
npz_filename = os.path.join(data_dir, NPZ_FILE)
try:
# gfile allows numpy to read data from network data sources as well.
with tf.gfile.Open(npz_filename, "rb") as npz_file:
with np.load(npz_file) as npz:
data = npz["data"]
except tf.errors.NotFoundError as e:
raise RuntimeError(
"Error loading data; use data_download.py to prepare the data.\n{}: {}"
.format(type(e).__name__, e))
return (data[train_start:train_start+train_count],
data[eval_start:eval_start+eval_count])
# This showcases how to make input_fn when the input data is available in the
# form of numpy arrays.
def make_inputs_from_np_arrays(features_np, label_np):
"""Makes and returns input_fn and feature_columns from numpy arrays.
The generated input_fn will return tf.data.Dataset of feature dictionary and a
label, and feature_columns will consist of the list of
tf.feature_column.BucketizedColumn.
Note that for in-memory training, the tf.data.Dataset should contain the whole
data as a single tensor; don't use batch().
Args:
features_np: A numpy ndarray (shape=[batch_size, num_features]) for
float32 features.
label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.
Returns:
input_fn: A function returning a Dataset of feature dict and label.
feature_names: A list of feature names.
feature_column: A list of tf.feature_column.BucketizedColumn.
"""
num_features = features_np.shape[1]
features_np_list = np.split(features_np, num_features, axis=1)
# 1-based feature names.
feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
# Create source feature_columns and bucketized_columns.
def get_bucket_boundaries(feature):
"""Returns bucket boundaries for feature by percentiles."""
return np.unique(np.percentile(feature, range(0, 100))).tolist()
source_columns = [
tf.feature_column.numeric_column(
feature_name, dtype=tf.float32,
# Although the higgs data has no missing values, in general the default
# could be set to 0 or some other reasonable value for missing values.
default_value=0.0)
for feature_name in feature_names
]
bucketized_columns = [
tf.feature_column.bucketized_column(
source_columns[i],
boundaries=get_bucket_boundaries(features_np_list[i]))
for i in range(num_features)
]
# Make an input_fn that extracts source features.
def input_fn():
"""Returns features as a dictionary of numpy arrays, and a label."""
features = {
feature_name: tf.constant(features_np_list[i])
for i, feature_name in enumerate(feature_names)
}
return tf.data.Dataset.zip((tf.data.Dataset.from_tensors(features),
tf.data.Dataset.from_tensors(label_np),))
return input_fn, feature_names, bucketized_columns
def make_eval_inputs_from_np_arrays(features_np, label_np):
"""Makes eval input as streaming batches."""
num_features = features_np.shape[1]
features_np_list = np.split(features_np, num_features, axis=1)
# 1-based feature names.
feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
def input_fn():
features = {
feature_name: tf.constant(features_np_list[i])
for i, feature_name in enumerate(feature_names)
}
return tf.data.Dataset.zip((
tf.data.Dataset.from_tensor_slices(features),
tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
return input_fn
def _make_csv_serving_input_receiver_fn(column_names, column_defaults):
"""Returns serving_input_receiver_fn for csv.
The input arguments correspond to those of `tf.decode_csv()`.
Args:
column_names: a list of column names in the order within input csv.
column_defaults: a list of default values with the same size as column_names.
Each entry must be either a list containing one scalar, or an empty list
to denote that the corresponding column is required.
e.g. [[""], [2.5], []] indicates that the third column is required, while
the first column must be a string and the second must be float/double.
Returns:
a serving_input_receiver_fn that handles csv for serving.
"""
def serving_input_receiver_fn():
csv = tf.placeholder(dtype=tf.string, shape=[None], name="csv")
features = dict(zip(column_names, tf.decode_csv(csv, column_defaults)))
receiver_tensors = {"inputs": csv}
return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
return serving_input_receiver_fn
def train_boosted_trees(flags_obj):
"""Train boosted_trees estimator on HIGGS data.
Args:
flags_obj: An object containing parsed flag values.
"""
# Clean up the model directory if present.
if tf.gfile.Exists(flags_obj.model_dir):
tf.gfile.DeleteRecursively(flags_obj.model_dir)
tf.logging.info("## Data loading...")
train_data, eval_data = read_higgs_data(
flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
flags_obj.eval_start, flags_obj.eval_count)
tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
# Data consists of one label column followed by 28 feature columns.
train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
eval_input_fn = make_eval_inputs_from_np_arrays(
features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
tf.logging.info("## Features prepared. Training starts...")
# Create benchmark logger to log info about the training and metric values
run_params = {
"train_start": flags_obj.train_start,
"train_count": flags_obj.train_count,
"eval_start": flags_obj.eval_start,
"eval_count": flags_obj.eval_count,
"n_trees": flags_obj.n_trees,
"max_depth": flags_obj.max_depth,
}
benchmark_logger = logger.config_benchmark_logger(flags_obj)
benchmark_logger.log_run_info(
model_name="boosted_trees",
dataset_name="higgs",
run_params=run_params,
test_id=flags_obj.benchmark_test_id)
# Though BoostedTreesClassifier is under tf.estimator, the faster in-memory
# training utility is still only provided as a contrib library.
from tensorflow.contrib import estimator as contrib_estimator # pylint: disable=g-import-not-at-top
classifier = contrib_estimator.boosted_trees_classifier_train_in_memory(
train_input_fn,
feature_columns,
model_dir=flags_obj.model_dir or None,
n_trees=flags_obj.n_trees,
max_depth=flags_obj.max_depth,
learning_rate=flags_obj.learning_rate)
# Evaluation.
eval_results = classifier.evaluate(eval_input_fn)
# Benchmark the evaluation results
benchmark_logger.log_evaluation_result(eval_results)
# Exporting the savedmodel with csv parsing.
if flags_obj.export_dir is not None:
classifier.export_savedmodel(
flags_obj.export_dir,
_make_csv_serving_input_receiver_fn(
column_names=feature_names,
# columns are all floats.
column_defaults=[[0.0]] * len(feature_names)),
strip_default_attrs=True)
def main(_):
train_boosted_trees(flags.FLAGS)
def define_train_higgs_flags():
"""Add tree related flags as well as training/eval configuration."""
flags_core.define_base(clean=False, stop_threshold=False, batch_size=False,
num_gpu=False, export_dir=True)
flags_core.define_benchmark()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_integer(
name="train_start", default=0,
help=help_wrap("Start index of train examples within the data."))
flags.DEFINE_integer(
name="train_count", default=1000000,
help=help_wrap("Number of train examples within the data."))
flags.DEFINE_integer(
name="eval_start", default=10000000,
help=help_wrap("Start index of eval examples within the data."))
flags.DEFINE_integer(
name="eval_count", default=1000000,
help=help_wrap("Number of eval examples within the data."))
flags.DEFINE_integer(
"n_trees", default=100, help=help_wrap("Number of trees to build."))
flags.DEFINE_integer(
"max_depth", default=6, help=help_wrap("Maximum depths of each tree."))
flags.DEFINE_float(
"learning_rate", default=0.1,
help=help_wrap("The learning rate."))
flags_core.set_defaults(data_dir="/tmp/higgs_data",
model_dir="/tmp/higgs_model")
if __name__ == "__main__":
# Training progress and eval results are shown as logging.INFO, so enable it.
tf.logging.set_verbosity(tf.logging.INFO)
define_train_higgs_flags()
absl_app.run(main)
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# MNIST in TensorFlow
This directory builds a convolutional neural net to classify the [MNIST
dataset](http://yann.lecun.com/exdb/mnist/) using the
[tf.data](https://www.tensorflow.org/api_docs/python/tf/data),
[tf.estimator.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator),
and
[tf.layers](https://www.tensorflow.org/api_docs/python/tf/layers)
APIs.
## Setup
To begin, you'll simply need the latest version of TensorFlow installed.
First make sure you've [added the models folder to your Python path](/official/#running-the-models):
```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```
Otherwise you may encounter an error like `ImportError: No module named official.mnist`.
Then to train the model, run the following:
```
python mnist.py
```
The model will begin training and will automatically evaluate itself on the
validation data.
Illustrative unit tests and benchmarks can be run with:
```
python mnist_test.py
python mnist_test.py --benchmarks=.
```
## Exporting the model
You can export the model into TensorFlow [SavedModel](https://www.tensorflow.org/guide/saved_model) format by using the argument `--export_dir`:
```
python mnist.py --export_dir /tmp/mnist_saved_model
```
The SavedModel will be saved in a timestamped directory under `/tmp/mnist_saved_model/` (e.g. `/tmp/mnist_saved_model/1513630966/`).
**Getting predictions with SavedModel**
Use [`saved_model_cli`](https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel) to inspect and execute the SavedModel.
```
saved_model_cli run --dir /tmp/mnist_saved_model/TIMESTAMP --tag_set serve --signature_def classify --inputs image=examples.npy
```
`examples.npy` contains the data from `example5.png` and `example3.png` in a numpy array, in that order. The array values are normalized to the range [0, 1]. (See the sketch after the sample output for one way to construct such a file.)
The output should look similar to below:
```
Result for output key classes:
[5 3]
Result for output key probabilities:
[[ 1.53558474e-07 1.95694142e-13 1.31193523e-09 5.47467265e-03
5.85711526e-22 9.94520664e-01 3.48423509e-06 2.65365645e-17
9.78631419e-07 3.15522470e-08]
[ 1.22413359e-04 5.87615965e-08 1.72251271e-06 9.39960718e-01
3.30306928e-11 2.87386645e-02 2.82353517e-02 8.21146413e-18
2.52568233e-03 4.15460236e-04]]
```
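If you need to regenerate `examples.npy` yourself, here is a minimal sketch, assuming 28x28 grayscale PNGs and the Pillow library (the `(2, 28, 28)` input shape is an assumption about the exported signature; flatten to `(2, 784)` instead if your export expects flat vectors):

```python
import numpy as np
from PIL import Image

# Load the two example digits as 28x28 float arrays normalized to [0, 1],
# in the order referenced above: the "5" first, then the "3".
images = [
    np.asarray(Image.open(name).convert("L"), dtype=np.float32) / 255.0
    for name in ("example5.png", "example3.png")
]
np.save("examples.npy", np.stack(images))  # shape (2, 28, 28), assumed
```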
## Experimental: Eager Execution
[Eager execution](https://research.googleblog.com/2017/10/eager-execution-imperative-define-by.html)
(a preview feature in TensorFlow 1.5) is an imperative interface to TensorFlow.
The exact same model defined in `mnist.py` can be trained without creating a
TensorFlow graph using:
```
python mnist_eager.py
```
## Experimental: TPU Acceleration
`mnist.py` (and `mnist_eager.py`) demonstrate training a neural network to
classify digits on CPUs and GPUs. `mnist_tpu.py` can be used to train the
same model using TPUs for hardware acceleration. More information is available in
the [tensorflow/tpu](https://github.com/tensorflow/tpu) repository.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""tf.data.Dataset interface to the MNIST dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import os
import shutil
import tempfile
import numpy as np
from six.moves import urllib
import tensorflow as tf
def read32(bytestream):
"""Read 4 bytes from bytestream as an unsigned 32-bit integer."""
dt = np.dtype(np.uint32).newbyteorder('>')
return np.frombuffer(bytestream.read(4), dtype=dt)[0]
def check_image_file_header(filename):
"""Validate that filename corresponds to images for the MNIST dataset."""
with tf.io.gfile.GFile(filename, 'rb') as f:
magic = read32(f)
read32(f) # num_images, unused
rows = read32(f)
cols = read32(f)
if magic != 2051:
raise ValueError('Invalid magic number %d in MNIST file %s' % (magic,
f.name))
if rows != 28 or cols != 28:
raise ValueError(
'Invalid MNIST file %s: Expected 28x28 images, found %dx%d' %
(f.name, rows, cols))
def check_labels_file_header(filename):
"""Validate that filename corresponds to labels for the MNIST dataset."""
with tf.io.gfile.GFile(filename, 'rb') as f:
magic = read32(f)
read32(f) # num_items, unused
if magic != 2049:
raise ValueError('Invalid magic number %d in MNIST file %s' % (magic,
f.name))
def download(directory, filename):
"""Download (and unzip) a file from the MNIST dataset if not already done."""
filepath = os.path.join(directory, filename)
if tf.io.gfile.exists(filepath):
return filepath
if not tf.io.gfile.exists(directory):
tf.io.gfile.makedirs(directory)
# CVDF mirror of http://yann.lecun.com/exdb/mnist/
url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
_, zipped_filepath = tempfile.mkstemp(suffix='.gz')
print('Downloading %s to %s' % (url, zipped_filepath))
urllib.request.urlretrieve(url, zipped_filepath)
with gzip.open(zipped_filepath, 'rb') as f_in, \
tf.io.gfile.GFile(filepath, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(zipped_filepath)
return filepath
def dataset(directory, images_file, labels_file):
"""Download and parse MNIST dataset."""
images_file = download(directory, images_file)
labels_file = download(directory, labels_file)
check_image_file_header(images_file)
check_labels_file_header(labels_file)
def decode_image(image):
# Normalize from [0, 255] to [0.0, 1.0]
image = tf.io.decode_raw(image, tf.uint8)
image = tf.cast(image, tf.float32)
image = tf.reshape(image, [784])
return image / 255.0
def decode_label(label):
label = tf.io.decode_raw(label, tf.uint8) # tf.string -> [tf.uint8]
label = tf.reshape(label, []) # label is a scalar
return tf.cast(label, tf.int32)
images = tf.data.FixedLengthRecordDataset(
images_file, 28 * 28, header_bytes=16).map(decode_image)
labels = tf.data.FixedLengthRecordDataset(
labels_file, 1, header_bytes=8).map(decode_label)
return tf.data.Dataset.zip((images, labels))
def train(directory):
"""tf.data.Dataset object for MNIST training data."""
return dataset(directory, 'train-images-idx3-ubyte',
'train-labels-idx1-ubyte')
def test(directory):
"""tf.data.Dataset object for MNIST test data."""
return dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')