Commit b3f04bca authored by Mark Daoust, committed by Neal Wu

Initialize examples directory. (#2546)

parent ddebf55c
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A collection of regression examples using `Estimators`."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A dataset loader for imports85.data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
try:
import pandas as pd # pylint: disable=g-import-not-at-top
except ImportError:
pass
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
# Order is important for the CSV readers, so we use an OrderedDict here.
COLUMN_TYPES = collections.OrderedDict([
("symboling", int),
("normalized-losses", float),
("make", str),
("fuel-type", str),
("aspiration", str),
("num-of-doors", str),
("body-style", str),
("drive-wheels", str),
("engine-location", str),
("wheel-base", float),
("length", float),
("width", float),
("height", float),
("curb-weight", float),
("engine-type", str),
("num-of-cylinders", str),
("engine-size", float),
("fuel-system", str),
("bore", float),
("stroke", float),
("compression-ratio", float),
("horsepower", float),
("peak-rpm", float),
("city-mpg", float),
("highway-mpg", float),
("price", float)
])
def raw_dataframe():
"""Load the imports85 data as a pd.DataFrame."""
# Download and cache the data
path = tf.keras.utils.get_file(URL.split("/")[-1], URL)
# Load it into a pandas dataframe
df = pd.read_csv(path, names=COLUMN_TYPES.keys(),
dtype=COLUMN_TYPES, na_values="?")
return df
def load_data(y_name="price", train_fraction=0.7, seed=None):
"""Get the imports85 data set.
A description of the data is available at:
https://archive.ics.uci.edu/ml/datasets/automobile
The data itself can be found at:
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Args:
y_name: the column to return as the label.
train_fraction: the fraction of the dataset to use for training.
seed: The random seed to use when shuffling the data. `None` generates a
unique shuffle every run.
Returns:
a pair of pairs where the first pair is the training data, and the second
is the test data:
    `(x_train, y_train), (x_test, y_test) = load_data(...)`
`x` contains a pandas DataFrame of features, while `y` contains the label
array.
"""
# Load the raw data columns.
data = raw_dataframe()
# Delete rows with unknowns
data = data.dropna()
# Shuffle the data
np.random.seed(seed)
# Split the data into train/test subsets.
x_train = data.sample(frac=train_fraction, random_state=seed)
x_test = data.drop(x_train.index)
# Extract the label from the features dataframe.
y_train = x_train.pop(y_name)
y_test = x_test.pop(y_name)
return (x_train, y_train), (x_test, y_test)
def make_dataset(x, y=None):
"""Create a slice dataset from a pandas DataFrame and labels"""
# TODO(markdaooust): simplify this after the 1.4 cut.
# Convert the DataFrame to a dict
x = dict(x)
# Convert the pd.Series to np.arrays
for key in x:
x[key] = np.array(x[key])
items = [x]
if y is not None:
items.append(np.array(y, dtype=np.float32))
# Create a Dataset of slices
return tf.data.Dataset.from_tensor_slices(tuple(items))
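# A minimal usage sketch: load the data, wrap it in a `Dataset`, and pull one
# batch. Assumes network access to archive.ics.uci.edu; the `__main__` guard
# keeps imports of this module side-effect free.
if __name__ == "__main__":
  (x_train, y_train), _ = load_data(seed=0)
  demo_dataset = make_dataset(x_train, y_train).batch(8)
  features, labels = demo_dataset.make_one_shot_iterator().get_next()
  with tf.Session() as sess:
    batch_features, batch_labels = sess.run((features, labels))
  print("feature keys:", sorted(batch_features.keys()))
  print("label batch shape:", batch_labels.shape)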
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
  """Wrap `ds` in a no-argument `input_fn`, as the `Estimator` API expects."""
  return lambda: ds.make_one_shot_iterator().get_next()
def my_dnn_regression_fn(features, labels, mode, params):
"""A model function implementing DNN regression for a custom Estimator."""
# Extract the input into a dense layer, according to the feature_columns.
top = tf.feature_column.input_layer(features, params["feature_columns"])
# Iterate over the "hidden_units" list of layer sizes, default is [20].
for units in params.get("hidden_units", [20]):
# Add a hidden layer, densely connected on top of the previous layer.
top = tf.layers.dense(inputs=top, units=units, activation=tf.nn.relu)
# Connect a linear output layer on top.
output_layer = tf.layers.dense(inputs=top, units=1)
# Reshape the output layer to a 1-dim Tensor to return predictions
predictions = tf.squeeze(output_layer, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
# In `PREDICT` mode we only need to return predictions.
return tf.estimator.EstimatorSpec(
mode=mode, predictions={"price": predictions})
# Calculate loss using mean squared error
average_loss = tf.losses.mean_squared_error(labels, predictions)
# Pre-made estimators use the total_loss instead of the average,
# so report total_loss for compatibility.
batch_size = tf.shape(labels)[0]
total_loss = tf.to_float(batch_size) * average_loss
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = params.get("optimizer", tf.train.AdamOptimizer)
optimizer = optimizer(params.get("learning_rate", None))
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(
mode=mode, loss=total_loss, train_op=train_op)
# In evaluation mode we will calculate evaluation metrics.
assert mode == tf.estimator.ModeKeys.EVAL
  # Calculate root mean squared error
  rmse = tf.metrics.root_mean_squared_error(labels, predictions)
# Add the rmse to the collection of evaluation metrics.
eval_metrics = {"rmse": rmse}
return tf.estimator.EstimatorSpec(
mode=mode,
# Report sum of error for compatibility with pre-made estimators
loss=total_loss,
eval_metric_ops=eval_metrics)
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
  # The following `feature_columns` demonstrate two ways to handle categorical
  # input. The first assigns a unique weight to each category. To do this you
  # must specify the category's vocabulary (values outside this specification
  # will receive a weight of zero). Here we specify the vocabulary using a
  # list of options. The vocabulary can also be specified with a vocabulary
  # file (using `categorical_column_with_vocabulary_file`). For features
  # covering a range of positive integers use
  # `categorical_column_with_identity`.
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
make = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
# Since this is a DNN model, convert categorical columns from sparse
# to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
tf.feature_column.indicator_column(body_style),
# Or use an `embedding_column` to create a trainable vector for each
# index.
tf.feature_column.embedding_column(make, dimension=3),
]
# Build a custom Estimator, using the model_fn.
# `params` is passed through to the `model_fn`.
model = tf.estimator.Estimator(
model_fn=my_dnn_regression_fn,
params={
"feature_columns": feature_columns,
"learning_rate": 0.001,
"optimizer": tf.train.AdamOptimizer,
"hidden_units": [20, 20]
})
# Train the model.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# Print the Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * eval_result["rmse"]))
print()
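# A prediction sketch: the PREDICT branch of `my_dnn_regression_fn` returns a
# {"price": ...} dict, so a trained `model` can be queried as below. The input
# values here are hypothetical examples, chosen only for illustration.
def predict_example_prices(model, price_norm_factor=1000.):
  """Run a trained custom Estimator on two hand-written examples."""
  input_dict = {
      "curb-weight": [2000, 3000],
      "highway-mpg": [30, 40],
      "body-style": ["hatchback", "sedan"],
      "make": ["honda", "audi"],
  }
  predict = automobile_data.make_dataset(input_dict).batch(1)
  for p in model.predict(input_fn=from_dataset(predict)):
    print("Predicted price: ${:.0f}".format(price_norm_factor * p["price"]))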
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=5000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
# Use the same categorical columns as in `linear_regression_categorical`
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
make_column = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
# Since this is a DNN model, categorical columns must be converted from
# sparse to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
tf.feature_column.indicator_column(body_style_column),
# Or use an `embedding_column` to create a trainable vector for each
# index.
tf.feature_column.embedding_column(make_column, dimension=3),
]
  # Build a DNNRegressor with two 20-unit hidden layers, using the feature
  # columns defined above as input.
model = tf.estimator.DNNRegressor(
hidden_units=[20, 20], feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Linear regression using the LinearRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import numpy as np
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
feature_columns = [
# "curb-weight" and "highway-mpg" are numeric columns.
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
]
# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
# Run the model in prediction mode.
input_dict = {
"curb-weight": np.array([2000, 3000]),
"highway-mpg": np.array([30, 40])
}
predict = automobile_data.make_dataset(input_dict).batch(1)
predict_results = model.predict(input_fn=from_dataset(predict))
# Print the prediction results.
print("\nPrediction results:")
for i, prediction in enumerate(predict_results):
msg = ("Curb weight: {: 4d}lbs, "
"Highway: {: 0d}mpg, "
"Prediction: ${: 9.2f}")
msg = msg.format(input_dict["curb-weight"][i], input_dict["highway-mpg"][i],
args.price_norm_factor * prediction["predictions"][0])
print(" " + msg)
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Linear regression with categorical features."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
# The following code demonstrates two of the ways that `feature_columns` can
# be used to build a model with categorical inputs.
# The first way assigns a unique weight to each category. To do this, you must
# specify the category's vocabulary (values outside this specification will
# receive a weight of zero).
  # Alternatively, you can define the vocabulary in a file (by calling
  # `categorical_column_with_vocabulary_file`) or as a range of positive
  # integers (by calling `categorical_column_with_identity`); both variants
  # are sketched below, after `main`.
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
  # The second way, appropriate for an unspecified vocabulary, is to create a
  # hashed column. It creates a fixed-length list of weights and automatically
  # assigns each input category to a weight. Due to the pseudo-randomness of
  # the process, some weights may be shared between categories, while others
  # will remain unused.
make_column = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
# This model uses the same two numeric features as `linear_regressor.py`
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
  # This model adds two categorical columns that will adjust the price based
  # on "make" and "body-style".
body_style_column,
make_column,
]
# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
print()
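# Sketches of the two alternatives mentioned above. Both inputs are
# hypothetical: the vocabulary file does not ship with this example, and
# "door-count" stands in for any feature already encoded as integers in
# [0, num_buckets). Neither column is used by `main` above.
def alternative_categorical_columns():
  """Build the vocabulary-file and identity variants of categorical columns."""
  body_style_from_file = tf.feature_column.categorical_column_with_vocabulary_file(
      key="body-style",
      vocabulary_file="/tmp/body_style_vocab.txt",  # hypothetical path
      vocabulary_size=5)
  door_count = tf.feature_column.categorical_column_with_identity(
      key="door-count",  # hypothetical integer feature
      num_buckets=6)
  return [body_style_from_file, door_count]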
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple smoke test that runs these examples for 1 training iteraton."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import tensorflow as tf
from six.moves import StringIO
import automobile_data
import dnn_regression
import linear_regression
import linear_regression_categorical
import custom_regression
# pylint: disable=line-too-long
FOUR_LINES = "\n".join([
"1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.50,171.20,65.50,52.40,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500",
"2,164,audi,gas,std,four,sedan,fwd,front,99.80,176.60,66.20,54.30,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950",
"2,164,audi,gas,std,four,sedan,4wd,front,99.40,176.60,66.40,54.30,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450",
"2,?,audi,gas,std,two,sedan,fwd,front,99.80,177.30,66.30,53.10,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250",])
# pylint: enable=line-too-long
mock = tf.test.mock
def four_lines_dataframe():
text = StringIO(FOUR_LINES)
return pd.read_csv(text, names=automobile_data.COLUMN_TYPES.keys(),
dtype=automobile_data.COLUMN_TYPES, na_values="?")
def four_lines_dataset(*args, **kwargs):
del args, kwargs
return tf.data.Dataset.from_tensor_slices(FOUR_LINES.split("\n"))
class RegressionTest(tf.test.TestCase):
"""Test the regression examples in this directory."""
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_linear_regression(self):
linear_regression.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_linear_regression_categorical(self):
linear_regression_categorical.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_dnn_regression(self):
dnn_regression.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_custom_regression(self):
custom_regression.main([None, "--train_steps=1"])
if __name__ == "__main__":
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import pandas as pd
import tensorflow as tf
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=200, type=int,
help='number of training steps')
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
COLUMNS = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']
def load_data(train_fraction=0.8, seed=0, y_name='Species'):
"""Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
train = pd.read_csv(train_path, names=COLUMNS, header=0)
train_x, train_y = train, train.pop(y_name)
test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)
test = pd.read_csv(test_path, names=COLUMNS, header=0)
test_x, test_y = test, test.pop(y_name)
return (train_x, train_y), (test_x, test_y)
def make_dataset(*inputs):
return tf.data.Dataset.from_tensor_slices(inputs)
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def my_model(features, labels, mode, params):
"""DNN with three hidden layers, and dropout of 0.1 probability."""
# Create three fully connected layers each layer having a dropout
# probability of 0.1.
net = tf.feature_column.input_layer(features, params['feature_columns'])
for units in params.get('hidden_units', [10, 20, 10]):
net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
net = tf.layers.dropout(net, rate=0.1,
training=mode == tf.estimator.ModeKeys.TRAIN)
# Compute logits (1 per class).
logits = tf.layers.dense(net, params['n_classes'], activation=None)
# Compute predictions.
predicted_classes = tf.argmax(logits, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {
'class_ids': predicted_classes[:, tf.newaxis],
'probabilities': tf.nn.softmax(logits),
'logits': logits,
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
  # Convert the labels to a one-hot tensor of shape (batch_size, 3), with an
  # on-value of 1 in each row's one-hot vector of length 3.
onehot_labels = tf.one_hot(labels, 3, 1, 0)
# Compute loss.
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
# Compute evaluation metrics.
accuracy = tf.metrics.accuracy(labels=labels,
predictions=predicted_classes,
name='acc_op')
metrics = {'accuracy': accuracy}
tf.summary.scalar('accuracy', accuracy[1])
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
# Create training op.
assert mode == tf.estimator.ModeKeys.TRAIN
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def main(argv):
args = parser.parse_args(argv[1:])
# Fetch the data
(train_x, train_y), (test_x, test_y) = load_data()
train_x = dict(train_x)
test_x = dict(test_x)
# Feature columns describe the input: all columns are numeric.
feature_columns = [tf.feature_column.numeric_column(col_name)
for col_name in COLUMNS[:-1]]
  # Build a 3-layer DNN with 10, 20, 10 units respectively.
classifier = tf.estimator.Estimator(
model_fn=my_model,
params={
'feature_columns': feature_columns,
'hidden_units': [10, 20, 10],
'n_classes': 3,
})
# Train the Model.
train = (
make_dataset(train_x, train_y)
.repeat()
.shuffle(1000)
.batch(args.batch_size))
classifier.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate the model.
test = make_dataset(test_x, test_y).batch(args.batch_size)
eval_result = classifier.evaluate(input_fn=from_dataset(test))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
# Generate predictions from the model
predict_input = make_dataset({
'SepalLength': [6.4, 5.8],
'SepalWidth': [3.2, 3.1],
'PetalLength': [4.5, 5.0],
'PetalWidth': [1.5, 1.7],
}).batch(args.batch_size)
for p in classifier.predict(input_fn=from_dataset(predict_input)):
template = ('Prediction is "{}" ({:.1f}%)')
class_id = p['class_ids'][0]
probability = p['probabilities'][class_id]
print(template.format(SPECIES[class_id], 100 * probability))
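# A side note (an illustrative sketch): the `one_hot` + `softmax_cross_entropy`
# pair in `my_model` can be replaced by the sparse variant of the loss, which
# takes the integer class labels directly and computes the same value.
def sparse_loss(labels, logits):
  """Equivalent to the one-hot softmax cross-entropy used in `my_model`."""
  return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)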
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple smoke test that runs these examples for 1 training iteraton."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import pandas as pd
from six.moves import StringIO
import custom_estimator
import premade_estimator
FOUR_LINES = "\n".join([
"1,52.40, 2823,152,2",
"164, 99.80,176.60,66.20,1",
"176,2824, 136,3.19,0",
"2,177.30,66.30, 53.10,1",])
def four_lines_data():
text = StringIO(FOUR_LINES)
df = pd.read_csv(text, names=premade_estimator.COLUMNS)
xy = (df, df.pop("Species"))
return xy, xy
class RegressionTest(tf.test.TestCase):
"""Test the regression examples in this directory."""
@tf.test.mock.patch.dict(premade_estimator.__dict__,
{"load_data": four_lines_data})
def test_premade_estimator(self):
premade_estimator.main([None, "--train_steps=1"])
@tf.test.mock.patch.dict(custom_estimator.__dict__,
{"load_data": four_lines_data})
def test_custom_estimator(self):
custom_estimator.main([None, "--train_steps=1"])
if __name__ == "__main__":
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import pandas as pd
import tensorflow as tf
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=200, type=int,
help='number of training steps')
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
COLUMNS = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']
def load_data(train_fraction=0.8, seed=0, y_name='Species'):
"""Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
train = pd.read_csv(train_path, names=COLUMNS, header=0)
train_x, train_y = train, train.pop(y_name)
test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)
test = pd.read_csv(test_path, names=COLUMNS, header=0)
test_x, test_y = test, test.pop(y_name)
return (train_x, train_y), (test_x, test_y)
def make_dataset(*inputs):
return tf.data.Dataset.from_tensor_slices(inputs)
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
args = parser.parse_args(argv[1:])
# Fetch the data
(train_x, train_y), (test_x, test_y) = load_data()
train_x = dict(train_x)
test_x = dict(test_x)
# Feature columns describe the input: all columns are numeric.
feature_columns = [tf.feature_column.numeric_column(col_name)
for col_name in COLUMNS[:-1]]
  # Build a 3-layer DNN with 10, 20, 10 units respectively.
classifier = tf.estimator.DNNClassifier(
feature_columns=feature_columns,
hidden_units=[10, 20, 10],
n_classes=3)
# Train the Model.
train = (
make_dataset(train_x, train_y)
.repeat()
.shuffle(1000)
.batch(args.batch_size))
classifier.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate the model.
test = make_dataset(test_x, test_y).batch(args.batch_size)
eval_result = classifier.evaluate(input_fn=from_dataset(test))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
# Generate predictions from the model
predict_input = make_dataset({
'SepalLength': [6.4, 5.8],
'SepalWidth': [3.2, 3.1],
'PetalLength': [4.5, 5.0],
'PetalWidth': [1.5, 1.7],
}).batch(args.batch_size)
for p in classifier.predict(input_fn=from_dataset(predict_input)):
template = ('Prediction is "{}" ({:.1f}%)')
class_id = p['class_ids'][0]
probability = p['probabilities'][class_id]
print(template.format(SPECIES[class_id], 100 * probability))
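# An alternative input pipeline (a sketch, assuming the `tf.estimator.inputs`
# API available in this TensorFlow version): feed the pandas DataFrame
# directly with `pandas_input_fn` instead of hand-building a `Dataset`.
# Drop-in usage would be:
#   classifier.train(input_fn=pandas_train_input_fn(train_x, train_y),
#                    steps=args.train_steps)
def pandas_train_input_fn(train_x, train_y, batch_size=100):
  """Build an input_fn that shuffles and repeats the training DataFrame."""
  return tf.estimator.inputs.pandas_input_fn(
      x=pd.DataFrame(train_x), y=train_y, batch_size=batch_size,
      shuffle=True, num_epochs=None)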
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This is the complete code for the following blogpost:
# https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html
# (https://goo.gl/Ujm2Ep)
import os
import six.moves.urllib.request as request
import tensorflow as tf
# Check that we have the correct TensorFlow version installed
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
# Compare version components numerically; a plain string comparison would
# mis-order releases such as "1.10" vs "1.3".
assert tuple(int(v) for v in tf_version.split(".")[:2]) >= (1, 3), (
    "TensorFlow r1.3 or later is needed")
# Windows users: you only need to change PATH; the rest is platform-independent
PATH = "/tmp/tf_dataset_and_estimator_apis"
# Fetch and store Training and Test dataset files
PATH_DATASET = PATH + os.sep + "dataset"
FILE_TRAIN = PATH_DATASET + os.sep + "iris_training.csv"
FILE_TEST = PATH_DATASET + os.sep + "iris_test.csv"
URL_TRAIN = "http://download.tensorflow.org/data/iris_training.csv"
URL_TEST = "http://download.tensorflow.org/data/iris_test.csv"
def downloadDataset(url, file):
if not os.path.exists(PATH_DATASET):
os.makedirs(PATH_DATASET)
if not os.path.exists(file):
data = request.urlopen(url).read()
    with open(file, "wb") as f:
      f.write(data)
downloadDataset(URL_TRAIN, FILE_TRAIN)
downloadDataset(URL_TEST, FILE_TEST)
tf.logging.set_verbosity(tf.logging.INFO)
# The CSV features in our training & test data
feature_names = [
'SepalLength',
'SepalWidth',
'PetalLength',
'PetalWidth']
# Create an input function reading a file using the Dataset API
# Then provide the results to the Estimator API
def my_input_fn(file_path, perform_shuffle=False, repeat_count=1):
def decode_csv(line):
parsed_line = tf.decode_csv(line, [[0.], [0.], [0.], [0.], [0]])
label = parsed_line[-1:] # Last element is the label
del parsed_line[-1] # Delete last element
    features = parsed_line  # Everything except the label is a feature
d = dict(zip(feature_names, features)), label
return d
dataset = (tf.data.TextLineDataset(file_path) # Read text file
.skip(1) # Skip header row
.map(decode_csv)) # Transform each elem by applying decode_csv fn
if perform_shuffle:
# Randomizes input using a window of 256 elements (read into memory)
dataset = dataset.shuffle(buffer_size=256)
  dataset = dataset.repeat(repeat_count)  # Repeat the dataset this many times
dataset = dataset.batch(32) # Batch size to use
iterator = dataset.make_one_shot_iterator()
batch_features, batch_labels = iterator.get_next()
return batch_features, batch_labels
next_batch = my_input_fn(FILE_TRAIN, True)  # Tensors for one batch of 32 shuffled examples
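# A quick sanity check (an illustrative sketch): materialize one batch from
# the input function and print its shapes; safe to delete.
with tf.Session() as sess:
  features_sample, labels_sample = sess.run(next_batch)
print("feature shapes:", {name: t.shape for name, t in features_sample.items()})
print("label batch shape:", labels_sample.shape)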
# Create the feature_columns, which specifies the input to our model
# All our input features are numeric, so use numeric_column for each one
feature_columns = [tf.feature_column.numeric_column(k) for k in feature_names]
# Create a deep neural network classifier
# Use the DNNClassifier pre-made estimator
classifier = tf.estimator.DNNClassifier(
feature_columns=feature_columns, # The input features to our model
hidden_units=[10, 10], # Two layers, each with 10 neurons
n_classes=3,
model_dir=PATH) # Path to where checkpoints etc are stored
# Train our model using the previously defined my_input_fn
# The input for training is the file of training examples
# Stop training after 8 passes over the training data (epochs)
classifier.train(
input_fn=lambda: my_input_fn(FILE_TRAIN, True, 8))
# Evaluate our model using the examples contained in FILE_TEST
# The return value contains evaluation metrics such as loss and average_loss
evaluate_result = classifier.evaluate(
input_fn=lambda: my_input_fn(FILE_TEST, False, 4))
print("Evaluation results")
for key in evaluate_result:
print(" {}, was: {}".format(key, evaluate_result[key]))
# Predict the type of some Iris flowers.
# Let's predict the examples in FILE_TEST, repeat only once.
predict_results = classifier.predict(
input_fn=lambda: my_input_fn(FILE_TEST, False, 1))
print("Predictions on test file")
for prediction in predict_results:
  # Prints the predicted class, i.e. 0, 1, or 2 if the prediction is
  # Iris Setosa, Versicolor, or Virginica, respectively.
print(prediction["class_ids"][0])
# Let's create a dataset for prediction
# We've taken the first 3 examples from FILE_TEST
prediction_input = [[5.9, 3.0, 4.2, 1.5], # -> 1, Iris Versicolor
[6.9, 3.1, 5.4, 2.1], # -> 2, Iris Virginica
                    [5.1, 3.3, 1.7, 0.5]]  # -> 0, Iris Setosa
def new_input_fn():
def decode(x):
x = tf.split(x, 4) # Need to split into our 4 features
return dict(zip(feature_names, x)) # To build a dict of them
dataset = tf.data.Dataset.from_tensor_slices(prediction_input)
dataset = dataset.map(decode)
iterator = dataset.make_one_shot_iterator()
next_feature_batch = iterator.get_next()
return next_feature_batch, None # In prediction, we have no labels
# Predict all our prediction_input
predict_results = classifier.predict(input_fn=new_input_fn)
# Print results
print("Predictions:")
for idx, prediction in enumerate(predict_results):
  class_id = prediction["class_ids"][0]  # Get the predicted class (index)
  if class_id == 0:
    print(" I think: {}, is Iris Setosa".format(prediction_input[idx]))
  elif class_id == 1:
    print(" I think: {}, is Iris Versicolor".format(prediction_input[idx]))
  else:
    print(" I think: {}, is Iris Virginica".format(prediction_input[idx]))