Commit b3f04bca authored by Mark Daoust, committed by Neal Wu

Initialize examples directory. (#2546)

parent ddebf55c
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A collection of regression examples using `Estimators`."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A dataset loader for imports85.data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
try:
import pandas as pd # pylint: disable=g-import-not-at-top
except ImportError:
pass
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
# Order is important for the csv-readers, so we use an OrderedDict here.
COLUMN_TYPES = collections.OrderedDict([
("symboling", int),
("normalized-losses", float),
("make", str),
("fuel-type", str),
("aspiration", str),
("num-of-doors", str),
("body-style", str),
("drive-wheels", str),
("engine-location", str),
("wheel-base", float),
("length", float),
("width", float),
("height", float),
("curb-weight", float),
("engine-type", str),
("num-of-cylinders", str),
("engine-size", float),
("fuel-system", str),
("bore", float),
("stroke", float),
("compression-ratio", float),
("horsepower", float),
("peak-rpm", float),
("city-mpg", float),
("highway-mpg", float),
("price", float)
])
def raw_dataframe():
"""Load the imports85 data as a pd.DataFrame."""
# Download and cache the data
path = tf.keras.utils.get_file(URL.split("/")[-1], URL)
# Load it into a pandas dataframe
df = pd.read_csv(path, names=COLUMN_TYPES.keys(),
dtype=COLUMN_TYPES, na_values="?")
return df
def load_data(y_name="price", train_fraction=0.7, seed=None):
"""Get the imports85 data set.
A description of the data is available at:
https://archive.ics.uci.edu/ml/datasets/automobile
The data itself can be found at:
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Args:
y_name: the column to return as the label.
train_fraction: the fraction of the dataset to use for training.
seed: The random seed to use when shuffling the data. `None` generates a
unique shuffle every run.
Returns:
a pair of pairs where the first pair is the training data, and the second
is the test data:
    `(x_train, y_train), (x_test, y_test) = load_data(...)`
    `x` contains a pandas DataFrame of features, while `y` contains the labels
    as a pandas Series.
"""
# Load the raw data columns.
data = raw_dataframe()
# Delete rows with unknowns
data = data.dropna()
# Shuffle the data
np.random.seed(seed)
# Split the data into train/test subsets.
x_train = data.sample(frac=train_fraction, random_state=seed)
x_test = data.drop(x_train.index)
# Extract the label from the features dataframe.
y_train = x_train.pop(y_name)
y_test = x_test.pop(y_name)
return (x_train, y_train), (x_test, y_test)
def make_dataset(x, y=None):
"""Create a slice dataset from a pandas DataFrame and labels"""
# TODO(markdaooust): simplify this after the 1.4 cut.
# Convert the DataFrame to a dict
x = dict(x)
# Convert the pd.Series to np.arrays
for key in x:
x[key] = np.array(x[key])
items = [x]
if y is not None:
items.append(np.array(y, dtype=np.float32))
# Create a Dataset of slices
return tf.data.Dataset.from_tensor_slices(tuple(items))
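# A minimal usage sketch (illustrative only; not used by the example scripts).
# It assumes network access to download the imports-85 data and shows how
# `load_data` and `make_dataset` fit together.
def example_usage(batch_size=32):
  """Build a shuffled, batched train Dataset and a batched test Dataset."""
  (x_train, y_train), (x_test, y_test) = load_data(seed=0)
  train_ds = make_dataset(x_train, y_train).shuffle(1000).batch(batch_size)
  test_ds = make_dataset(x_test, y_test).batch(batch_size)
  return train_ds, test_ds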
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def my_dnn_regression_fn(features, labels, mode, params):
"""A model function implementing DNN regression for a custom Estimator."""
# Extract the input into a dense layer, according to the feature_columns.
top = tf.feature_column.input_layer(features, params["feature_columns"])
# Iterate over the "hidden_units" list of layer sizes, default is [20].
for units in params.get("hidden_units", [20]):
# Add a hidden layer, densely connected on top of the previous layer.
top = tf.layers.dense(inputs=top, units=units, activation=tf.nn.relu)
# Connect a linear output layer on top.
output_layer = tf.layers.dense(inputs=top, units=1)
# Reshape the output layer to a 1-dim Tensor to return predictions
predictions = tf.squeeze(output_layer, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
# In `PREDICT` mode we only need to return predictions.
return tf.estimator.EstimatorSpec(
mode=mode, predictions={"price": predictions})
# Calculate loss using mean squared error
average_loss = tf.losses.mean_squared_error(labels, predictions)
# Pre-made estimators use the total_loss instead of the average,
# so report total_loss for compatibility.
batch_size = tf.shape(labels)[0]
total_loss = tf.to_float(batch_size) * average_loss
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = params.get("optimizer", tf.train.AdamOptimizer)
optimizer = optimizer(params.get("learning_rate", None))
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(
mode=mode, loss=total_loss, train_op=train_op)
# In evaluation mode we will calculate evaluation metrics.
assert mode == tf.estimator.ModeKeys.EVAL
# Calculate root mean squared error
print(labels)
print(predictions)
rmse = tf.metrics.root_mean_squared_error(labels, predictions)
# Add the rmse to the collection of evaluation metrics.
eval_metrics = {"rmse": rmse}
return tf.estimator.EstimatorSpec(
mode=mode,
# Report sum of error for compatibility with pre-made estimators
loss=total_loss,
eval_metric_ops=eval_metrics)
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
  # Assign a unique weight to each category. To do this, you must specify the
  # category's vocabulary (values outside this vocabulary will receive a
  # weight of zero). Here we specify the vocabulary using a list of options.
  # The vocabulary can also be specified with a vocabulary file (using
  # `categorical_column_with_vocabulary_file`). For features covering a
  # range of positive integers use `categorical_column_with_identity`.
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
make = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
# Since this is a DNN model, convert categorical columns from sparse
# to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
tf.feature_column.indicator_column(body_style),
# Or use an `embedding_column` to create a trainable vector for each
# index.
tf.feature_column.embedding_column(make, dimension=3),
]
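  # A common rule of thumb (a heuristic, not a requirement) is to set the
  # embedding `dimension` to roughly the fourth root of the number of
  # categories; 50**0.25 is about 2.7, rounded up to 3 above.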
# Build a custom Estimator, using the model_fn.
# `params` is passed through to the `model_fn`.
model = tf.estimator.Estimator(
model_fn=my_dnn_regression_fn,
params={
"feature_columns": feature_columns,
"learning_rate": 0.001,
"optimizer": tf.train.AdamOptimizer,
"hidden_units": [20, 20]
})
# Train the model.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# Print the Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * eval_result["rmse"]))
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
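# Example invocation (a sketch; assumes this script is saved as
# custom_regression.py, the name the smoke test below imports it by):
#   python custom_regression.py --train_steps=500 --batch_size=64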
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=5000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
# Use the same categorical columns as in `linear_regression_categorical`
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
make_column = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
# Since this is a DNN model, categorical columns must be converted from
# sparse to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
tf.feature_column.indicator_column(body_style_column),
# Or use an `embedding_column` to create a trainable vector for each
# index.
tf.feature_column.embedding_column(make_column, dimension=3),
]
  # Build a DNNRegressor with two 20-unit hidden layers, using the feature
  # columns defined above as input.
model = tf.estimator.DNNRegressor(
hidden_units=[20, 20], feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Linear regression using the LinearRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import numpy as np
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
feature_columns = [
# "curb-weight" and "highway-mpg" are numeric columns.
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
]
# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
# Run the model in prediction mode.
input_dict = {
"curb-weight": np.array([2000, 3000]),
"highway-mpg": np.array([30, 40])
}
predict = automobile_data.make_dataset(input_dict).batch(1)
predict_results = model.predict(input_fn=from_dataset(predict))
# Print the prediction results.
print("\nPrediction results:")
for i, prediction in enumerate(predict_results):
msg = ("Curb weight: {: 4d}lbs, "
"Highway: {: 0d}mpg, "
"Prediction: ${: 9.2f}")
msg = msg.format(input_dict["curb-weight"][i], input_dict["highway-mpg"][i],
args.price_norm_factor * prediction["predictions"][0])
print(" " + msg)
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Linear regression with categorical features."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import automobile_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=1000, type=int,
help='number of training steps')
parser.add_argument('--price_norm_factor', default=1000., type=float,
help='price normalization factor')
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
"""Builds, trains, and evaluates the model."""
args = parser.parse_args(argv[1:])
  (train_x, train_y), (test_x, test_y) = automobile_data.load_data()
train_y /= args.price_norm_factor
test_y /= args.price_norm_factor
# Build the training dataset.
train = (
automobile_data.make_dataset(train_x, train_y)
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
.shuffle(1000).batch(args.batch_size)
# Repeat forever
.repeat())
# Build the validation dataset.
test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
# The following code demonstrates two of the ways that `feature_columns` can
# be used to build a model with categorical inputs.
# The first way assigns a unique weight to each category. To do this, you must
# specify the category's vocabulary (values outside this specification will
# receive a weight of zero).
# Alternatively, you can define the vocabulary in a file (by calling
# `categorical_column_with_vocabulary_file`) or as a range of positive
  # integers (by calling `categorical_column_with_identity`).
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
key="body-style", vocabulary_list=body_style_vocab)
# The second way, appropriate for an unspecified vocabulary, is to create a
# hashed column. It will create a fixed length list of weights, and
# automatically assign each input category to a weight. Due to the
# pseudo-randomness of the process, some weights may be shared between
# categories, while others will remain unused.
make_column = tf.feature_column.categorical_column_with_hash_bucket(
key="make", hash_bucket_size=50)
feature_columns = [
      # This model uses the same two numeric features as `linear_regression.py`
tf.feature_column.numeric_column(key="curb-weight"),
tf.feature_column.numeric_column(key="highway-mpg"),
      # This model adds two categorical columns that will adjust the price based
# on "make" and "body-style".
body_style_column,
make_column,
]
# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=from_dataset(test))
# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]
# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
.format(args.price_norm_factor * average_loss**0.5))
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple smoke test that runs these examples for 1 training iteraton."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import tensorflow as tf
from six.moves import StringIO
import automobile_data
import dnn_regression
import linear_regression
import linear_regression_categorical
import custom_regression
# pylint: disable=line-too-long
FOUR_LINES = "\n".join([
"1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.50,171.20,65.50,52.40,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500",
"2,164,audi,gas,std,four,sedan,fwd,front,99.80,176.60,66.20,54.30,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950",
"2,164,audi,gas,std,four,sedan,4wd,front,99.40,176.60,66.40,54.30,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450",
"2,?,audi,gas,std,two,sedan,fwd,front,99.80,177.30,66.30,53.10,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250",])
# pylint: enable=line-too-long
mock = tf.test.mock
def four_lines_dataframe():
text = StringIO(FOUR_LINES)
return pd.read_csv(text, names=automobile_data.COLUMN_TYPES.keys(),
dtype=automobile_data.COLUMN_TYPES, na_values="?")
def four_lines_dataset(*args, **kwargs):
del args, kwargs
return tf.data.Dataset.from_tensor_slices(FOUR_LINES.split("\n"))
class RegressionTest(tf.test.TestCase):
"""Test the regression examples in this directory."""
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_linear_regression(self):
linear_regression.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_linear_regression_categorical(self):
linear_regression_categorical.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_dnn_regression(self):
dnn_regression.main([None, "--train_steps=1"])
@mock.patch.dict(automobile_data.__dict__, {"raw_dataframe": four_lines_dataframe})
def test_custom_regression(self):
custom_regression.main([None, "--train_steps=1"])
if __name__ == "__main__":
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import pandas as pd
import tensorflow as tf
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=200, type=int,
help='number of training steps')
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
COLUMNS = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']
def load_data(train_fraction=0.8, seed=0, y_name='Species'):
"""Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
train = pd.read_csv(train_path, names=COLUMNS, header=0)
train_x, train_y = train, train.pop(y_name)
test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)
test = pd.read_csv(test_path, names=COLUMNS, header=0)
test_x, test_y = test, test.pop(y_name)
return (train_x, train_y), (test_x, test_y)
def make_dataset(*inputs):
return tf.data.Dataset.from_tensor_slices(inputs)
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def my_model(features, labels, mode, params):
"""DNN with three hidden layers, and dropout of 0.1 probability."""
# Create three fully connected layers each layer having a dropout
# probability of 0.1.
net = tf.feature_column.input_layer(features, params['feature_columns'])
for units in params.get('hidden_units', [10, 20, 10]):
net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
net = tf.layers.dropout(net, rate=0.1,
training=mode == tf.estimator.ModeKeys.TRAIN)
# Compute logits (1 per class).
logits = tf.layers.dense(net, params['n_classes'], activation=None)
# Compute predictions.
predicted_classes = tf.argmax(logits, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {
'class_ids': predicted_classes[:, tf.newaxis],
'probabilities': tf.nn.softmax(logits),
'logits': logits,
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    # Convert the labels to a one-hot tensor of shape (batch_size, 3), with an
    # on-value of 1 for each one-hot vector of length 3.
onehot_labels = tf.one_hot(labels, 3, 1, 0)
# Compute loss.
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
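    # Note: an equivalent loss could be computed without the explicit one-hot
    # conversion, e.g. with tf.losses.sparse_softmax_cross_entropy(
    # labels=labels, logits=logits); the one-hot form is kept to match the
    # comment above.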
# Compute evaluation metrics.
accuracy = tf.metrics.accuracy(labels=labels,
predictions=predicted_classes,
name='acc_op')
metrics = {'accuracy': accuracy}
tf.summary.scalar('accuracy', accuracy[1])
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
# Create training op.
assert mode == tf.estimator.ModeKeys.TRAIN
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def main(argv):
args = parser.parse_args(argv[1:])
# Fetch the data
(train_x, train_y), (test_x, test_y) = load_data()
train_x = dict(train_x)
test_x = dict(test_x)
# Feature columns describe the input: all columns are numeric.
feature_columns = [tf.feature_column.numeric_column(col_name)
for col_name in COLUMNS[:-1]]
    # Build a 3-layer DNN with 10, 20, and 10 units, respectively.
classifier = tf.estimator.Estimator(
model_fn=my_model,
params={
'feature_columns': feature_columns,
'hidden_units': [10, 20, 10],
'n_classes': 3,
})
# Train the Model.
train = (
make_dataset(train_x, train_y)
.repeat()
.shuffle(1000)
.batch(args.batch_size))
classifier.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate the model.
test = make_dataset(test_x, test_y).batch(args.batch_size)
eval_result = classifier.evaluate(input_fn=from_dataset(test))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
# Generate predictions from the model
predict_input = make_dataset({
'SepalLength': [6.4, 5.8],
'SepalWidth': [3.2, 3.1],
'PetalLength': [4.5, 5.0],
'PetalWidth': [1.5, 1.7],
}).batch(args.batch_size)
for p in classifier.predict(input_fn=from_dataset(predict_input)):
template = ('Prediction is "{}" ({:.1f}%)')
class_id = p['class_ids'][0]
probability = p['probabilities'][class_id]
print(template.format(SPECIES[class_id], 100 * probability))
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A simple smoke test that runs these examples for 1 training iteraton."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import pandas as pd
from six.moves import StringIO
import custom_estimator
import premade_estimator
FOUR_LINES = "\n".join([
"1,52.40, 2823,152,2",
"164, 99.80,176.60,66.20,1",
"176,2824, 136,3.19,0",
"2,177.30,66.30, 53.10,1",])
def four_lines_data():
text = StringIO(FOUR_LINES)
df = pd.read_csv(text, names=premade_estimator.COLUMNS)
xy = (df, df.pop("Species"))
return xy, xy
class RegressionTest(tf.test.TestCase):
"""Test the regression examples in this directory."""
@tf.test.mock.patch.dict(premade_estimator.__dict__,
{"load_data": four_lines_data})
def test_premade_estimator(self):
premade_estimator.main([None, "--train_steps=1"])
@tf.test.mock.patch.dict(custom_estimator.__dict__,
{"load_data": four_lines_data})
def test_custom_estimator(self):
custom_estimator.main([None, "--train_steps=1"])
if __name__ == "__main__":
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import pandas as pd
import tensorflow as tf
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=200, type=int,
help='number of training steps')
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
COLUMNS = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']
def load_data(train_fraction=0.8, seed=0, y_name='Species'):
"""Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
train = pd.read_csv(train_path, names=COLUMNS, header=0)
train_x, train_y = train, train.pop(y_name)
test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)
test = pd.read_csv(test_path, names=COLUMNS, header=0)
test_x, test_y = test, test.pop(y_name)
return (train_x, train_y), (test_x, test_y)
def make_dataset(*inputs):
return tf.data.Dataset.from_tensor_slices(inputs)
def from_dataset(ds):
return lambda: ds.make_one_shot_iterator().get_next()
def main(argv):
args = parser.parse_args(argv[1:])
# Fetch the data
(train_x, train_y), (test_x, test_y) = load_data()
train_x = dict(train_x)
test_x = dict(test_x)
# Feature columns describe the input: all columns are numeric.
feature_columns = [tf.feature_column.numeric_column(col_name)
for col_name in COLUMNS[:-1]]
    # Build a 3-layer DNN with 10, 20, and 10 units, respectively.
classifier = tf.estimator.DNNClassifier(
feature_columns=feature_columns,
hidden_units=[10, 20, 10],
n_classes=3)
# Train the Model.
train = (
make_dataset(train_x, train_y)
.repeat()
.shuffle(1000)
.batch(args.batch_size))
classifier.train(input_fn=from_dataset(train), steps=args.train_steps)
# Evaluate the model.
test = make_dataset(test_x, test_y).batch(args.batch_size)
eval_result = classifier.evaluate(input_fn=from_dataset(test))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
# Generate predictions from the model
predict_input = make_dataset({
'SepalLength': [6.4, 5.8],
'SepalWidth': [3.2, 3.1],
'PetalLength': [4.5, 5.0],
'PetalWidth': [1.5, 1.7],
}).batch(args.batch_size)
for p in classifier.predict(input_fn=from_dataset(predict_input)):
template = ('Prediction is "{}" ({:.1f}%)')
class_id = p['class_ids'][0]
probability = p['probabilities'][class_id]
print(template.format(SPECIES[class_id], 100 * probability))
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This is the complete code for the following blogpost:
# https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html
# (https://goo.gl/Ujm2Ep)
import os
import six.moves.urllib.request as request
import tensorflow as tf
# Check that we have the correct TensorFlow version installed
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
assert "1.3" <= tf_version, "TensorFlow r1.3 or later is needed"
# Windows users: You only need to change PATH; the rest is platform independent
PATH = "/tmp/tf_dataset_and_estimator_apis"
# Fetch and store Training and Test dataset files
PATH_DATASET = PATH + os.sep + "dataset"
FILE_TRAIN = PATH_DATASET + os.sep + "iris_training.csv"
FILE_TEST = PATH_DATASET + os.sep + "iris_test.csv"
URL_TRAIN = "http://download.tensorflow.org/data/iris_training.csv"
URL_TEST = "http://download.tensorflow.org/data/iris_test.csv"
def downloadDataset(url, file):
if not os.path.exists(PATH_DATASET):
os.makedirs(PATH_DATASET)
if not os.path.exists(file):
data = request.urlopen(url).read()
        with open(file, "wb") as f:
            f.write(data)
downloadDataset(URL_TRAIN, FILE_TRAIN)
downloadDataset(URL_TEST, FILE_TEST)
tf.logging.set_verbosity(tf.logging.INFO)
# The CSV features in our training & test data
feature_names = [
'SepalLength',
'SepalWidth',
'PetalLength',
'PetalWidth']
# Create an input function reading a file using the Dataset API
# Then provide the results to the Estimator API
def my_input_fn(file_path, perform_shuffle=False, repeat_count=1):
def decode_csv(line):
parsed_line = tf.decode_csv(line, [[0.], [0.], [0.], [0.], [0]])
label = parsed_line[-1:] # Last element is the label
del parsed_line[-1] # Delete last element
        features = parsed_line  # All remaining elements are the features
d = dict(zip(feature_names, features)), label
return d
dataset = (tf.data.TextLineDataset(file_path) # Read text file
.skip(1) # Skip header row
.map(decode_csv)) # Transform each elem by applying decode_csv fn
if perform_shuffle:
# Randomizes input using a window of 256 elements (read into memory)
dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(repeat_count)  # Repeat the dataset this many times
dataset = dataset.batch(32) # Batch size to use
iterator = dataset.make_one_shot_iterator()
batch_features, batch_labels = iterator.get_next()
return batch_features, batch_labels
next_batch = my_input_fn(FILE_TRAIN, True) # Will return 32 random elements
# Create the feature_columns, which specify the input to our model
# All our input features are numeric, so use numeric_column for each one
feature_columns = [tf.feature_column.numeric_column(k) for k in feature_names]
# Create a deep neural network classifier
# Use the DNNClassifier pre-made estimator
classifier = tf.estimator.DNNClassifier(
feature_columns=feature_columns, # The input features to our model
hidden_units=[10, 10], # Two layers, each with 10 neurons
n_classes=3,
model_dir=PATH) # Path to where checkpoints etc are stored
# Train our model, using the previously defined function my_input_fn
# The input for training is a file of training examples
# Stop training after 8 iterations over the training data (epochs)
classifier.train(
input_fn=lambda: my_input_fn(FILE_TRAIN, True, 8))
# Evaluate our model using the examples contained in FILE_TEST
# Return value will contain evaluation_metrics such as: loss & average_loss
evaluate_result = classifier.evaluate(
input_fn=lambda: my_input_fn(FILE_TEST, False, 4))
print("Evaluation results")
for key in evaluate_result:
print(" {}, was: {}".format(key, evaluate_result[key]))
# Predict the type of some Iris flowers.
# Let's predict the examples in FILE_TEST, repeat only once.
predict_results = classifier.predict(
input_fn=lambda: my_input_fn(FILE_TEST, False, 1))
print("Predictions on test file")
for prediction in predict_results:
    # Will print the predicted class, i.e. 0, 1, or 2 if the prediction
    # is Iris Setosa, Versicolor, or Virginica, respectively.
print(prediction["class_ids"][0])
# Let's create a dataset for prediction
# We've taken the first 3 examples from FILE_TEST
prediction_input = [[5.9, 3.0, 4.2, 1.5], # -> 1, Iris Versicolor
[6.9, 3.1, 5.4, 2.1], # -> 2, Iris Virginica
                    [5.1, 3.3, 1.7, 0.5]]  # -> 0, Iris Setosa
def new_input_fn():
def decode(x):
x = tf.split(x, 4) # Need to split into our 4 features
return dict(zip(feature_names, x)) # To build a dict of them
dataset = tf.data.Dataset.from_tensor_slices(prediction_input)
dataset = dataset.map(decode)
iterator = dataset.make_one_shot_iterator()
next_feature_batch = iterator.get_next()
return next_feature_batch, None # In prediction, we have no labels
# Predict all our prediction_input
predict_results = classifier.predict(input_fn=new_input_fn)
# Print results
print("Predictions:")
for idx, prediction in enumerate(predict_results):
type = prediction["class_ids"][0] # Get the predicted class (index)
if type == 0:
print(" I think: {}, is Iris Sentosa".format(prediction_input[idx]))
elif type == 1:
print(" I think: {}, is Iris Versicolor".format(prediction_input[idx]))
else:
print(" I think: {}, is Iris Virginica".format(prediction_input[idx]))
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This example demonstrates the use `tf.feature_column.crossed_column` on some simulated Atlanta housing price data. \n",
"This spatial data is used primarily so the results can be easily visualized. \n",
"\n",
"These functions are designed primarily for categorical data, not to build interpolation tables. \n",
"\n",
"If you actually want to build smart interpolation tables in TensorFlow you may want to consider [TensorFlow Lattice](https://research.googleblog.com/2017/10/tensorflow-lattice-flexibility.html)."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "yEHBFimYk-Mu"
},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "DiAklWTFk-My"
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import tempfile\n",
"\n",
"import tensorflow as tf\n",
"import numpy as np\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "fHqxuCUu8Bvm"
},
"outputs": [],
"source": [
"assert tf.VERSION.split('.') >= ['1','4']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"mpl.rcParams['figure.figsize'] = 12, 6\n",
"mpl.rcParams['image.cmap'] = 'viridis'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "Oj4Jv4Pik-M1"
},
"outputs": [],
"source": [
"logdir = tempfile.mkdtemp()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{}
]
},
"colab_type": "code",
"id": "cTrSkk1zmvO0",
"outputId": "41532b3b-2bf8-4abb-bc46-a92c76fe70f8"
},
"outputs": [],
"source": [
"logdir"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "p3me9zPGk-M3"
},
"source": [
"# Start TensorBoard\n",
"The following command will kill all running TensorBoard processes, and start a new one monitoring to the above logdir. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 34,
"output_extras": [
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 327,
"status": "ok",
"timestamp": 1508962289209,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "umxoWdz9k-M3",
"outputId": "e2d2af2f-e56f-4b5e-aa62-bd9119966b53"
},
"outputs": [],
"source": [
"subprocess.Popen(['pkill','-f','tensorboard'])\n",
"subprocess.Popen(['tensorboard', '--logdir', logdir])"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WFYz5eg1k-M7"
},
"source": [
"# Build Synthetic Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "F3ouYc9N9zW3"
},
"outputs": [],
"source": [
"# Define the grid\n",
"min_latitude = 33.641336\n",
"max_latitude = 33.887157\n",
"delta_latitude = max_latitude-min_latitude\n",
"\n",
"min_longitude = -84.558798\n",
"max_longitude = -84.287259\n",
"delta_longitude = max_longitude-min_longitude\n",
"\n",
"resolution = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Use RandomState so the behavior is repeatable. \n",
"R = np.random.RandomState(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "JQ7wXpURk-M8"
},
"outputs": [],
"source": [
"# The price data will be a sum of Gaussians, at random locations.\n",
"n_centers = 20\n",
"centers = R.rand(n_centers, 2) # shape: (centers, dimensions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "nR1wQiqSk-NA"
},
"outputs": [],
"source": [
"# Each Gaussian has a maximum price contribution, at the center.\n",
"# Price_\n",
"price_delta = 0.5+2*R.rand(n_centers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "rGX3j-DOk-NC"
},
"outputs": [],
"source": [
"# Each Gaussian also has a standard-deviation and variance.\n",
"std = 0.2*R.rand(n_centers) # shape: (centers)\n",
"var = std**2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "kWu2ba4Tk-NE"
},
"outputs": [],
"source": [
"def price(latitude, longitude):\n",
" # Convert latitude, longitude to x,y in [0,1]\n",
" x = (longitude - min_longitude)/delta_longitude\n",
" y = (latitude - min_latitude)/delta_latitude\n",
" \n",
" # Cache the shape, and flatten the inputs.\n",
" shape = x.shape\n",
" assert y.shape == x.shape\n",
" x = x.flatten()\n",
" y = y.flatten()\n",
" \n",
" # Convert x, y examples into an array with shape (examples, dimensions)\n",
" xy = np.array([x,y]).T\n",
"\n",
" # Calculate the square distance from each example to each center. \n",
" components2 = (xy[:,None,:] - centers[None,:,:])**2 # shape: (examples, centers, dimensions)\n",
" r2 = components2.sum(axis=2) # shape: (examples, centers)\n",
" \n",
" # Calculate the z**2 for each example from each center.\n",
" z2 = r2/var[None,:]\n",
" price = (np.exp(-z2)*price_delta).sum(1) # shape: (examples,)\n",
" \n",
" # Restore the original shape.\n",
" return price.reshape(shape)"
]
},
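{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of what the `price` function above computes (using the code's variable names, in the normalized $[0,1]$ coordinates):\n",
"\n",
"$$\\mathrm{price}(x, y) = \\sum_{i=1}^{n} \\Delta p_i \\, e^{-r_i^2 / \\sigma_i^2}, \\qquad r_i^2 = (x - c_{i,1})^2 + (y - c_{i,2})^2$$\n",
"\n",
"where $n$ is `n_centers`, $\\Delta p_i$ is `price_delta[i]`, $\\sigma_i^2$ is `var[i]`, and $c_i$ is `centers[i]`."
]
},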
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "BPoSndW8k-NG"
},
"outputs": [],
"source": [
"# Build the grid. We want `resolution` cells between `min` and `max` on each dimension\n",
"# so we need `resolution+1` evenly spaced edges. The centers are at the average of the\n",
"# upper and lower edge. \n",
"\n",
"latitude_edges = np.linspace(min_latitude, max_latitude, resolution+1)\n",
"latitude_centers = (latitude_edges[:-1] + latitude_edges[1:])/2\n",
"\n",
"longitude_edges = np.linspace(min_longitude, max_longitude, resolution+1)\n",
"longitude_centers = (longitude_edges[:-1] + longitude_edges[1:])/2\n",
"\n",
"latitude_grid, longitude_grid = np.meshgrid(\n",
" latitude_centers,\n",
" longitude_centers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{}
]
},
"colab_type": "code",
"id": "0Y5fSCpWk-NI",
"outputId": "35737491-93bd-4911-cb6e-8163849983a3"
},
"outputs": [],
"source": [
"# Evaluate the price at each center-point\n",
"actual_price_grid = price(latitude_grid, longitude_grid)\n",
"\n",
"price_min = actual_price_grid.min()\n",
"price_max = actual_price_grid.max()\n",
"price_mean = actual_price_grid.mean()\n",
"price_mean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "wN2wMUOck-NK"
},
"outputs": [],
"source": [
"def show_price(price):\n",
" plt.imshow(\n",
" price, \n",
" # The color axis goes from `price_min` to `price_max`.\n",
" vmin=price_min, vmax=price_max,\n",
" # Put the image at the correct latitude and longitude.\n",
" extent=(min_longitude, max_longitude, min_latitude, max_latitude), \n",
" # Make the image square.\n",
" aspect = 1.0*delta_longitude/delta_latitude)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 592,
"output_extras": [
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 678,
"status": "ok",
"timestamp": 1508962293265,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "FkHXxJGuAsC1",
"outputId": "40bf2c0f-51b0-4026-a275-c3669e5366cc"
},
"outputs": [],
"source": [
"show_price(actual_price_grid)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "miDtqLRek-NM"
},
"source": [
"# Build Datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "zqBPKjBvk-NM"
},
"outputs": [],
"source": [
"# For test data we will use the grid centers.\n",
"test_features = {'latitude':latitude_grid.flatten(), 'longitude':longitude_grid.flatten()}\n",
"test_ds = tf.data.Dataset.from_tensor_slices((test_features, \n",
" actual_price_grid.flatten()))\n",
"test_ds = test_ds.cache().batch(512).prefetch(1)\n",
"\n",
"# For training data we will use a set of random points.\n",
"train_latitude = min_latitude + np.random.rand(50000)*delta_latitude\n",
"train_longitude = min_longitude + np.random.rand(50000)*delta_longitude\n",
"train_price = price(train_latitude, train_longitude)\n",
"\n",
"train_features = {'latitude':train_latitude, 'longitude':train_longitude}\n",
"train_ds = tf.data.Dataset.from_tensor_slices((train_features, train_price))\n",
"train_ds = train_ds.cache().repeat().shuffle(100000).batch(512).prefetch(1)\n",
"\n",
"# A shortcut to build an `input_fn` from a `Dataset`\n",
"def in_fn(ds):\n",
" return lambda : ds.make_one_shot_iterator().get_next()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "i-ToQzSqk-NO"
},
"source": [
"# Generate a plot from an Estimator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "N9NmQLUOk-NP"
},
"outputs": [],
"source": [
"def plot_est(est, ds = test_ds):\n",
" # Create two plot axes\n",
" actual, predicted = plt.subplot(1,2,1), plt.subplot(1,2,2)\n",
"\n",
" # Plot the actual price.\n",
" plt.sca(actual)\n",
" show_price(actual_price_grid.reshape(resolution, resolution))\n",
" \n",
" # Generate predictions over the grid from the estimator.\n",
" pred = est.predict(in_fn(ds))\n",
" # Convert them to a numpy array.\n",
" pred = np.fromiter((item['predictions'] for item in pred), np.float32)\n",
" # Plot the predictions on the secodn axis.\n",
" plt.sca(predicted)\n",
" show_price(pred.reshape(resolution, resolution))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "gfgngu0lk-NQ"
},
"source": [
"# Using `numeric_column` with DNNRegressor\n",
"Important: Pure categorical data doesn't the spatial relationships that make this example possible. Embeddings are a way your model can learn spatial relationships."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 2010,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8234,
"status": "ok",
"timestamp": 1508962479411,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "On-_j4Jtk-NR",
"outputId": "1c18ba56-7246-4096-c32d-822c2fe3dd36"
},
"outputs": [],
"source": [
"# Use `normalizer_fn` so that the model only sees values in [0, 1]\n",
"norm_latitude = lambda latitude:(latitude-min_latitude)/delta_latitude - 0.5\n",
"norm_longitude = lambda longitude:(longitude-min_longitude)/delta_longitude - 0.5\n",
"\n",
"fc = [tf.feature_column.numeric_column('latitude', normalizer_fn = norm_latitude), \n",
" tf.feature_column.numeric_column('longitude', normalizer_fn = norm_longitude)]\n",
"\n",
"# Build and train the Estimator\n",
"est = tf.estimator.DNNRegressor(\n",
" hidden_units=[100,100], \n",
" feature_columns=fc, \n",
" model_dir = os.path.join(logdir,'DNN'))\n",
"\n",
"est.train(in_fn(train_ds), steps = 5000)\n",
"est.evaluate(in_fn(test_ds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 388,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1002,
"status": "ok",
"timestamp": 1508962552800,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "WTcIX78Tk-NV",
"outputId": "8d08be57-7c37-4268-e38a-9675978567ff"
},
"outputs": [],
"source": [
"plot_est(est)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "WIlTd5VEk-NZ"
},
"source": [
"# Using `bucketized_column`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 2010,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9805,
"status": "ok",
"timestamp": 1508962565572,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "NdKt4g2Kk-NZ",
"outputId": "a72e8a55-0239-4b42-e66b-a57ba79fb47b"
},
"outputs": [],
"source": [
"# Bucketize the latitude and longitude usig the `edges`\n",
"latitude_bucket_fc = tf.feature_column.bucketized_column(\n",
" tf.feature_column.numeric_column('latitude'), \n",
" list(latitude_edges))\n",
"\n",
"longitude_bucket_fc = tf.feature_column.bucketized_column(\n",
" tf.feature_column.numeric_column('longitude'),\n",
" list(longitude_edges))\n",
"\n",
"fc = [\n",
" latitude_bucket_fc,\n",
" longitude_bucket_fc]\n",
"\n",
"# Build and train the Estimator.\n",
"est = tf.estimator.LinearRegressor(fc, model_dir = os.path.join(logdir,'separable'))\n",
"est.train(in_fn(train_ds), steps = 5000)\n",
"est.evaluate(in_fn(test_ds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 388,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1152,
"status": "ok",
"timestamp": 1508962568631,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "j4lhsk5rk-Nc",
"outputId": "f12b5ba5-1d28-46b2-b1d6-322a60ba40d9"
},
"outputs": [],
"source": [
"plot_est(est)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "G-QLbA0hKBbY"
},
"source": [
"# Using `crossed_column` on its own.\n",
"The single-cell \"holes\" in the figure are caused by cells which do not contain examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 2010,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 11707,
"status": "ok",
"timestamp": 1508962619513,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "JoIXtYykKJei",
"outputId": "08e45310-f006-454a-8bb4-4f14466c5013"
},
"outputs": [],
"source": [
"# Cross the bucketized columns, using 5000 hash bins (for an average weight sharing of 2).\n",
"crossed_lat_lon_fc = tf.feature_column.crossed_column(\n",
" [latitude_bucket_fc, longitude_bucket_fc], int(5e3))\n",
"\n",
"fc = [crossed_lat_lon_fc]\n",
"\n",
"# Build and train the Estimator.\n",
"est = tf.estimator.LinearRegressor(fc, model_dir=os.path.join(logdir, 'crossed'))\n",
"\n",
"est.train(in_fn(train_ds), steps = 5000)\n",
"est.evaluate(in_fn(test_ds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 388,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1112,
"status": "ok",
"timestamp": 1508962600110,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "R-itu9itLe0K",
"outputId": "449a6d84-c6f5-4582-f8e3-0f007ae68e8a",
"scrolled": false
},
"outputs": [],
"source": [
"plot_est(est)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "KHJ_KsRUk-Nj"
},
"source": [
"# Using raw categories with `crossed_column` \n",
"The model generalizes better if it also has access to the raw categories, outside of the cross. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 2010,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 13233,
"status": "ok",
"timestamp": 1508963622115,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "ukHo6NrTk-Nk",
"outputId": "12fba55e-c496-4007-b7b7-11b1aad38ba8"
},
"outputs": [],
"source": [
"fc = [\n",
" latitude_bucket_fc,\n",
" longitude_bucket_fc,\n",
" crossed_lat_lon_fc]\n",
"\n",
"# Build and train the Estimator.\n",
"est = tf.estimator.LinearRegressor(fc, model_dir=os.path.join(logdir, 'both'))\n",
"est.train(in_fn(train_ds), steps = 5000)\n",
"est.evaluate(in_fn(test_ds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 388,
"output_extras": [
{},
{}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1307,
"status": "ok",
"timestamp": 1508963623450,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "QjOwalvDk-Nm",
"outputId": "f0a520a1-8253-494e-f3c8-2e6df168100a"
},
"outputs": [],
"source": [
"plot_est(est)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Mx66A5ETk-Ns"
},
"source": [
"# Open TensorBoard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"base_uri": "https://localhost:8080/",
"height": 820,
"output_extras": [
{
"item_id": 1
}
]
},
"colab_type": "code",
"executionInfo": {
"elapsed": 478,
"status": "ok",
"timestamp": 1508986589529,
"user": {
"displayName": "Mark Daoust",
"photoUrl": "//lh5.googleusercontent.com/-2bdrhkqhwhc/AAAAAAAAAAI/AAAAAAAAAYY/WEdKp4OXSFY/s50-c-k-no/photo.jpg",
"userId": "106546680081284977106"
},
"user_tz": 240
},
"id": "fESYrJamm_Z5",
"outputId": "d982a677-a217-491b-ef85-93f932e6afc5"
},
"outputs": [],
"source": [
"%%html\n",
"<iframe width=\"900\" height=\"800\" src=\"http://0.0.0.0:6006#scalars&_smoothingWeight=0.85\" frameborder=\"0\"></iframe>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
},
"colab_type": "code",
"collapsed": true,
"id": "_YHrJneHnA9K"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"default_view": {},
"name": "Housing Prices (1).ipynb",
"provenance": [],
"version": "0.3.2",
"views": {}
},
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 1
}