Commit 9dafea91 authored by sunxx1

Merge branch 'qianyj_tf' into 'main'

update tf code

See merge request dcutoolkit/deeplearing/dlexamples_new!35
parents 92a2ca36 a4146470
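# ===== File: official/wide_deep/census_main.py (path inferred from the `from official.wide_deep import census_main` import in the test module below) =====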
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train DNN on census income dataset."""
import os
from absl import app as absl_app
from absl import flags
import tensorflow as tf
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import census_dataset
from official.wide_deep import wide_deep_run_loop
def define_census_flags():
wide_deep_run_loop.define_wide_deep_flags()
flags.adopt_module_key_flags(wide_deep_run_loop)
flags_core.set_defaults(data_dir='/tmp/census_data',
model_dir='/tmp/census_model',
train_epochs=40,
epochs_between_evals=2,
inter_op_parallelism_threads=0,
intra_op_parallelism_threads=0,
batch_size=40)
def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
"""Build an estimator appropriate for the given model type."""
wide_columns, deep_columns = model_column_fn()
hidden_units = [100, 75, 50, 25]
# Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
# trains faster than GPU for this model.
run_config = tf.estimator.RunConfig().replace(
session_config=tf.ConfigProto(device_count={'GPU': 0},
inter_op_parallelism_threads=inter_op,
intra_op_parallelism_threads=intra_op))
if model_type == 'wide':
return tf.estimator.LinearClassifier(
model_dir=model_dir,
feature_columns=wide_columns,
config=run_config)
elif model_type == 'deep':
return tf.estimator.DNNClassifier(
model_dir=model_dir,
feature_columns=deep_columns,
hidden_units=hidden_units,
config=run_config)
else:
return tf.estimator.DNNLinearCombinedClassifier(
model_dir=model_dir,
linear_feature_columns=wide_columns,
dnn_feature_columns=deep_columns,
dnn_hidden_units=hidden_units,
config=run_config)
def run_census(flags_obj):
"""Construct all necessary functions and call run_loop.
Args:
flags_obj: Object containing user specified flags.
"""
if flags_obj.download_if_missing:
census_dataset.download(flags_obj.data_dir)
train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE)
test_file = os.path.join(flags_obj.data_dir, census_dataset.EVAL_FILE)
# Train and evaluate the model every `flags.epochs_between_evals` epochs.
def train_input_fn():
return census_dataset.input_fn(
train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)
def eval_input_fn():
return census_dataset.input_fn(test_file, 1, False, flags_obj.batch_size)
tensors_to_log = {
'average_loss': '{loss_prefix}head/truediv',
'loss': '{loss_prefix}head/weighted_loss/Sum'
}
wide_deep_run_loop.run_loop(
name="Census Income", train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
model_column_fn=census_dataset.build_model_columns,
build_estimator_fn=build_estimator,
flags_obj=flags_obj,
tensors_to_log=tensors_to_log,
early_stop=True)
def main(_):
with logger.benchmark_context(flags.FLAGS):
run_census(flags.FLAGS)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
define_census_flags()
absl_app.run(main)
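# Illustrative usage (not part of the diff): assuming the standard wide_deep
# flags defined by define_wide_deep_flags(), the script above would typically
# be launched as, e.g.:
#   python census_main.py --model_type=wide_deep --data_dir=/tmp/census_data \
#       --model_dir=/tmp/census_model
# where --model_type is one of wide, deep, or wide_deep, matching the three
# branches in build_estimator().
# ===== File: official/wide_deep/census_test.csv (name inferred from the TEST_CSV constant in the test module below) =====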
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,,,2174,0,40,,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,,,0,0,13,,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,,,0,0,40,,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,,,0,0,40,,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,,,0,0,40,,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,,,0,0,40,,<=50K
49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,,,0,0,16,,<=50K
52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,,,0,0,45,,>50K
31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,,,14084,0,50,,>50K
42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,,,5178,0,40,,>50K
37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,,,0,0,80,,>50K
30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,,,0,0,40,,>50K
23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,,,0,0,30,,<=50K
32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,,,0,0,50,,<=50K
40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,,,0,0,40,,>50K
34,Private,245487,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,,,0,0,45,,<=50K
25,Self-emp-not-inc,176756,HS-grad,9,Never-married,Farming-fishing,Own-child,,,0,0,35,,<=50K
32,Private,186824,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,,,0,0,40,,<=50K
38,Private,28887,11th,7,Married-civ-spouse,Sales,Husband,,,0,0,50,,<=50K
43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,,,0,0,45,,>50K
40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,,,0,0,60,,>50K
56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,,,0,0,40,,>50K
54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,,,0,0,60,,>50K
22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,,,0,0,15,,<=50K
31,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,,,0,0,38,,>50K
57,Federal-gov,337895,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,,,0,0,40,,>50K
47,Private,51835,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,,,0,1902,60,,>50K
50,Federal-gov,251585,Bachelors,13,Divorced,Exec-managerial,Not-in-family,,,0,0,55,,>50K
25,Private,289980,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,,,0,0,35,,<=50K
42,Private,116632,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,,,0,0,45,,>50K
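# ===== File: official/wide_deep/census_test.py (name inferred; unit and end-to-end tests for census_main above) =====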
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.testing import integration
from official.wide_deep import census_dataset
from official.wide_deep import census_main
from official.wide_deep import wide_deep_run_loop
tf.logging.set_verbosity(tf.logging.ERROR)
TEST_INPUT = ('18,Self-emp-not-inc,987,Bachelors,12,Married-civ-spouse,abc,'
'Husband,zyx,wvu,34,56,78,tsr,<=50K')
TEST_INPUT_VALUES = {
'age': 18,
'education_num': 12,
'capital_gain': 34,
'capital_loss': 56,
'hours_per_week': 78,
'education': 'Bachelors',
'marital_status': 'Married-civ-spouse',
'relationship': 'Husband',
'workclass': 'Self-emp-not-inc',
'occupation': 'abc',
}
TEST_CSV = os.path.join(os.path.dirname(__file__), 'census_test.csv')
class BaseTest(tf.test.TestCase):
"""Tests for Wide Deep model."""
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(BaseTest, cls).setUpClass()
census_main.define_census_flags()
def setUp(self):
# Create temporary CSV file
self.temp_dir = self.get_temp_dir()
self.input_csv = os.path.join(self.temp_dir, 'test.csv')
with tf.gfile.Open(self.input_csv, 'w') as temp_csv:
temp_csv.write(TEST_INPUT)
with tf.gfile.Open(TEST_CSV, "r") as temp_csv:
test_csv_contents = temp_csv.read()
# Used for end-to-end tests.
for fname in [census_dataset.TRAINING_FILE, census_dataset.EVAL_FILE]:
with tf.gfile.Open(os.path.join(self.temp_dir, fname), 'w') as test_csv:
test_csv.write(test_csv_contents)
def test_input_fn(self):
dataset = census_dataset.input_fn(self.input_csv, 1, False, 1)
features, labels = dataset.make_one_shot_iterator().get_next()
with self.test_session() as sess:
features, labels = sess.run((features, labels))
# Compare the two features dictionaries.
for key in TEST_INPUT_VALUES:
self.assertTrue(key in features)
self.assertEqual(len(features[key]), 1)
feature_value = features[key][0]
# Convert from bytes to string for Python 3.
if isinstance(feature_value, bytes):
feature_value = feature_value.decode()
self.assertEqual(TEST_INPUT_VALUES[key], feature_value)
self.assertFalse(labels)
def build_and_test_estimator(self, model_type):
"""Ensure that model trains and minimizes loss."""
model = census_main.build_estimator(
self.temp_dir, model_type,
model_column_fn=census_dataset.build_model_columns,
inter_op=0, intra_op=0)
# Train for 1 step to initialize model and evaluate initial loss
def get_input_fn(num_epochs, shuffle, batch_size):
def input_fn():
return census_dataset.input_fn(
TEST_CSV, num_epochs=num_epochs, shuffle=shuffle,
batch_size=batch_size)
return input_fn
model.train(input_fn=get_input_fn(1, True, 1), steps=1)
initial_results = model.evaluate(input_fn=get_input_fn(1, False, 1))
# Train for 100 epochs at batch size 3 and evaluate final loss
model.train(input_fn=get_input_fn(100, True, 3))
final_results = model.evaluate(input_fn=get_input_fn(1, False, 1))
print('%s initial results:' % model_type, initial_results)
print('%s final results:' % model_type, final_results)
# Ensure loss has decreased, while accuracy and both AUCs have increased.
self.assertLess(final_results['loss'], initial_results['loss'])
self.assertGreater(final_results['auc'], initial_results['auc'])
self.assertGreater(final_results['auc_precision_recall'],
initial_results['auc_precision_recall'])
self.assertGreater(final_results['accuracy'], initial_results['accuracy'])
def test_wide_deep_estimator_training(self):
self.build_and_test_estimator('wide_deep')
def test_end_to_end_wide(self):
integration.run_synthetic(
main=census_main.main, tmp_root=self.get_temp_dir(),
extra_flags=[
'--data_dir', self.get_temp_dir(),
'--model_type', 'wide',
'--download_if_missing=false'
],
synth=False, max_train=None)
def test_end_to_end_deep(self):
integration.run_synthetic(
main=census_main.main, tmp_root=self.get_temp_dir(),
extra_flags=[
'--data_dir', self.get_temp_dir(),
'--model_type', 'deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
def test_end_to_end_wide_deep(self):
integration.run_synthetic(
main=census_main.main, tmp_root=self.get_temp_dir(),
extra_flags=[
'--data_dir', self.get_temp_dir(),
'--model_type', 'wide_deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
if __name__ == '__main__':
tf.test.main()
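# Illustrative usage (not part of the diff): these are ordinary
# tf.test.TestCase tests and can be run directly, e.g.:
#   python census_test.py
# ===== File: official/wide_deep/movielens_dataset.py (path inferred from the `from official.wide_deep import movielens_dataset` import below) =====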
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prepare MovieLens dataset for wide-deep."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
# pylint: disable=wrong-import-order
from absl import app as absl_app
from absl import flags
import numpy as np
import tensorflow as tf
# pylint: enable=wrong-import-order
from official.datasets import movielens
from official.utils.data import file_io
from official.utils.flags import core as flags_core
_BUFFER_SUBDIR = "wide_deep_buffer"
_FEATURE_MAP = {
movielens.USER_COLUMN: tf.FixedLenFeature([1], dtype=tf.int64),
movielens.ITEM_COLUMN: tf.FixedLenFeature([1], dtype=tf.int64),
movielens.TIMESTAMP_COLUMN: tf.FixedLenFeature([1], dtype=tf.int64),
movielens.GENRE_COLUMN: tf.FixedLenFeature(
[movielens.N_GENRE], dtype=tf.int64),
movielens.RATING_COLUMN: tf.FixedLenFeature([1], dtype=tf.float32),
}
_BUFFER_SIZE = {
movielens.ML_1M: {"train": 107978119, "eval": 26994538},
movielens.ML_20M: {"train": 2175203810, "eval": 543802008}
}
_USER_EMBEDDING_DIM = 16
_ITEM_EMBEDDING_DIM = 64
def build_model_columns(dataset):
"""Builds a set of wide and deep feature columns."""
user_id = tf.feature_column.categorical_column_with_vocabulary_list(
movielens.USER_COLUMN, range(1, movielens.NUM_USER_IDS[dataset]))
user_embedding = tf.feature_column.embedding_column(
user_id, _USER_EMBEDDING_DIM, max_norm=np.sqrt(_USER_EMBEDDING_DIM))
item_id = tf.feature_column.categorical_column_with_vocabulary_list(
movielens.ITEM_COLUMN, range(1, movielens.NUM_ITEM_IDS))
item_embedding = tf.feature_column.embedding_column(
item_id, _ITEM_EMBEDDING_DIM, max_norm=np.sqrt(_ITEM_EMBEDDING_DIM))
time = tf.feature_column.numeric_column(movielens.TIMESTAMP_COLUMN)
genres = tf.feature_column.numeric_column(
movielens.GENRE_COLUMN, shape=(movielens.N_GENRE,), dtype=tf.uint8)
deep_columns = [user_embedding, item_embedding, time, genres]
wide_columns = []
return wide_columns, deep_columns
def _deserialize(examples_serialized):
features = tf.parse_example(examples_serialized, _FEATURE_MAP)
return features, features[movielens.RATING_COLUMN] / movielens.MAX_RATING
def _buffer_path(data_dir, dataset, name):
return os.path.join(data_dir, _BUFFER_SUBDIR,
"{}_{}_buffer".format(dataset, name))
def _df_to_input_fn(df, name, dataset, data_dir, batch_size, repeat, shuffle):
"""Serialize a dataframe and write it to a buffer file."""
buffer_path = _buffer_path(data_dir, dataset, name)
expected_size = _BUFFER_SIZE[dataset].get(name)
file_io.write_to_buffer(
dataframe=df, buffer_path=buffer_path,
columns=list(_FEATURE_MAP.keys()), expected_size=expected_size)
def input_fn():
dataset = tf.data.TFRecordDataset(buffer_path)
    # batch comes before map because map can deserialize multiple examples
    # (see the note after this function).
dataset = dataset.batch(batch_size)
dataset = dataset.map(_deserialize, num_parallel_calls=16)
if shuffle:
dataset = dataset.shuffle(shuffle)
dataset = dataset.repeat(repeat)
return dataset.prefetch(1)
return input_fn
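# Illustrative note (not part of the original file): tf.parse_example is
# vectorized, so batching *before* mapping lets _deserialize decode an entire
# batch of serialized tf.Example protos in one op:
#   dataset = tf.data.TFRecordDataset(buffer_path).batch(32).map(_deserialize)
# Mapping first would hand _deserialize one scalar record at a time, which
# would require the per-record tf.parse_single_example instead.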
def _check_buffers(data_dir, dataset):
train_path = os.path.join(data_dir, _BUFFER_SUBDIR,
"{}_{}_buffer".format(dataset, "train"))
eval_path = os.path.join(data_dir, _BUFFER_SUBDIR,
"{}_{}_buffer".format(dataset, "eval"))
if not tf.gfile.Exists(train_path) or not tf.gfile.Exists(eval_path):
return False
return all([
tf.gfile.Stat(_buffer_path(data_dir, dataset, "train")).length ==
_BUFFER_SIZE[dataset]["train"],
tf.gfile.Stat(_buffer_path(data_dir, dataset, "eval")).length ==
_BUFFER_SIZE[dataset]["eval"],
])
def construct_input_fns(dataset, data_dir, batch_size=16, repeat=1):
"""Construct train and test input functions, as well as the column fn."""
if _check_buffers(data_dir, dataset):
train_df, eval_df = None, None
else:
df = movielens.csv_to_joint_dataframe(dataset=dataset, data_dir=data_dir)
df = movielens.integerize_genres(dataframe=df)
df = df.drop(columns=[movielens.TITLE_COLUMN])
train_df = df.sample(frac=0.8, random_state=0)
eval_df = df.drop(train_df.index)
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)
train_input_fn = _df_to_input_fn(
df=train_df, name="train", dataset=dataset, data_dir=data_dir,
batch_size=batch_size, repeat=repeat,
shuffle=movielens.NUM_RATINGS[dataset])
eval_input_fn = _df_to_input_fn(
df=eval_df, name="eval", dataset=dataset, data_dir=data_dir,
batch_size=batch_size, repeat=repeat, shuffle=None)
model_column_fn = functools.partial(build_model_columns, dataset=dataset)
  # Note: eagerly constructs (and discards) the training dataset graph once.
  train_input_fn()
return train_input_fn, eval_input_fn, model_column_fn
def main(_):
movielens.download(dataset=flags.FLAGS.dataset, data_dir=flags.FLAGS.data_dir)
construct_input_fns(flags.FLAGS.dataset, flags.FLAGS.data_dir)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
movielens.define_data_download_flags()
flags.adopt_module_key_flags(movielens)
flags_core.set_defaults(dataset="ml-1m")
absl_app.run(main)
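# ===== File: official/wide_deep/movielens_main.py (name inferred; training driver for the MovieLens pipeline above) =====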
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train DNN on Kaggle movie dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app as absl_app
from absl import flags
import tensorflow as tf
from official.datasets import movielens
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import movielens_dataset
from official.wide_deep import wide_deep_run_loop
def define_movie_flags():
"""Define flags for movie dataset training."""
wide_deep_run_loop.define_wide_deep_flags()
flags.DEFINE_enum(
name="dataset", default=movielens.ML_1M,
enum_values=movielens.DATASETS, case_sensitive=False,
help=flags_core.help_wrap("Dataset to be trained and evaluated."))
flags.adopt_module_key_flags(wide_deep_run_loop)
flags_core.set_defaults(data_dir="/tmp/movielens-data/",
model_dir='/tmp/movie_model',
model_type="deep",
train_epochs=50,
epochs_between_evals=5,
inter_op_parallelism_threads=0,
intra_op_parallelism_threads=0,
batch_size=256)
@flags.validator("stop_threshold",
message="stop_threshold not supported for movielens model")
def _no_stop(stop_threshold):
return stop_threshold is None
def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
"""Build an estimator appropriate for the given model type."""
if model_type != "deep":
raise NotImplementedError("movie dataset only supports `deep` model_type")
_, deep_columns = model_column_fn()
hidden_units = [256, 256, 256, 128]
run_config = tf.estimator.RunConfig().replace(
session_config=tf.ConfigProto(device_count={'GPU': 0},
inter_op_parallelism_threads=inter_op,
intra_op_parallelism_threads=intra_op))
  return tf.estimator.DNNRegressor(
      model_dir=model_dir,
      # Pass run_config so the CPU-only/threading session settings built
      # above actually take effect (it was previously constructed but unused).
      config=run_config,
      feature_columns=deep_columns,
      hidden_units=hidden_units,
      optimizer=tf.train.AdamOptimizer(),
      activation_fn=tf.nn.sigmoid,
      dropout=0.3,
      loss_reduction=tf.losses.Reduction.MEAN)
def run_movie(flags_obj):
"""Construct all necessary functions and call run_loop.
Args:
flags_obj: Object containing user specified flags.
"""
if flags_obj.download_if_missing:
movielens.download(dataset=flags_obj.dataset, data_dir=flags_obj.data_dir)
train_input_fn, eval_input_fn, model_column_fn = \
movielens_dataset.construct_input_fns(
dataset=flags_obj.dataset, data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size, repeat=flags_obj.epochs_between_evals)
tensors_to_log = {
'loss': '{loss_prefix}head/weighted_loss/value'
}
wide_deep_run_loop.run_loop(
name="MovieLens", train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
model_column_fn=model_column_fn,
build_estimator_fn=build_estimator,
flags_obj=flags_obj,
tensors_to_log=tensors_to_log,
early_stop=False)
def main(_):
with logger.benchmark_context(flags.FLAGS):
run_movie(flags.FLAGS)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
define_movie_flags()
absl_app.run(main)
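# Illustrative usage (not part of the diff): assuming the flags defined in
# define_movie_flags(), the script above would typically be launched as, e.g.:
#   python movielens_main.py --dataset=ml-1m --data_dir=/tmp/movielens-data
# Only --model_type=deep (the default set above) is supported;
# build_estimator() raises NotImplementedError for other values.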