".github/git@developer.sourcefind.cn:change/sglang.git" did not exist on "6c856b4f3a4e63a25f5adc3388bf79ac2a6e4f72"
Unverified Commit 191d99a7 authored by Yanhui Liang, committed by GitHub

Make boosted_trees Garden-official (#4377)

* Make boosted_trees Garden-official

* Fix nits
parent 1886043f
official/boosted_trees/README.md

@@ -39,7 +39,7 @@ Note that the model_dir is cleaned up before every time training starts.
 Model parameters can be adjusted by flags, like `--n_trees`, `--max_depth`, `--learning_rate` and so on. Check out the code for details.
-The final accuacy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
+The final accuracy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
 By default, the first 1 million examples among 11 millions are used for training, and the last 1 million examples are used for evaluation.
 The training/evaluation data can be selected as index ranges by flags `--train_start`, `--train_count`, `--eval_start`, `--eval_count`, etc.
...
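The README hunk above describes how the index-range flags select training and evaluation rows out of the 11M-row HIGGS matrix. As a hedged illustration (not part of the commit; the helper name `select_ranges` and the toy array are made up for the example), this is how such index ranges behave, mirroring `read_higgs_data()` further down in the diff:

```python
import numpy as np

def select_ranges(data, train_start=0, train_count=1000000,
                  eval_start=10000000, eval_count=1000000):
  """Slices (train, eval) ranges out of the full data array.

  Defaults mirror the README: first 1M rows for training, last 1M for eval.
  """
  train = data[train_start:train_start + train_count]
  evaluation = data[eval_start:eval_start + eval_count]
  return train, evaluation

# Toy stand-in for the real 11,000,000 x 29 matrix (label + 28 features).
fake = np.zeros((100, 29), dtype=np.float32)
train, evaluation = select_ranges(fake, train_start=0, train_count=80,
                                  eval_start=80, eval_count=20)
print(train.shape, evaluation.shape)  # (80, 29) (20, 29)
```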
official/boosted_trees/data_download.py

@@ -12,28 +12,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
+import gzip
 import os
-import sys
 import tempfile
 
-# pylint: disable=g-bad-import-order
 import numpy as np
 import pandas as pd
 from six.moves import urllib
+from absl import app as absl_app
+from absl import flags
 import tensorflow as tf
 
-URL_ROOT = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280'
-INPUT_FILE = 'HIGGS.csv.gz'
-NPZ_FILE = 'HIGGS.csv.gz.npz'  # numpy compressed file to contain 'data' array.
-
-
-def parse_args():
-  """Parses arguments and returns a tuple (known_args, unparsed_args)."""
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data_dir', type=str, default='/tmp/higgs_data',
-      help='Directory to download higgs dataset and store training/eval data.')
-  return parser.parse_known_args()
+from official.utils.flags import core as flags_core
+
+URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
+INPUT_FILE = "HIGGS.csv.gz"
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file to contain "data" array.
 
 
 def _download_higgs_data_and_save_npz(data_dir):
@@ -41,30 +36,30 @@ def _download_higgs_data_and_save_npz(data_dir):
   input_url = os.path.join(URL_ROOT, INPUT_FILE)
   np_filename = os.path.join(data_dir, NPZ_FILE)
   if tf.gfile.Exists(np_filename):
-    raise ValueError('data_dir already has the processed data file: {}'.format(
+    raise ValueError("data_dir already has the processed data file: {}".format(
         np_filename))
   if not tf.gfile.Exists(data_dir):
     tf.gfile.MkDir(data_dir)
   # 2.8 GB to download.
   try:
-    print('Data downloading..')
+    tf.logging.info("Data downloading...")
     temp_filename, _ = urllib.request.urlretrieve(input_url)
     # Reading and parsing 11 million csv lines takes 2~3 minutes.
-    print('Data processing.. taking multiple minutes..')
-    data = pd.read_csv(
-        temp_filename,
-        dtype=np.float32,
-        names=['c%02d' % i for i in range(29)]  # label + 28 features.
-    ).as_matrix()
+    tf.logging.info("Data processing... taking multiple minutes...")
+    with gzip.open(temp_filename, "rb") as csv_file:
+      data = pd.read_csv(
+          csv_file,
+          dtype=np.float32,
+          names=["c%02d" % i for i in range(29)]  # label + 28 features.
+      ).as_matrix()
   finally:
-    os.remove(temp_filename)
+    tf.gfile.Remove(temp_filename)
   # Writing to temporary location then copy to the data_dir (0.8 GB).
   f = tempfile.NamedTemporaryFile()
   np.savez_compressed(f, data=data)
   tf.gfile.Copy(f.name, np_filename)
-  print('Data saved to: {}'.format(np_filename))
+  tf.logging.info("Data saved to: {}".format(np_filename))
 
 
 def main(unused_argv):
@@ -73,6 +68,16 @@ def main(unused_argv):
   _download_higgs_data_and_save_npz(FLAGS.data_dir)
 
 
-if __name__ == '__main__':
-  FLAGS, unparsed = parse_args()
-  tf.app.run(argv=[sys.argv[0]] + unparsed)
+def define_data_download_flags():
+  """Add flags specifying data download arguments."""
+  flags.DEFINE_string(
+      name="data_dir", default="/tmp/higgs_data",
+      help=flags_core.help_wrap(
+          "Directory to download higgs dataset and store training/eval data."))
+
+
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  define_data_download_flags()
+  FLAGS = flags.FLAGS
+  absl_app.run(main)
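As a quick sanity check after running data_download.py, a hedged sketch (not in the commit; the path assumes the default `--data_dir`) that loads the compressed NPZ and prints its shape:

```python
import os
import numpy as np

data_dir = "/tmp/higgs_data"  # default --data_dir defined above
npz_path = os.path.join(data_dir, "HIGGS.csv.gz.npz")

# np.load on an .npz returns an archive object; "data" is the array written
# by np.savez_compressed in _download_higgs_data_and_save_npz.
with np.load(npz_path) as npz:
  data = npz["data"]
print(data.shape, data.dtype)  # roughly (11000000, 29) float32 if the download succeeded
```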
official/boosted_trees/train_higgs.py

@@ -29,64 +29,44 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
 import os
-import sys
 
+# pylint: disable=g-bad-import-order
+import numpy as np
 from absl import app as absl_app
 from absl import flags
-import numpy as np  # pylint: disable=wrong-import-order
-import tensorflow as tf  # pylint: disable=wrong-import-order
+import tensorflow as tf
+# pylint: enable=g-bad-import-order
 
 from official.utils.flags import core as flags_core
 from official.utils.flags._conventions import help_wrap
+from official.utils.logs import logger
 
-NPZ_FILE = 'HIGGS.csv.gz.npz'  # numpy compressed file containing 'data' array
-
-
-def define_train_higgs_flags():
-  """Add tree related flags as well as training/eval configuration."""
-  flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
-  flags.adopt_module_key_flags(flags_core)
-
-  flags.DEFINE_integer(
-      name='train_start', default=0,
-      help=help_wrap('Start index of train examples within the data.'))
-  flags.DEFINE_integer(
-      name='train_count', default=1000000,
-      help=help_wrap('Number of train examples within the data.'))
-  flags.DEFINE_integer(
-      name='eval_start', default=10000000,
-      help=help_wrap('Start index of eval examples within the data.'))
-  flags.DEFINE_integer(
-      name='eval_count', default=1000000,
-      help=help_wrap('Number of eval examples within the data.'))
-
-  flags.DEFINE_integer(
-      'n_trees', default=100, help=help_wrap('Number of trees to build.'))
-  flags.DEFINE_integer(
-      'max_depth', default=6, help=help_wrap('Maximum depths of each tree.'))
-  flags.DEFINE_float(
-      'learning_rate', default=0.1,
-      help=help_wrap('Maximum depths of each tree.'))
-
-  flags_core.set_defaults(data_dir='/tmp/higgs_data',
-                          model_dir='/tmp/higgs_model')
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file containing "data" array
 
 
 def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
-  """Reads higgs data from csv and returns train and eval data."""
+  """Reads higgs data from csv and returns train and eval data.
+
+  Args:
+    data_dir: A string, the directory of higgs dataset.
+    train_start: An integer, the start index of train examples within the data.
+    train_count: An integer, the number of train examples within the data.
+    eval_start: An integer, the start index of eval examples within the data.
+    eval_count: An integer, the number of eval examples within the data.
+
+  Returns:
+    Numpy array of train data and eval data.
+  """
   npz_filename = os.path.join(data_dir, NPZ_FILE)
   try:
     # gfile allows numpy to read data from network data sources as well.
-    with tf.gfile.Open(npz_filename, 'rb') as npz_file:
+    with tf.gfile.Open(npz_filename, "rb") as npz_file:
       with np.load(npz_file) as npz:
-        data = npz['data']
+        data = npz["data"]
   except Exception as e:
     raise RuntimeError(
-        'Error loading data; use data_download.py to prepare the data:\n{}: {}'
+        "Error loading data; use data_download.py to prepare the data:\n{}: {}"
         .format(type(e).__name__, e))
   return (data[train_start:train_start+train_count],
           data[eval_start:eval_start+eval_count])
@@ -105,18 +85,18 @@ def make_inputs_from_np_arrays(features_np, label_np):
     as a single tensor. Don't use batch.
 
   Args:
-    features_np: a numpy ndarray (shape=[batch_size, num_features]) for
+    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
       float32 features.
-    label_np: a numpy ndarray (shape=[batch_size, 1]) for labels.
+    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.
 
   Returns:
-    input_fn: a function returning a Dataset of feature dict and label.
-    feature_column: a list of tf.feature_column.BucketizedColumn.
+    input_fn: A function returning a Dataset of feature dict and label.
+    feature_column: A list of tf.feature_column.BucketizedColumn.
   """
   num_features = features_np.shape[1]
   features_np_list = np.split(features_np, num_features, axis=1)
   # 1-based feature names.
-  feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
 
   # Create source feature_columns and bucketized_columns.
   def get_bucket_boundaries(feature):
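The hunk above ends where `get_bucket_boundaries` is defined. A hedged sketch of percentile-based bucketization in that spirit (illustrative only; the repository's actual boundary logic may differ):

```python
import numpy as np
import tensorflow as tf

def get_bucket_boundaries(feature):
  """Returns sorted, de-duplicated percentile cut points for one feature."""
  return np.unique(np.percentile(feature, range(0, 100))).tolist()

feature = np.random.rand(1000).astype(np.float32)
boundaries = get_bucket_boundaries(feature)
# Each source numeric column is wrapped into a bucketized column, as the
# tests later assert (at least 2 boundaries per column).
column = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column("feature_01"), boundaries=boundaries)
print(len(boundaries), type(column).__name__)
```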
@@ -155,16 +135,16 @@ def make_eval_inputs_from_np_arrays(features_np, label_np):
   num_features = features_np.shape[1]
   features_np_list = np.split(features_np, num_features, axis=1)
   # 1-based feature names.
-  feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
 
   def input_fn():
     features = {
         feature_name: tf.constant(features_np_list[i])
         for i, feature_name in enumerate(feature_names)
     }
-    return tf.data.Dataset.zip(
-        (tf.data.Dataset.from_tensor_slices(features),
-         tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
+    return tf.data.Dataset.zip((
+        tf.data.Dataset.from_tensor_slices(features),
+        tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
 
   return input_fn
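For reference, a hedged sketch (assumes TF 1.x and toy data; not from the commit) of how the `Dataset.zip` pattern above yields batches of a feature dict paired with labels:

```python
import numpy as np
import tensorflow as tf

features_np = np.random.rand(10, 3).astype(np.float32)
label_np = np.random.randint(0, 2, size=(10, 1)).astype(np.float32)

feature_names = ["feature_%02d" % (i + 1) for i in range(3)]
features_np_list = np.split(features_np, 3, axis=1)

def input_fn():
  features = {
      name: tf.constant(features_np_list[i])
      for i, name in enumerate(feature_names)
  }
  return tf.data.Dataset.zip((
      tf.data.Dataset.from_tensor_slices(features),
      tf.data.Dataset.from_tensor_slices(label_np))).batch(4)

batch = input_fn().make_one_shot_iterator().get_next()
with tf.Session() as sess:
  feats, labels = sess.run(batch)
print(sorted(feats.keys()), labels.shape)  # three feature keys, labels (4, 1)
```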
@@ -175,22 +155,37 @@ def train_boosted_trees(flags_obj):
 
   Args:
     flags_obj: An object containing parsed flag values.
   """
   # Clean up the model directory if present.
   if tf.gfile.Exists(flags_obj.model_dir):
     tf.gfile.DeleteRecursively(flags_obj.model_dir)
-  print('## data loading..')
+  tf.logging.info("## Data loading...")
   train_data, eval_data = read_higgs_data(
       flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
       flags_obj.eval_start, flags_obj.eval_count)
-  print('## data loaded; train: {}{}, eval: {}{}'.format(
+  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
       train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
-  # data consists of one label column and 28 feature columns following.
+  # Data consists of one label column followed by 28 feature columns.
   train_input_fn, feature_columns = make_inputs_from_np_arrays(
       features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
   eval_input_fn = make_eval_inputs_from_np_arrays(
       features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
-  print('## features prepared. training starts..')
+  tf.logging.info("## Features prepared. Training starts...")
+
+  # Create benchmark logger to log info about the training and metric values
+  run_params = {
+      "train_start": flags_obj.train_start,
+      "train_count": flags_obj.train_count,
+      "eval_start": flags_obj.eval_start,
+      "eval_count": flags_obj.eval_count,
+      "n_trees": flags_obj.n_trees,
+      "max_depth": flags_obj.max_depth,
+  }
+  benchmark_logger = logger.config_benchmark_logger(flags_obj)
+  benchmark_logger.log_run_info(
+      model_name="boosted_trees",
+      dataset_name="higgs",
+      run_params=run_params)
 
   # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
   # training is yet provided as a contrib library.
@@ -203,7 +198,9 @@ def train_boosted_trees(flags_obj):
       learning_rate=flags_obj.learning_rate)
 
   # Evaluation.
-  eval_result = classifier.evaluate(eval_input_fn)
+  eval_results = classifier.evaluate(eval_input_fn)
+  # Benchmark the evaluation results
+  benchmark_logger.log_evaluation_result(eval_results)
 
   # Exporting the savedmodel.
   if flags_obj.export_dir is not None:
@@ -216,7 +213,37 @@ def main(_):
   train_boosted_trees(flags.FLAGS)
 
 
-if __name__ == '__main__':
+def define_train_higgs_flags():
+  """Add tree related flags as well as training/eval configuration."""
+  flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_integer(
+      name="train_start", default=0,
+      help=help_wrap("Start index of train examples within the data."))
+  flags.DEFINE_integer(
+      name="train_count", default=1000000,
+      help=help_wrap("Number of train examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_start", default=10000000,
+      help=help_wrap("Start index of eval examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_count", default=1000000,
+      help=help_wrap("Number of eval examples within the data."))
+
+  flags.DEFINE_integer(
+      "n_trees", default=100, help=help_wrap("Number of trees to build."))
+  flags.DEFINE_integer(
+      "max_depth", default=6, help=help_wrap("Maximum depths of each tree."))
+  flags.DEFINE_float(
+      "learning_rate", default=0.1,
+      help=help_wrap("The learning rate."))
+
+  flags_core.set_defaults(data_dir="/tmp/higgs_data",
+                          model_dir="/tmp/higgs_model")
+
+
+if __name__ == "__main__":
   # Training progress and eval results are shown as logging.INFO; so enables it.
   tf.logging.set_verbosity(tf.logging.INFO)
   define_train_higgs_flags()
...
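The hunks above route training through the contrib in-memory path mentioned in the comment ("faster in-memory training is yet provided as a contrib library"). A hedged sketch with toy data (assumes TF 1.x with tf.contrib available; the feature names f1/f2 and the tiny dataset are made up for illustration):

```python
import numpy as np
import tensorflow as tf

features_np = np.random.rand(64, 2).astype(np.float32)
labels_np = np.random.randint(0, 2, size=(64, 1)).astype(np.float32)

def train_input_fn():
  features = {"f1": tf.constant(features_np[:, 0:1]),
              "f2": tf.constant(features_np[:, 1:2])}
  # In-memory training consumes the whole dataset as a single element.
  return tf.data.Dataset.zip((
      tf.data.Dataset.from_tensors(features),
      tf.data.Dataset.from_tensors(labels_np)))

feature_columns = [
    tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column(name), boundaries=[0.25, 0.5, 0.75])
    for name in ("f1", "f2")]

classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
    train_input_fn, feature_columns,
    n_trees=5, max_depth=3, learning_rate=0.1)
print(classifier.evaluate(train_input_fn))
```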
official/boosted_trees/train_higgs_test.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Tests for boosted_tree."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,16 +22,16 @@ import tempfile
 import numpy as np
 import pandas as pd
-import tensorflow as tf  # pylint: disable=g-bad-import-order
+import tensorflow as tf
 
+# pylint: disable=g-bad-import-order
+from official.utils.testing import integration
 from official.boosted_trees import train_higgs
-from official.utils.testing import integration
 
-tf.logging.set_verbosity(tf.logging.ERROR)
+TEST_CSV = os.path.join(os.path.dirname(__file__), "train_higgs_test.csv")
 
-TEST_CSV = os.path.join(os.path.dirname(__file__), 'train_higgs_test.csv')
+tf.logging.set_verbosity(tf.logging.ERROR)
 
 
 class BaseTest(tf.test.TestCase):
   """Tests for Wide Deep model."""
@@ -45,7 +45,7 @@ class BaseTest(tf.test.TestCase):
     # Create temporary CSV file
     self.data_dir = self.get_temp_dir()
     data = pd.read_csv(
-        TEST_CSV, dtype=np.float32, names=['c%02d' % i for i in range(29)]
+        TEST_CSV, dtype=np.float32, names=["c%02d" % i for i in range(29)]
     ).as_matrix()
     self.input_npz = os.path.join(self.data_dir, train_higgs.NPZ_FILE)
     # numpy.savez doesn't take gfile.Gfile, so need to write down and copy.
@@ -56,9 +56,9 @@ class BaseTest(tf.test.TestCase):
   def test_read_higgs_data(self):
     """Tests read_higgs_data() function."""
     # Error when a wrong data_dir is given.
-    with self.assertRaisesRegexp(RuntimeError, 'Error loading data.*'):
+    with self.assertRaisesRegexp(RuntimeError, "Error loading data.*"):
       train_data, eval_data = train_higgs.read_higgs_data(
-          self.data_dir + 'non-existing-path',
+          self.data_dir + "non-existing-path",
           train_start=0, train_count=15, eval_start=15, eval_count=5)
 
     # Loading fine with the correct data_dir.
@@ -80,13 +80,13 @@ class BaseTest(tf.test.TestCase):
     self.assertEqual(28, len(feature_columns))
     bucketized_column_type = type(
         tf.feature_column.bucketized_column(
-            tf.feature_column.numeric_column('feature_01'),
+            tf.feature_column.numeric_column("feature_01"),
             boundaries=[0, 1, 2]))  # dummy boundaries.
     for feature_column in feature_columns:
       self.assertIsInstance(feature_column, bucketized_column_type)
       # At least 2 boundaries.
       self.assertGreaterEqual(len(feature_column.boundaries), 2)
 
-    feature_names = ['feature_%02d' % (i+1) for i in range(28)]
+    feature_names = ["feature_%02d" % (i+1) for i in range(28)]
     # Tests that the source column names of the bucketized columns match.
     self.assertAllEqual(feature_names,
                         [col.source_column.name for col in feature_columns])
@@ -113,39 +113,39 @@ class BaseTest(tf.test.TestCase):
 
   def test_end_to_end(self):
     """Tests end-to-end running."""
-    model_dir = os.path.join(self.get_temp_dir(), 'model')
+    model_dir = os.path.join(self.get_temp_dir(), "model")
     integration.run_synthetic(
         main=train_higgs.main, tmp_root=self.get_temp_dir(), extra_flags=[
-            '--data_dir', self.data_dir,
-            '--model_dir', model_dir,
-            '--n_trees', '5',
-            '--train_start', '0',
-            '--train_count', '12',
-            '--eval_start', '12',
-            '--eval_count', '8',
+            "--data_dir", self.data_dir,
+            "--model_dir", model_dir,
+            "--n_trees", "5",
+            "--train_start", "0",
+            "--train_count", "12",
+            "--eval_start", "12",
+            "--eval_count", "8",
         ],
         synth=False, max_train=None)
-    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, 'checkpoint')))
+    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
 
   def test_end_to_end_with_export(self):
     """Tests end-to-end running."""
-    model_dir = os.path.join(self.get_temp_dir(), 'model')
-    export_dir = os.path.join(self.get_temp_dir(), 'export')
+    model_dir = os.path.join(self.get_temp_dir(), "model")
+    export_dir = os.path.join(self.get_temp_dir(), "export")
     integration.run_synthetic(
         main=train_higgs.main, tmp_root=self.get_temp_dir(), extra_flags=[
-            '--data_dir', self.data_dir,
-            '--model_dir', model_dir,
-            '--export_dir', export_dir,
-            '--n_trees', '5',
-            '--train_start', '0',
-            '--train_count', '12',
-            '--eval_start', '12',
-            '--eval_count', '8',
+            "--data_dir", self.data_dir,
+            "--model_dir", model_dir,
+            "--export_dir", export_dir,
+            "--n_trees", "5",
+            "--train_start", "0",
+            "--train_count", "12",
+            "--eval_start", "12",
+            "--eval_count", "8",
         ],
         synth=False, max_train=None)
-    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, 'checkpoint')))
+    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
     self.assertTrue(tf.gfile.Exists(os.path.join(export_dir)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   tf.test.main()