"megatron/git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "9e0ee6fd24b23ebf07cbea10a7582aa8aaa10de9"
Commit 04c81871 authored by Younghee Kwon, committed by Yanhui Liang

Exemplify csv handling in serving for boosted_trees model. (#4401)

* Exemplify csv handling in serving for boosted_trees model by using a custom-built signature_def.

* Some minor touches.

* Reverted to using the file instead of the module in the example.
parent c519da2c
@@ -74,15 +74,36 @@ saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
```
### Inference
Let's use the model to predict the classes of two examples.
Note that this model exports a SavedModel with a custom parsing module that accepts csv lines as features. (Each line is one example with 28 columns; be careful not to include a label column, unlike in the training data.)
```
saved_model_cli run --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
  --tag_set serve --signature_def="predict" \
  --input_exprs='inputs=["0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,1.138904,-1.578198,-1.046985,0.0,0.657930,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678", "1.595839,-0.607811,0.007075,1.818450,-0.111906,0.847550,-0.566437,1.581239,2.173076,0.755421,0.643110,1.426367,0.0,0.921661,-1.190432,-1.615589,0.0,0.651114,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818"]'
```
This will print out the predicted classes and class probabilities, something like:
```
Result for output key class_ids:
[[1]
[0]]
Result for output key classes:
[['1']
['0']]
Result for output key logistic:
[[0.6440273 ]
[0.10902369]]
Result for output key logits:
[[ 0.59288704]
[-2.1007526 ]]
Result for output key probabilities:
[[0.3559727 0.6440273]
[0.8909763 0.1090237]]
```
Note that the "predict" signature_def produces more detailed results than "classification" or "serving_default".
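The same signature can also be exercised from Python. Below is a minimal sketch, assuming TF 1.x and a hypothetical timestamped export directory, using `tf.contrib.predictor` as a convenience wrapper around the SavedModel:
```
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model(
    "/tmp/higgs_boosted_trees_saved_model/1526639448",  # hypothetical path
    signature_def_key="predict")
# One csv line per example: 28 comma-separated features, no label column.
csv_line = ("0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,"
            "-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,"
            "1.138904,-1.578198,-1.046985,0.0,0.657930,-0.010455,-0.045767,"
            "3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,"
            "0.876678")
print(predict_fn({"inputs": [csv_line]}))
```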
## Additional Links
@@ -64,9 +64,9 @@ def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
    with tf.gfile.Open(npz_filename, "rb") as npz_file:
      with np.load(npz_file) as npz:
        data = npz["data"]
  except tf.errors.NotFoundError as e:
    raise RuntimeError(
        "Error loading data; use data_download.py to prepare the data.\n{}: {}"
        .format(type(e).__name__, e))
  return (data[train_start:train_start+train_count],
          data[eval_start:eval_start+eval_count])
@@ -91,6 +91,7 @@ def make_inputs_from_np_arrays(features_np, label_np):
  Returns:
    input_fn: A function returning a Dataset of feature dict and label.
    feature_names: A list of feature names.
    feature_columns: A list of tf.feature_column.BucketizedColumn.
  """
  num_features = features_np.shape[1]
@@ -127,7 +128,7 @@ def make_inputs_from_np_arrays(features_np, label_np):
    return tf.data.Dataset.zip((tf.data.Dataset.from_tensors(features),
                                tf.data.Dataset.from_tensors(label_np),))

  return input_fn, feature_names, bucketized_columns

def make_eval_inputs_from_np_arrays(features_np, label_np):
@@ -149,6 +150,31 @@ def make_eval_inputs_from_np_arrays(features_np, label_np):
  return input_fn

def _make_csv_serving_input_receiver_fn(column_names, column_defaults):
  """Returns serving_input_receiver_fn for csv.

  The input arguments are relevant to `tf.decode_csv()`.

  Args:
    column_names: a list of column names in the order within input csv.
    column_defaults: a list of default values with the same size as
        column_names. Each entry must be either a list of one scalar, or an
        empty list to denote the corresponding column is required.
        e.g. [[""], [2.5], []] indicates the third column is required while
        the first column must be string and the second must be float/double.

  Returns:
    a serving_input_receiver_fn that handles csv for serving.
  """
  def serving_input_receiver_fn():
    csv = tf.placeholder(dtype=tf.string, shape=[None], name="csv")
    features = dict(zip(column_names, tf.decode_csv(csv, column_defaults)))
    receiver_tensors = {"inputs": csv}
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

  return serving_input_receiver_fn
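
# --- Editor's aside: a hedged usage sketch, not part of this commit. ---
# It demonstrates how `column_defaults` drives `tf.decode_csv()`: a
# one-element list supplies a fallback for empty fields, while an empty list
# marks the column as required. TF 1.x assumed; values are illustrative.
import tensorflow as tf

with tf.Session() as sess:
  lines = tf.constant(["0.5,1.5,2.5", "0.25,,2.0"])
  # Three float columns with default 0.0 each; the empty middle field in the
  # second line decodes to 0.0 instead of raising an error.
  cols = tf.decode_csv(lines, record_defaults=[[0.0], [0.0], [0.0]])
  print(sess.run(cols))  # three float32 arrays of shape (2,)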

def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.
@@ -164,9 +190,8 @@ def train_boosted_trees(flags_obj):
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))

  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
@@ -202,11 +227,14 @@ def train_boosted_trees(flags_obj):
  # Benchmark the evaluation results
  benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel with csv parsing.
  if flags_obj.export_dir is not None:
    classifier.export_savedmodel(
        flags_obj.export_dir,
        _make_csv_serving_input_receiver_fn(
            column_names=feature_names,
            # columns are all floats.
            column_defaults=[[0.0]] * len(feature_names)))
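
# --- Editor's aside: a hedged verification sketch, not part of this commit. ---
# `export_savedmodel` returns the timestamped export path; reloading the
# SavedModel confirms that the "predict" signature takes a single string
# input named "inputs" (the raw csv placeholder). TF 1.x assumed; the path
# below is hypothetical.
import tensorflow as tf

export_path = "/tmp/higgs_boosted_trees_saved_model/1526639448"  # hypothetical
with tf.Session(graph=tf.Graph()) as sess:
  meta_graph = tf.saved_model.loader.load(
      sess, [tf.saved_model.tag_constants.SERVING], export_path)
  print(meta_graph.signature_def["predict"].inputs)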

def main(_):
@@ -73,8 +73,13 @@ class BaseTest(tf.test.TestCase):
    train_data, _ = train_higgs.read_higgs_data(
        self.data_dir,
        train_start=0, train_count=15, eval_start=15, eval_count=5)
    (input_fn, feature_names,
     feature_columns) = train_higgs.make_inputs_from_np_arrays(
         features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
    # Check feature_names.
    self.assertAllEqual(feature_names,
                        ["feature_%02d" % (i+1) for i in range(28)])

    # Check feature columns.
    self.assertEqual(28, len(feature_columns))
@@ -86,7 +91,6 @@ class BaseTest(tf.test.TestCase):
      self.assertIsInstance(feature_column, bucketized_column_type)
      # At least 2 boundaries.
      self.assertGreaterEqual(len(feature_column.boundaries), 2)
    # Tests that the source column names of the bucketized columns match.
    self.assertAllEqual(feature_names,
                        [col.source_column.name for col in feature_columns])