ModelZoo / ResNet50_tensorflow · Commit 191d99a7 (unverified)
Authored May 29, 2018 by Yanhui Liang; committed by GitHub on May 29, 2018. Parent: 1886043f.

Make boosted_trees Garden-official (#4377)

* Make boosted_trees Garden-official
* Fix nits
Changes: 4 changed files with 148 additions and 116 deletions (+148, -116).
official/boosted_trees/README.md            +1  -1
official/boosted_trees/data_download.py     +32 -27
official/boosted_trees/train_higgs.py       +83 -56
official/boosted_trees/train_higgs_test.py  +32 -32
official/boosted_trees/README.md

@@ -39,7 +39,7 @@ Note that the model_dir is cleaned up before every time training starts.
 Model parameters can be adjusted by flags, like `--n_trees`, `--max_depth`, `--learning_rate` and so on. Check out the code for details.
-The final accuacy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
+The final accuracy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
 By default, the first 1 million examples among 11 millions are used for training, and the last 1 million examples are used for evaluation.
 The training/evaluation data can be selected as index ranges by flags `--train_start`, `--train_count`, `--eval_start`, `--eval_count`, etc.
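The index-range flags above map directly onto numpy slices of the single array that data_download.py produces. A minimal sketch of that selection, using a small stand-in array (the real one has 11 million rows of 1 label + 28 features):

import numpy as np

data = np.zeros((100, 29), dtype=np.float32)  # stand-in for the 11M-row HIGGS array
train_start, train_count = 0, 15
eval_start, eval_count = 15, 5
train_data = data[train_start:train_start + train_count]
eval_data = data[eval_start:eval_start + eval_count]
assert train_data.shape == (15, 29)
assert eval_data.shape == (5, 29)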
official/boosted_trees/data_download.py

@@ -12,28 +12,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
+import gzip
 import os
-import sys
 import tempfile
 
+# pylint: disable=g-bad-import-order
 import numpy as np
 import pandas as pd
 from six.moves import urllib
+from absl import app as absl_app
+from absl import flags
 import tensorflow as tf
 
-URL_ROOT = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280'
-INPUT_FILE = 'HIGGS.csv.gz'
-NPZ_FILE = 'HIGGS.csv.gz.npz'  # numpy compressed file to contain 'data' array.
+from official.utils.flags import core as flags_core
 
+URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
+INPUT_FILE = "HIGGS.csv.gz"
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file to contain "data" array.
 
-def parse_args():
-  """Parses arguments and returns a tuple (known_args, unparsed_args)."""
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data_dir', type=str, default='/tmp/higgs_data',
-      help='Directory to download higgs dataset and store training/eval data.')
-  return parser.parse_known_args()
 
 def _download_higgs_data_and_save_npz(data_dir):

@@ -41,30 +36,30 @@ def _download_higgs_data_and_save_npz(data_dir):
   input_url = os.path.join(URL_ROOT, INPUT_FILE)
   np_filename = os.path.join(data_dir, NPZ_FILE)
   if tf.gfile.Exists(np_filename):
-    raise ValueError('data_dir already has the processed data file: {}'.format(
+    raise ValueError("data_dir already has the processed data file: {}".format(
        np_filename))
   if not tf.gfile.Exists(data_dir):
     tf.gfile.MkDir(data_dir)
   # 2.8 GB to download.
   try:
-    print('Data downloading..')
+    tf.logging.info("Data downloading...")
     temp_filename, _ = urllib.request.urlretrieve(input_url)
     # Reading and parsing 11 million csv lines takes 2~3 minutes.
-    print('Data processing.. taking multiple minutes..')
-    data = pd.read_csv(
-        temp_filename,
-        dtype=np.float32,
-        names=['c%02d' % i for i in range(29)]  # label + 28 features.
-    ).as_matrix()
+    tf.logging.info("Data processing... taking multiple minutes...")
+    with gzip.open(temp_filename, "rb") as csv_file:
+      data = pd.read_csv(
+          csv_file,
+          dtype=np.float32,
+          names=["c%02d" % i for i in range(29)]  # label + 28 features.
+      ).as_matrix()
   finally:
-    os.remove(temp_filename)
+    tf.gfile.Remove(temp_filename)
 
   # Writing to temporary location then copy to the data_dir (0.8 GB).
   f = tempfile.NamedTemporaryFile()
   np.savez_compressed(f, data=data)
   tf.gfile.Copy(f.name, np_filename)
-  print('Data saved to: {}'.format(np_filename))
+  tf.logging.info("Data saved to: {}".format(np_filename))

@@ -73,6 +68,16 @@ def main(unused_argv):
   _download_higgs_data_and_save_npz(FLAGS.data_dir)
 
 
-if __name__ == '__main__':
-  FLAGS, unparsed = parse_args()
-  tf.app.run(argv=[sys.argv[0]] + unparsed)
+def define_data_download_flags():
+  """Add flags specifying data download arguments."""
+  flags.DEFINE_string(
+      name="data_dir", default="/tmp/higgs_data",
+      help=flags_core.help_wrap(
+          "Directory to download higgs dataset and store training/eval data."))
+
+
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  define_data_download_flags()
+  FLAGS = flags.FLAGS
+  absl_app.run(main)
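The switch from argparse to absl is the core structural change here: flags are declared at module level, absl_app.run() parses argv, and main() reads the parsed values from flags.FLAGS. A minimal self-contained sketch of that pattern (plain absl, without the official.utils helpers):

from absl import app as absl_app
from absl import flags

flags.DEFINE_string(
    name="data_dir", default="/tmp/higgs_data",
    help="Directory to download higgs dataset and store training/eval data.")
FLAGS = flags.FLAGS


def main(unused_argv):
  # Flags are already parsed by the time main() runs.
  print("data_dir: {}".format(FLAGS.data_dir))


if __name__ == "__main__":
  absl_app.run(main)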
official/boosted_trees/train_higgs.py

@@ -29,64 +29,44 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
 import os
-import sys
 
+# pylint: disable=g-bad-import-order
+import numpy as np
 from absl import app as absl_app
 from absl import flags
-import numpy as np
+# pylint: disable=wrong-import-order
 import tensorflow as tf
-# pylint: disable=wrong-import-order
+# pylint: enable=g-bad-import-order
 
 from official.utils.flags import core as flags_core
 from official.utils.flags._conventions import help_wrap
+from official.utils.logs import logger
 
-NPZ_FILE = 'HIGGS.csv.gz.npz'  # numpy compressed file containing 'data' array
-
-
-def define_train_higgs_flags():
-  """Add tree related flags as well as training/eval configuration."""
-  flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
-  flags.adopt_module_key_flags(flags_core)
-
-  flags.DEFINE_integer(
-      name='train_start', default=0,
-      help=help_wrap('Start index of train examples within the data.'))
-  flags.DEFINE_integer(
-      name='train_count', default=1000000,
-      help=help_wrap('Number of train examples within the data.'))
-  flags.DEFINE_integer(
-      name='eval_start', default=10000000,
-      help=help_wrap('Start index of eval examples within the data.'))
-  flags.DEFINE_integer(
-      name='eval_count', default=1000000,
-      help=help_wrap('Number of eval examples within the data.'))
-
-  flags.DEFINE_integer(
-      'n_trees', default=100, help=help_wrap('Number of trees to build.'))
-  flags.DEFINE_integer(
-      'max_depth', default=6, help=help_wrap('Maximum depths of each tree.'))
-  flags.DEFINE_float(
-      'learning_rate', default=0.1,
-      help=help_wrap('Maximum depths of each tree.'))
-
-  flags_core.set_defaults(data_dir='/tmp/higgs_data',
-                          model_dir='/tmp/higgs_model')
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file containing "data" array
 
 
 def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
-  """Reads higgs data from csv and returns train and eval data."""
+  """Reads higgs data from csv and returns train and eval data.
+
+  Args:
+    data_dir: A string, the directory of higgs dataset.
+    train_start: An integer, the start index of train examples within the data.
+    train_count: An integer, the number of train examples within the data.
+    eval_start: An integer, the start index of eval examples within the data.
+    eval_count: An integer, the number of eval examples within the data.
+
+  Returns:
+    Numpy array of train data and eval data.
+  """
   npz_filename = os.path.join(data_dir, NPZ_FILE)
   try:
     # gfile allows numpy to read data from network data sources as well.
-    with tf.gfile.Open(npz_filename, 'rb') as npz_file:
+    with tf.gfile.Open(npz_filename, "rb") as npz_file:
       with np.load(npz_file) as npz:
-        data = npz['data']
+        data = npz["data"]
   except Exception as e:
     raise RuntimeError(
-        'Error loading data; use data_download.py to prepare the data:\n{}: {}'
+        "Error loading data; use data_download.py to prepare the data:\n{}: {}"
        .format(type(e).__name__, e))
   return (data[train_start:train_start+train_count],
           data[eval_start:eval_start+eval_count])
@@ -105,18 +85,18 @@ def make_inputs_from_np_arrays(features_np, label_np):
   as a single tensor. Don't use batch.
 
   Args:
-    features_np: a numpy ndarray (shape=[batch_size, num_features]) for
+    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
         float32 features.
-    label_np: a numpy ndarray (shape=[batch_size, 1]) for labels.
+    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.
 
   Returns:
-    input_fn: a function returning a Dataset of feature dict and label.
-    feature_column: a list of tf.feature_column.BucketizedColumn.
+    input_fn: A function returning a Dataset of feature dict and label.
+    feature_column: A list of tf.feature_column.BucketizedColumn.
   """
   num_features = features_np.shape[1]
   features_np_list = np.split(features_np, num_features, axis=1)
   # 1-based feature names.
-  feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
 
   # Create source feature_columns and bucketized_columns.
   def get_bucket_boundaries(feature):
@@ -155,16 +135,16 @@ def make_eval_inputs_from_np_arrays(features_np, label_np):
   num_features = features_np.shape[1]
   features_np_list = np.split(features_np, num_features, axis=1)
   # 1-based feature names.
-  feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
 
   def input_fn():
     features = {
         feature_name: tf.constant(features_np_list[i])
         for i, feature_name in enumerate(feature_names)
     }
     return tf.data.Dataset.zip((
         tf.data.Dataset.from_tensor_slices(features),
         tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
 
   return input_fn
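Everything inside input_fn above is ordinary tf.data plumbing; a self-contained sketch of the same zip-and-batch pattern on toy data (the 20x3 shapes here are assumptions for illustration):

import numpy as np
import tensorflow as tf

features_np = np.random.rand(20, 3).astype(np.float32)         # 20 rows, 3 features
label_np = np.random.randint(0, 2, (20, 1)).astype(np.float32)
features_np_list = np.split(features_np, 3, axis=1)
feature_names = ["feature_%02d" % (i + 1) for i in range(3)]


def input_fn():
  # A dict of per-feature constant tensors, zipped with the labels and batched.
  features = {name: tf.constant(features_np_list[i])
              for i, name in enumerate(feature_names)}
  return tf.data.Dataset.zip(
      (tf.data.Dataset.from_tensor_slices(features),
       tf.data.Dataset.from_tensor_slices(label_np))).batch(1000)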
@@ -175,22 +155,37 @@ def train_boosted_trees(flags_obj):
   Args:
     flags_obj: An object containing parsed flag values.
   """
   # Clean up the model directory if present.
   if tf.gfile.Exists(flags_obj.model_dir):
     tf.gfile.DeleteRecursively(flags_obj.model_dir)
-  print('## data loading..')
+  tf.logging.info("## Data loading...")
   train_data, eval_data = read_higgs_data(
       flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
       flags_obj.eval_start, flags_obj.eval_count)
-  print('## data loaded; train: {}{}, eval: {}{}'.format(
+  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
       train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
-  # data consists of one label column and 28 feature columns following.
+  # Data consists of one label column followed by 28 feature columns.
   train_input_fn, feature_columns = make_inputs_from_np_arrays(
       features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
   eval_input_fn = make_eval_inputs_from_np_arrays(
       features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
-  print('## features prepared. training starts..')
+  tf.logging.info("## Features prepared. Training starts...")
+
+  # Create benchmark logger to log info about the training and metric values
+  run_params = {
+      "train_start": flags_obj.train_start,
+      "train_count": flags_obj.train_count,
+      "eval_start": flags_obj.eval_start,
+      "eval_count": flags_obj.eval_count,
+      "n_trees": flags_obj.n_trees,
+      "max_depth": flags_obj.max_depth,
+  }
+  benchmark_logger = logger.config_benchmark_logger(flags_obj)
+  benchmark_logger.log_run_info(
+      model_name="boosted_trees",
+      dataset_name="higgs",
+      run_params=run_params)
 
   # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
   # training is yet provided as a contrib library.
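The [:, 1:] and [:, 0:1] slicing above splits the label column off the 28 feature columns while keeping the label 2-D; a toy illustration:

import numpy as np

data = np.arange(58, dtype=np.float32).reshape(2, 29)  # toy stand-in rows
features = data[:, 1:]   # the 28 feature columns
labels = data[:, 0:1]    # first column is the label; 0:1 keeps it 2-D
assert features.shape == (2, 28)
assert labels.shape == (2, 1)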
@@ -203,7 +198,9 @@ def train_boosted_trees(flags_obj):
       learning_rate=flags_obj.learning_rate)
 
   # Evaluation.
-  eval_result = classifier.evaluate(eval_input_fn)
+  eval_results = classifier.evaluate(eval_input_fn)
+  # Benchmark the evaluation results
+  benchmark_logger.log_evaluation_result(eval_results)
 
   # Exporting the savedmodel.
   if flags_obj.export_dir is not None:
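The call that these learning_rate= lines close is elided from the hunk; in TF 1.8-era code, the contrib in-memory trainer mentioned in the comment above is typically invoked roughly as below. Treat the exact signature as an assumption, since the actual call site is not shown in this diff:

# Hypothetical call-site sketch; train_input_fn and feature_columns come from
# make_inputs_from_np_arrays() above, and the kwargs mirror the flags.
import tensorflow as tf

classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
    train_input_fn,
    feature_columns,
    model_dir="/tmp/higgs_model",
    n_trees=100,
    max_depth=6,
    learning_rate=0.1)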
@@ -216,7 +213,37 @@ def main(_):
   train_boosted_trees(flags.FLAGS)
 
 
-if __name__ == '__main__':
+def define_train_higgs_flags():
+  """Add tree related flags as well as training/eval configuration."""
+  flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_integer(
+      name="train_start", default=0,
+      help=help_wrap("Start index of train examples within the data."))
+  flags.DEFINE_integer(
+      name="train_count", default=1000000,
+      help=help_wrap("Number of train examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_start", default=10000000,
+      help=help_wrap("Start index of eval examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_count", default=1000000,
+      help=help_wrap("Number of eval examples within the data."))
+
+  flags.DEFINE_integer(
+      "n_trees", default=100, help=help_wrap("Number of trees to build."))
+  flags.DEFINE_integer(
+      "max_depth", default=6, help=help_wrap("Maximum depths of each tree."))
+  flags.DEFINE_float(
+      "learning_rate", default=0.1, help=help_wrap("The learning rate."))
+
+  flags_core.set_defaults(data_dir="/tmp/higgs_data",
+                          model_dir="/tmp/higgs_model")
+
+
+if __name__ == "__main__":
   # Training progress and eval results are shown as logging.INFO; so enables it.
   tf.logging.set_verbosity(tf.logging.INFO)
   define_train_higgs_flags()
official/boosted_trees/train_higgs_test.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
+"""Tests for boosted_tree."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -22,16 +22,16 @@ import tempfile
 import numpy as np
 import pandas as pd
 import tensorflow as tf
 
 # pylint: disable=g-bad-import-order
-from official.utils.testing import integration
 from official.boosted_trees import train_higgs
+from official.utils.testing import integration
 
 tf.logging.set_verbosity(tf.logging.ERROR)
 
-TEST_CSV = os.path.join(os.path.dirname(__file__), 'train_higgs_test.csv')
+TEST_CSV = os.path.join(os.path.dirname(__file__), "train_higgs_test.csv")
 
 
 class BaseTest(tf.test.TestCase):
   """Tests for Wide Deep model."""
@@ -45,7 +45,7 @@ class BaseTest(tf.test.TestCase):
     # Create temporary CSV file
     self.data_dir = self.get_temp_dir()
     data = pd.read_csv(
-        TEST_CSV, dtype=np.float32, names=['c%02d' % i for i in range(29)]
+        TEST_CSV, dtype=np.float32, names=["c%02d" % i for i in range(29)]
     ).as_matrix()
     self.input_npz = os.path.join(self.data_dir, train_higgs.NPZ_FILE)
     # numpy.savez doesn't take gfile.Gfile, so need to write down and copy.
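The setUp comment above points at a real constraint: numpy.savez writes to a local file object, not a gfile handle, so the array is written locally and then copied via tf.gfile. A self-contained sketch of that workaround (the destination path here is illustrative):

import tempfile

import numpy as np
import tensorflow as tf

data = np.zeros((20, 29), dtype=np.float32)
dest = "/tmp/higgs_data/HIGGS.csv.gz.npz"  # illustrative destination
tf.gfile.MakeDirs("/tmp/higgs_data")
with tempfile.NamedTemporaryFile() as f:
  np.savez_compressed(f, data=data)  # numpy writes to the local temp file
  f.flush()
  tf.gfile.Copy(f.name, dest, overwrite=True)  # copy to (possibly remote) storage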
@@ -56,9 +56,9 @@ class BaseTest(tf.test.TestCase):
   def test_read_higgs_data(self):
     """Tests read_higgs_data() function."""
     # Error when a wrong data_dir is given.
-    with self.assertRaisesRegexp(RuntimeError, 'Error loading data.*'):
+    with self.assertRaisesRegexp(RuntimeError, "Error loading data.*"):
       train_data, eval_data = train_higgs.read_higgs_data(
-          self.data_dir + 'non-existing-path',
+          self.data_dir + "non-existing-path",
           train_start=0, train_count=15, eval_start=15, eval_count=5)
 
     # Loading fine with the correct data_dir.
@@ -80,13 +80,13 @@ class BaseTest(tf.test.TestCase):
     self.assertEqual(28, len(feature_columns))
     bucketized_column_type = type(
         tf.feature_column.bucketized_column(
-            tf.feature_column.numeric_column('feature_01'),
+            tf.feature_column.numeric_column("feature_01"),
             boundaries=[0, 1, 2]))  # dummy boundaries.
     for feature_column in feature_columns:
       self.assertIsInstance(feature_column, bucketized_column_type)
       # At least 2 boundaries.
       self.assertGreaterEqual(len(feature_column.boundaries), 2)
-    feature_names = ['feature_%02d' % (i + 1) for i in range(28)]
+    feature_names = ["feature_%02d" % (i + 1) for i in range(28)]
     # Tests that the source column names of the bucketized columns match.
     self.assertAllEqual(feature_names,
                         [col.source_column.name for col in feature_columns])
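For reference, the type check above builds a throwaway bucketized column just to get at the (non-public) class; the same construction stands alone as:

import tensorflow as tf

numeric = tf.feature_column.numeric_column("feature_01")
bucketized = tf.feature_column.bucketized_column(
    numeric, boundaries=[0.0, 1.0, 2.0])  # 3 boundaries -> 4 buckets
print(bucketized.source_column.name)      # prints: feature_01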
@@ -113,39 +113,39 @@ class BaseTest(tf.test.TestCase):
   def test_end_to_end(self):
     """Tests end-to-end running."""
-    model_dir = os.path.join(self.get_temp_dir(), 'model')
+    model_dir = os.path.join(self.get_temp_dir(), "model")
     integration.run_synthetic(
         main=train_higgs.main, tmp_root=self.get_temp_dir(), extra_flags=[
-            '--data_dir', self.data_dir,
-            '--model_dir', model_dir,
-            '--n_trees', '5',
-            '--train_start', '0',
-            '--train_count', '12',
-            '--eval_start', '12',
-            '--eval_count', '8',
+            "--data_dir", self.data_dir,
+            "--model_dir", model_dir,
+            "--n_trees", "5",
+            "--train_start", "0",
+            "--train_count", "12",
+            "--eval_start", "12",
+            "--eval_count", "8",
         ],
         synth=False, max_train=None)
-    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, 'checkpoint')))
+    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
 
   def test_end_to_end_with_export(self):
     """Tests end-to-end running."""
-    model_dir = os.path.join(self.get_temp_dir(), 'model')
-    export_dir = os.path.join(self.get_temp_dir(), 'export')
+    model_dir = os.path.join(self.get_temp_dir(), "model")
+    export_dir = os.path.join(self.get_temp_dir(), "export")
     integration.run_synthetic(
         main=train_higgs.main, tmp_root=self.get_temp_dir(), extra_flags=[
-            '--data_dir', self.data_dir,
-            '--model_dir', model_dir,
-            '--export_dir', export_dir,
-            '--n_trees', '5',
-            '--train_start', '0',
-            '--train_count', '12',
-            '--eval_start', '12',
-            '--eval_count', '8',
+            "--data_dir", self.data_dir,
+            "--model_dir", model_dir,
+            "--export_dir", export_dir,
+            "--n_trees", "5",
+            "--train_start", "0",
+            "--train_count", "12",
+            "--eval_start", "12",
+            "--eval_count", "8",
         ],
         synth=False, max_train=None)
-    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, 'checkpoint')))
+    self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
     self.assertTrue(tf.gfile.Exists(os.path.join(export_dir)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   tf.test.main()