"sgl-kernel/python/sgl_kernel/load_utils.py" did not exist on "04b35190e27d808e656637475da4d5b8a2804be1"
Commit 02c6f1ac authored by Toby Boyd's avatar Toby Boyd
Browse files

Changed to params

parent abeb0356
...@@ -44,8 +44,9 @@ import tensorflow as tf ...@@ -44,8 +44,9 @@ import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
def get_model_fn(num_gpus, variable_strategy, data_format, num_workers): def get_model_fn(num_gpus, variable_strategy, num_workers):
"""Returns a function that will build the resnet model.""" """Returns a function that will build the resnet model."""
def _resnet_model_fn(features, labels, mode, params): def _resnet_model_fn(features, labels, mode, params):
"""Resnet model body. """Resnet model body.
...@@ -73,6 +74,16 @@ def get_model_fn(num_gpus, variable_strategy, data_format, num_workers): ...@@ -73,6 +74,16 @@ def get_model_fn(num_gpus, variable_strategy, data_format, num_workers):
tower_gradvars = [] tower_gradvars = []
tower_preds = [] tower_preds = []
# channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
# on CPU. The exception is Intel MKL on CPU which is optimal with
# channels_last.
data_format = params.data_format
if not data_format:
if num_gpus == 0:
data_format = 'channels_last'
else:
data_format = 'channels_first'
if num_gpus == 0: if num_gpus == 0:
num_devices = 1 num_devices = 1
device_type = 'cpu' device_type = 'cpu'
...@@ -276,7 +287,6 @@ def input_fn(data_dir, ...@@ -276,7 +287,6 @@ def input_fn(data_dir,
def get_experiment_fn(data_dir, def get_experiment_fn(data_dir,
num_gpus, num_gpus,
variable_strategy, variable_strategy,
data_format,
use_distortion_for_training=True): use_distortion_for_training=True):
"""Returns an Experiment function. """Returns an Experiment function.
...@@ -291,7 +301,6 @@ def get_experiment_fn(data_dir, ...@@ -291,7 +301,6 @@ def get_experiment_fn(data_dir,
num_gpus: int. Number of GPUs on each worker. num_gpus: int. Number of GPUs on each worker.
variable_strategy: String. CPU to use CPU as the parameter server variable_strategy: String. CPU to use CPU as the parameter server
and GPU to use the GPUs as the parameter server. and GPU to use the GPUs as the parameter server.
data_format: String. channels_first or channels_last.
use_distortion_for_training: bool. See cifar10.Cifar10DataSet. use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
Returns: Returns:
A function (tf.estimator.RunConfig, tf.contrib.training.HParams) -> A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
...@@ -338,11 +347,10 @@ def get_experiment_fn(data_dir, ...@@ -338,11 +347,10 @@ def get_experiment_fn(data_dir,
hooks = [logging_hook, examples_sec_hook] hooks = [logging_hook, examples_sec_hook]
classifier = tf.estimator.Estimator( classifier = tf.estimator.Estimator(
model_fn=get_model_fn(num_gpus, variable_strategy, data_format, model_fn=get_model_fn(num_gpus, variable_strategy,
run_config.num_worker_replicas or 1), run_config.num_worker_replicas or 1),
config=run_config, config=run_config,
params=hparams params=hparams)
)
# Create experiment. # Create experiment.
experiment = tf.contrib.learn.Experiment( experiment = tf.contrib.learn.Experiment(
...@@ -354,25 +362,17 @@ def get_experiment_fn(data_dir, ...@@ -354,25 +362,17 @@ def get_experiment_fn(data_dir,
# Adding hooks to be used by the estimator on training modes # Adding hooks to be used by the estimator on training modes
experiment.extend_train_hooks(hooks) experiment.extend_train_hooks(hooks)
return experiment return experiment
return _experiment_fn return _experiment_fn
def main(job_dir, data_dir, num_gpus, variable_strategy, data_format, def main(job_dir, data_dir, num_gpus, variable_strategy,
use_distortion_for_training, log_device_placement, num_intra_threads, use_distortion_for_training, log_device_placement, num_intra_threads,
**hparams): **hparams):
# The env variable is on deprecation path, default is set to off. # The env variable is on deprecation path, default is set to off.
os.environ['TF_SYNC_ON_FINISH'] = '0' os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
# channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
# on CPU. The exception is Intel MKL on CPU which is optimal with
# channels_last.
if not data_format:
if num_gpus == 0:
data_format = 'channels_last'
else:
data_format = 'channels_first'
# Session configuration. # Session configuration.
sess_config = tf.ConfigProto( sess_config = tf.ConfigProto(
allow_soft_placement=True, allow_soft_placement=True,
...@@ -383,7 +383,7 @@ def main(job_dir, data_dir, num_gpus, variable_strategy, data_format, ...@@ -383,7 +383,7 @@ def main(job_dir, data_dir, num_gpus, variable_strategy, data_format,
config = cifar10_utils.RunConfig( config = cifar10_utils.RunConfig(
session_config=sess_config, model_dir=job_dir) session_config=sess_config, model_dir=job_dir)
tf.contrib.learn.learn_runner.run( tf.contrib.learn.learn_runner.run(
get_experiment_fn(data_dir, num_gpus, variable_strategy, data_format, get_experiment_fn(data_dir, num_gpus, variable_strategy,
use_distortion_for_training), use_distortion_for_training),
run_config=config, run_config=config,
hparams=tf.contrib.training.HParams(**hparams)) hparams=tf.contrib.training.HParams(**hparams))
...@@ -505,13 +505,12 @@ if __name__ == '__main__': ...@@ -505,13 +505,12 @@ if __name__ == '__main__':
help='Epsilon for batch norm.') help='Epsilon for batch norm.')
args = parser.parse_args() args = parser.parse_args()
if args.num_gpus < 0: if args.num_gpus < 0:
raise ValueError( raise ValueError(
'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.') 'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
if args.num_gpus == 0 and args.variable_strategy == 'GPU': if args.num_gpus == 0 and args.variable_strategy == 'GPU':
raise ValueError( raise ValueError('num-gpus=0, CPU must be used as parameter server. Set'
'num-gpus=0, CPU must be used as parameter server. Set' '--variable-strategy=CPU.')
'--variable-strategy=CPU.')
if (args.num_layers - 2) % 6 != 0: if (args.num_layers - 2) % 6 != 0:
raise ValueError('Invalid --num-layers parameter.') raise ValueError('Invalid --num-layers parameter.')
if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0: if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment