Unverified Commit 704b50e2 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #200 from microsoft/master

merge master
parents 755ac5f0 3a6d1372
...@@ -2,8 +2,8 @@ NNI 支持的训练平台介绍 ...@@ -2,8 +2,8 @@ NNI 支持的训练平台介绍
===================================== =====================================
.. toctree:: .. toctree::
本机<LocalMode> 本机<./TrainingService/LocalMode>
远程<RemoteMachineMode> 远程<./TrainingService/RemoteMachineMode>
OpenPAI<PaiMode> OpenPAI<./TrainingService/PaiMode>
Kubeflow<KubeflowMode> Kubeflow<./TrainingService/KubeflowMode>
FrameworkController<FrameworkControllerMode> FrameworkController<./TrainingService/FrameworkControllerMode>
\ No newline at end of file \ No newline at end of file
...@@ -13,6 +13,6 @@ Tuner 从 Trial 接收指标结果,来评估一组超参或网络结构的性 ...@@ -13,6 +13,6 @@ Tuner 从 Trial 接收指标结果,来评估一组超参或网络结构的性
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
内置 Tuner<BuiltinTuner> 内置 Tuner<builtin_tuner>
自定义 Tuner<CustomizeTuner> 自定义 Tuner<Tuner/CustomizeTuner>
自定义 Advisor<CustomizeAdvisor> 自定义 Advisor<Tuner/CustomizeAdvisor>
\ No newline at end of file
...@@ -5,12 +5,13 @@ ...@@ -5,12 +5,13 @@
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
安装<Installation> 安装<Tutorial/Installation>
实现 Trial<Trials> 实现 Trial<./TrialExample/Trials>
Tuner<tuners> Tuner<tuners>
Assessor<assessors> Assessor<assessors>
Web 界面<WebUI> Web 界面<Tutorial/WebUI>
训练平台<training_services> 训练平台<training_services>
如何使用 Docker <HowToUseDocker> 如何使用 Docker<Tutorial/HowToUseDocker>
高级功能<advanced> 高级功能<advanced>
如何调试<HowToDebug> 如何调试<Tutorial/HowToDebug>
\ No newline at end of file Windows 中使用 NNI<Tutorial/NniOnWindows>
\ No newline at end of file
**NNI 中的自动特征工程**
===
[示例](https://github.com/SpongebBob/tabular_automl_NNI)在 NNI 中实现了自动特征工程。
代码来自于贡献者。谢谢可爱的贡献者!
欢迎越来越多的人加入我们!
\ No newline at end of file
...@@ -8,13 +8,11 @@ trainingServicePlatform: local ...@@ -8,13 +8,11 @@ trainingServicePlatform: local
#choice: true, false #choice: true, false
useAnnotation: true useAnnotation: true
tuner: tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner codeDir: ../../../tuners/random_nas_tuner
#SMAC (SMAC should be installed through nnictl)
#codeDir: ~/nni/nni/examples/tuners/random_nas_tuner
codeDir: ../../tuners/random_nas_tuner
classFileName: random_nas_tuner.py classFileName: random_nas_tuner.py
className: RandomNASTuner className: RandomNASTuner
trial: trial:
command: python3 mnist.py command: python3 mnist.py
codeDir: . codeDir: .
gpuNum: 0 gpuNum: 0
nasMode: classic_mode
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
codeDir: ../../../tuners/random_nas_tuner
classFileName: random_nas_tuner.py
className: RandomNASTuner
trial:
command: python3 mnist-darts.py
codeDir: .
gpuNum: 0
nasMode: oneshot_mode
"""A deep MNIST classifier using convolutional layers."""
import argparse
import logging
import math
import tempfile
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import operators as op
# Not referenced by any other code visible in this script.
FLAGS = None
# Module-level logger used by every logger.debug / logger.exception call below.
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
    '''
    TensorFlow graph for an MNIST classifier whose convolution and pooling
    layers are declared as an NNI search space.

    __init__ stores the hyper-parameters and creates the input placeholders;
    build_network() adds the layers, the loss, the training op and the
    accuracy op.
    '''
    def __init__(self,
                 channel_1_num,
                 channel_2_num,
                 conv_size,
                 hidden_size,
                 pool_size,
                 learning_rate,
                 x_dim=784,
                 y_dim=10):
        '''
        Store the hyper-parameters and create the graph placeholders.

        channel_1_num, channel_2_num: channel counts handed to the first and
            second group of candidate ops in build_network().
        conv_size, pool_size: stored on the instance; build_network() in this
            class does not read them.
        hidden_size: width of the first fully connected layer.
        learning_rate: passed to tf.train.AdamOptimizer.
        x_dim: length of one flattened input image (default 784).
        y_dim: number of output classes (default 10).
        '''
        self.channel_1_num = channel_1_num
        self.channel_2_num = channel_2_num
        self.conv_size = conv_size
        self.hidden_size = hidden_size
        self.pool_size = pool_size
        self.learning_rate = learning_rate
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
        self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # Both are assigned by build_network().
        self.train_step = None
        self.accuracy = None

    def build_network(self):
        '''
        Build the network graph and declare its neural architecture search space.

        Sets self.cross_entropy, self.train_step and self.accuracy.
        '''
        # Reshape to use within a convolutional neural net.
        # Last dimension is for "features" - there is only one here, since images are
        # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
        with tf.name_scope('reshape'):
            try:
                # Side length of the square image; int() truncates, so a
                # non-square x_dim is not detected here.
                input_dim = int(math.sqrt(self.x_dim))
            except:
                print(
                    'input dim cannot be sqrt and reshape. input dim: ' + str(self.x_dim))
                logger.debug(
                    'input dim cannot be sqrt and reshape. input dim: %s', str(self.x_dim))
                raise
            x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])

        # NNI annotation string (the experiment configs set useAnnotation: true).
        # It declares six mutable layers chained as
        # conv1 -> post1 -> pool1 -> conv2 -> post2 -> pool2.
        # pool2_out, used right after the string, is only named here as a
        # layer_output, so this code presumably relies on NNI rewriting the
        # annotation before the trial runs -- confirm against the NNI docs.
        # The string body is kept verbatim, hence the flush-left lines.
        """@nni.mutable_layers(
{
layer_choice: [op.conv2d(size=1, in_ch=1, out_ch=self.channel_1_num),
op.conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=7, in_ch=1, out_ch=self.channel_1_num),
op.dilated_conv(in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=3, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=5, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=7, in_ch=1, out_ch=self.channel_1_num)],
fixed_inputs: [x_image],
layer_output: conv1_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_1_num)],
fixed_inputs: [conv1_out],
layer_output: post1_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post1_out],
layer_output: pool1_out
},
{
layer_choice: [op.conv2d(size=1, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.dilated_conv(in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=5, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num)],
fixed_inputs: [pool1_out],
optional_inputs: [post1_out],
optional_input_size: [0, 1],
layer_output: conv2_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_2_num)],
fixed_inputs: [conv2_out],
layer_output: post2_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post2_out],
optional_inputs: [post1_out, pool1_out],
optional_input_size: [0, 1],
layer_output: pool2_out
}
)"""

        # Fully connected layer 1 -- flatten pool2_out and map it to
        # hidden_size features. The spatial size is read from pool2_out's
        # static shape (asserted square) instead of being assumed.
        last_dim_list = pool2_out.get_shape().as_list()
        assert(last_dim_list[1] == last_dim_list[2])
        last_dim = last_dim_list[1]
        with tf.name_scope('fc1'):
            w_fc1 = op.weight_variable(
                [last_dim * last_dim * self.channel_2_num, self.hidden_size])
            b_fc1 = op.bias_variable([self.hidden_size])

            h_pool2_flat = tf.reshape(
                pool2_out, [-1, last_dim * last_dim * self.channel_2_num])
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

        # Dropout - controls the complexity of the model, prevents co-adaptation of features.
        with tf.name_scope('dropout'):
            h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)

        # Map the hidden_size features to y_dim classes (one per digit by default).
        with tf.name_scope('fc2'):
            w_fc2 = op.weight_variable([self.hidden_size, self.y_dim])
            b_fc2 = op.bias_variable([self.y_dim])
            y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2

        with tf.name_scope('loss'):
            # Kept on self: main() passes it to the training_update annotation.
            self.cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
        with tf.name_scope('adam_optimizer'):
            self.train_step = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.cross_entropy)

        with tf.name_scope('accuracy'):
            # Fraction of samples whose arg-max logit matches the one-hot label.
            correct_prediction = tf.equal(
                tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32))
def download_mnist_retry(data_dir, max_num_retries=20):
    """Load MNIST from ``data_dir``, retrying on ``AlreadyExistsError``.

    Makes at most ``max_num_retries`` attempts and sleeps one second after
    each attempt that fails with ``tf.errors.AlreadyExistsError``. Returns
    the first successfully loaded dataset (one-hot labels); raises
    ``Exception`` once every attempt has failed. Any other error
    propagates immediately.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # Back off briefly before the next attempt.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def main(params):
    '''
    Build the MNIST network, train it and report accuracy through NNI annotations.

    params: dict read with the keys 'data_dir', 'channel_1_num',
        'channel_2_num', 'conv_size', 'hidden_size', 'pool_size',
        'learning_rate', 'batch_num', 'batch_size' and 'dropout_rate'.
    '''
    # Import data
    mnist = download_mnist_retry(params['data_dir'])
    print('Mnist download data done.')
    logger.debug('Mnist download data done.')

    # Create the model
    # Build the graph for the deep net
    mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
                                 channel_2_num=params['channel_2_num'],
                                 conv_size=params['conv_size'],
                                 hidden_size=params['hidden_size'],
                                 pool_size=params['pool_size'],
                                 learning_rate=params['learning_rate'])
    mnist_network.build_network()
    logger.debug('Mnist build network done.')

    # Write log
    graph_location = tempfile.mkdtemp()
    logger.debug('Saving graph to: %s', graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    test_acc = 0.0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(params['batch_num']):
            # A batch and feed_dict are built before the training_update
            # annotation and rebuilt from a fresh batch after it.
            # NOTE(review): the annotation's arguments do not mention
            # feed_dict, so whether it consumes this first one is not visible
            # here -- confirm before treating the first batch as redundant.
            batch = mnist.train.next_batch(params['batch_size'])
            feed_dict={mnist_network.images: batch[0],
                       mnist_network.labels: batch[1],
                       mnist_network.keep_prob: 1 - params['dropout_rate']}
            """@nni.training_update(tf, sess, mnist_network.cross_entropy)"""
            batch = mnist.train.next_batch(params['batch_size'])
            feed_dict={mnist_network.images: batch[0],
                       mnist_network.labels: batch[1],
                       mnist_network.keep_prob: 1 - params['dropout_rate']}
            # keep_prob is 1 - dropout_rate while training.
            mnist_network.train_step.run(feed_dict=feed_dict)

            # Every 100 steps, evaluate on the full test set with dropout
            # disabled (keep_prob 1.0).
            if i % 100 == 0:
                test_acc = mnist_network.accuracy.eval(
                    feed_dict={mnist_network.images: mnist.test.images,
                               mnist_network.labels: mnist.test.labels,
                               mnist_network.keep_prob: 1.0})

                """@nni.report_intermediate_result(test_acc)"""
                logger.debug('test accuracy %g', test_acc)
                logger.debug('Pipe send intermediate result done.')

        # Final evaluation after the last training step.
        test_acc = mnist_network.accuracy.eval(
            feed_dict={mnist_network.images: mnist.test.images,
                       mnist_network.labels: mnist.test.labels,
                       mnist_network.keep_prob: 1.0})

        """@nni.report_final_result(test_acc)"""
        logger.debug('Final result is %g', test_acc)
        logger.debug('Send final result done.')
def get_params():
    ''' Parse the trial's hyper-parameters from the command line.

    Returns the argparse namespace of known options; unrecognised
    arguments are ignored.
    '''
    # One row per option: (flag, type, default, help text or None).
    option_table = (
        ("--data_dir", str, '/tmp/tensorflow/mnist/input_data', "data directory"),
        ("--dropout_rate", float, 0.5, "dropout rate"),
        ("--channel_1_num", int, 32, None),
        ("--channel_2_num", int, 64, None),
        ("--conv_size", int, 5, None),
        ("--pool_size", int, 2, None),
        ("--hidden_size", int, 1024, None),
        ("--learning_rate", float, 1e-4, None),
        ("--batch_num", int, 2000, None),
        ("--batch_size", int, 32, None),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)
    known, _unknown = cli.parse_known_args()
    return known
if __name__ == '__main__':
    # Script entry point: turn the parsed CLI namespace into a dict and run
    # the trial.
    try:
        params = vars(get_params())
        main(params)
    except Exception as exception:
        # Log the failure with its traceback, then re-raise so it still
        # propagates to the caller.
        logger.exception(exception)
        raise
import tensorflow as tf
import math
def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` initialised from a truncated normal with stddev 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def sum_op(inputs):
    """Add the optional inputs onto the fixed input.

    ``inputs[0][0]`` is the fixed input and ``inputs[1]`` the list of
    optional inputs. The optional inputs are concatenated along axis 3,
    average-pooled by ``ceil(optional_side / fixed_side)``, passed through
    a 1x1 convolution whose output depth is the fixed input's last
    dimension, and added to the fixed input. Both tensors must have equal
    sizes on axes 1 and 2 (asserted).
    """
    base = inputs[0][0]
    extra = tf.concat(inputs[1], axis=3)
    base_dims = base.get_shape().as_list()
    extra_dims = extra.get_shape().as_list()
    assert base_dims[1] == base_dims[2]
    assert extra_dims[1] == extra_dims[2]
    # Window and stride are the same, chosen from the ratio of the side lengths.
    window = math.ceil(extra_dims[1] / base_dims[1])
    pooled = tf.nn.avg_pool(
        extra,
        ksize=[1, window, window, 1],
        strides=[1, window, window, 1],
        padding='SAME')
    # 1x1 convolution so the last dimension matches the fixed input's.
    projection = weight_variable([1, 1, extra_dims[3], base_dims[3]])
    projected = tf.nn.conv2d(pooled, projection, strides=[1, 1, 1, 1], padding='SAME')
    return base + projected
def conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a stride-1, SAME-padded convolution with a ``size`` x ``size`` kernel.

    The input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    the result of ``sum_op(inputs)``. Only ``size`` 1 and 3 are accepted;
    any other value raises ``Exception``.
    """
    # Resolve the input first so the size check runs after it, as before.
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in (1, 3):
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.conv2d(features, kernel, strides=[1, 1, 1, 1], padding='SAME')
def twice_conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply two stacked stride-1 convolutions with [1, size] and [size, 1] kernels.

    The first convolution outputs ``int(out_ch/2)`` channels and the second
    ``out_ch``. The input is ``inputs[0][0]`` when ``inputs[1]`` is empty,
    otherwise ``sum_op(inputs)``. Only ``size`` 3 and 7 are accepted; any
    other value raises ``Exception``.
    """
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in (3, 7):
        raise Exception("Unknown filter size: %d." % size)
    mid_ch = int(out_ch/2)
    first_kernel = weight_variable([1, size, in_ch, mid_ch])
    hidden = tf.nn.conv2d(features, first_kernel, strides=[1, 1, 1, 1], padding='SAME')
    second_kernel = weight_variable([size, 1, mid_ch, out_ch])
    return tf.nn.conv2d(hidden, second_kernel, strides=[1, 1, 1, 1], padding='SAME')
def dilated_conv(inputs, size=3, in_ch=-1, out_ch=-1):
    """Apply a SAME-padded atrous convolution with rate 2 and a 3 x 3 kernel.

    The input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. ``size`` must be 3; any other value raises
    ``Exception``.
    """
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size != 3:
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.atrous_conv2d(features, kernel, rate=2, padding='SAME')
def separable_conv(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a stride-1, SAME-padded separable convolution.

    Uses a ``size`` x ``size`` depthwise filter with channel multiplier 1
    followed by a 1 x 1 pointwise filter to ``out_ch`` channels. The input
    is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are accepted; any other
    value raises ``Exception``.
    """
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    channel_multiplier = 1
    depthwise = weight_variable([size, size, in_ch, channel_multiplier])
    pointwise = weight_variable([1, 1, channel_multiplier*in_ch, out_ch])
    return tf.nn.separable_conv2d(
        features, depthwise, pointwise, strides=[1, 1, 1, 1], padding='SAME')
def avg_pool(inputs, size=-1):
    """Average-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    The input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are accepted; any other
    value raises ``Exception``.
    """
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.avg_pool(features, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(inputs, size=-1):
    """Max-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    The input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are accepted; any other
    value raises ``Exception``.
    """
    features = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.max_pool(features, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def post_process(inputs, ch_size=-1):
    """Add a fresh bias of length ``ch_size`` to ``inputs[0][0]`` and apply ReLU."""
    features = inputs[0][0]
    return tf.nn.relu(features + bias_variable([ch_size]))
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
multiPhase: true
tuner:
codeDir: ../../../tuners/random_nas_tuner
classFileName: random_nas_tuner.py
className: RandomNASTuner
trial:
command: python3 mnist-enas.py
codeDir: .
gpuNum: 0
nasMode: enas_mode
"""A deep MNIST classifier using convolutional layers."""
import argparse
import logging
import math
import tempfile
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import operators as op
# Not referenced by any other code visible in this script.
FLAGS = None
# Module-level logger used by every logger.debug / logger.exception call below.
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
    '''
    TensorFlow graph for an MNIST classifier whose convolution and pooling
    layers are declared as an NNI search space.

    __init__ stores the hyper-parameters and creates the input placeholders;
    build_network() adds the layers, the loss, the training op and the
    accuracy op.
    '''
    def __init__(self,
                 channel_1_num,
                 channel_2_num,
                 conv_size,
                 hidden_size,
                 pool_size,
                 learning_rate,
                 x_dim=784,
                 y_dim=10):
        '''
        Store the hyper-parameters and create the graph placeholders.

        channel_1_num, channel_2_num: channel counts handed to the first and
            second group of candidate ops in build_network().
        conv_size, pool_size: stored on the instance; build_network() in this
            class does not read them.
        hidden_size: width of the first fully connected layer.
        learning_rate: passed to tf.train.AdamOptimizer.
        x_dim: length of one flattened input image (default 784).
        y_dim: number of output classes (default 10).
        '''
        self.channel_1_num = channel_1_num
        self.channel_2_num = channel_2_num
        self.conv_size = conv_size
        self.hidden_size = hidden_size
        self.pool_size = pool_size
        self.learning_rate = learning_rate
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
        self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # Both are assigned by build_network().
        self.train_step = None
        self.accuracy = None

    def build_network(self):
        '''
        Build the network graph and declare its neural architecture search space.

        Sets self.train_step and self.accuracy.
        '''
        # Reshape to use within a convolutional neural net.
        # Last dimension is for "features" - there is only one here, since images are
        # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
        with tf.name_scope('reshape'):
            try:
                # Side length of the square image; int() truncates, so a
                # non-square x_dim is not detected here.
                input_dim = int(math.sqrt(self.x_dim))
            except:
                print(
                    'input dim cannot be sqrt and reshape. input dim: ' + str(self.x_dim))
                logger.debug(
                    'input dim cannot be sqrt and reshape. input dim: %s', str(self.x_dim))
                raise
            x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])

        # NNI annotation string (the experiment configs set useAnnotation: true).
        # It declares six mutable layers chained as
        # conv1 -> post1 -> pool1 -> conv2 -> post2 -> pool2.
        # pool2_out, used right after the string, is only named here as a
        # layer_output, so this code presumably relies on NNI rewriting the
        # annotation before the trial runs -- confirm against the NNI docs.
        # The string body is kept verbatim, hence the flush-left lines.
        """@nni.mutable_layers(
{
layer_choice: [op.conv2d(size=1, in_ch=1, out_ch=self.channel_1_num),
op.conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=7, in_ch=1, out_ch=self.channel_1_num),
op.dilated_conv(in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=3, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=5, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=7, in_ch=1, out_ch=self.channel_1_num)],
fixed_inputs: [x_image],
layer_output: conv1_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_1_num)],
fixed_inputs: [conv1_out],
layer_output: post1_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post1_out],
layer_output: pool1_out
},
{
layer_choice: [op.conv2d(size=1, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.dilated_conv(in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=5, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num)],
fixed_inputs: [pool1_out],
optional_inputs: [post1_out],
optional_input_size: [0, 1],
layer_output: conv2_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_2_num)],
fixed_inputs: [conv2_out],
layer_output: post2_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post2_out],
optional_inputs: [post1_out, pool1_out],
optional_input_size: [0, 1],
layer_output: pool2_out
}
)"""

        # Fully connected layer 1 -- flatten pool2_out and map it to
        # hidden_size features. The spatial size is read from pool2_out's
        # static shape (asserted square) instead of being assumed.
        last_dim_list = pool2_out.get_shape().as_list()
        assert(last_dim_list[1] == last_dim_list[2])
        last_dim = last_dim_list[1]
        with tf.name_scope('fc1'):
            w_fc1 = op.weight_variable(
                [last_dim * last_dim * self.channel_2_num, self.hidden_size])
            b_fc1 = op.bias_variable([self.hidden_size])

            h_pool2_flat = tf.reshape(
                pool2_out, [-1, last_dim * last_dim * self.channel_2_num])
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

        # Dropout - controls the complexity of the model, prevents co-adaptation of features.
        with tf.name_scope('dropout'):
            h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)

        # Map the hidden_size features to y_dim classes (one per digit by default).
        with tf.name_scope('fc2'):
            w_fc2 = op.weight_variable([self.hidden_size, self.y_dim])
            b_fc2 = op.bias_variable([self.y_dim])
            y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2

        with tf.name_scope('loss'):
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
        with tf.name_scope('adam_optimizer'):
            self.train_step = tf.train.AdamOptimizer(
                self.learning_rate).minimize(cross_entropy)

        with tf.name_scope('accuracy'):
            # Fraction of samples whose arg-max logit matches the one-hot label.
            correct_prediction = tf.equal(
                tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32))
def download_mnist_retry(data_dir, max_num_retries=20):
    """Load MNIST from ``data_dir``, retrying on ``AlreadyExistsError``.

    Makes at most ``max_num_retries`` attempts and sleeps one second after
    each attempt that fails with ``tf.errors.AlreadyExistsError``. Returns
    the first successfully loaded dataset (one-hot labels); raises
    ``Exception`` once every attempt has failed. Any other error
    propagates immediately.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # Back off briefly before the next attempt.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def main(params):
    '''
    Build the MNIST network, train it and report accuracy through NNI annotations.

    params: dict read with the keys 'data_dir', 'channel_1_num',
        'channel_2_num', 'conv_size', 'hidden_size', 'pool_size',
        'learning_rate', 'batch_num', 'batch_size' and 'dropout_rate'.
    '''
    # Import data
    mnist = download_mnist_retry(params['data_dir'])
    print('Mnist download data done.')
    logger.debug('Mnist download data done.')

    # Create the model
    # Build the graph for the deep net
    mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
                                 channel_2_num=params['channel_2_num'],
                                 conv_size=params['conv_size'],
                                 hidden_size=params['hidden_size'],
                                 pool_size=params['pool_size'],
                                 learning_rate=params['learning_rate'])
    mnist_network.build_network()
    logger.debug('Mnist build network done.')

    # Write log
    graph_location = tempfile.mkdtemp()
    logger.debug('Saving graph to: %s', graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    test_acc = 0.0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(params['batch_num']):
            # NNI annotation placed at the top of every training step; what
            # it expands to is not visible in this file.
            """@nni.training_update(tf, sess)"""
            batch = mnist.train.next_batch(params['batch_size'])
            # keep_prob is 1 - dropout_rate while training.
            mnist_network.train_step.run(feed_dict={mnist_network.images: batch[0],
                                                    mnist_network.labels: batch[1],
                                                    mnist_network.keep_prob: 1 - params['dropout_rate']}
                                        )

            # Every 100 steps, evaluate on the full test set with dropout
            # disabled (keep_prob 1.0).
            if i % 100 == 0:
                test_acc = mnist_network.accuracy.eval(
                    feed_dict={mnist_network.images: mnist.test.images,
                               mnist_network.labels: mnist.test.labels,
                               mnist_network.keep_prob: 1.0})

                """@nni.report_intermediate_result(test_acc)"""
                logger.debug('test accuracy %g', test_acc)
                logger.debug('Pipe send intermediate result done.')

        # Final evaluation after the last training step.
        test_acc = mnist_network.accuracy.eval(
            feed_dict={mnist_network.images: mnist.test.images,
                       mnist_network.labels: mnist.test.labels,
                       mnist_network.keep_prob: 1.0})

        """@nni.report_final_result(test_acc)"""
        logger.debug('Final result is %g', test_acc)
        logger.debug('Send final result done.')
def get_params():
    ''' Parse the trial's hyper-parameters from the command line.

    Returns the argparse namespace of known options; unrecognised
    arguments are ignored.
    '''
    # One row per option: (flag, type, default, help text or None).
    option_table = (
        ("--data_dir", str, '/tmp/tensorflow/mnist/input_data', "data directory"),
        ("--dropout_rate", float, 0.5, "dropout rate"),
        ("--channel_1_num", int, 32, None),
        ("--channel_2_num", int, 64, None),
        ("--conv_size", int, 5, None),
        ("--pool_size", int, 2, None),
        ("--hidden_size", int, 1024, None),
        ("--learning_rate", float, 1e-4, None),
        ("--batch_num", int, 2000, None),
        ("--batch_size", int, 32, None),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)
    known, _unknown = cli.parse_known_args()
    return known
if __name__ == '__main__':
    # Script entry point: turn the parsed CLI namespace into a dict and run
    # the trial.
    try:
        params = vars(get_params())
        main(params)
    except Exception as exception:
        # Log the failure with its traceback, then re-raise so it still
        # propagates to the caller.
        logger.exception(exception)
        raise
import tensorflow as tf
import math
def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` initialised from a truncated normal with stddev 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def sum_op(inputs):
    """Add the summed optional inputs onto the fixed input.

    ``inputs[0][0]`` is the fixed input and ``inputs[1]`` the optional
    inputs. The optional inputs are reduced with ``tf.reduce_sum`` over
    axis 0; when the result has rank below 1 the fixed input is returned
    unchanged. Otherwise the sum is average-pooled by
    ``ceil(optional_side / fixed_side)``, passed through a 1x1 convolution
    whose output depth is the fixed input's last dimension, and added to
    the fixed input. Both tensors must have equal sizes on axes 1 and 2
    (asserted).
    """
    base = inputs[0][0]
    extra = tf.reduce_sum(inputs[1], axis=0)
    if len(extra.get_shape()) < 1:
        # Nothing usable to merge in; pass the fixed input through.
        return base
    base_dims = base.get_shape().as_list()
    extra_dims = extra.get_shape().as_list()
    assert base_dims[1] == base_dims[2]
    assert extra_dims[1] == extra_dims[2]
    # Window and stride are the same, chosen from the ratio of the side lengths.
    window = math.ceil(extra_dims[1] / base_dims[1])
    pooled = tf.nn.avg_pool(
        extra,
        ksize=[1, window, window, 1],
        strides=[1, window, window, 1],
        padding='SAME')
    # 1x1 convolution so the last dimension matches the fixed input's.
    projection = weight_variable([1, 1, extra_dims[3], base_dims[3]])
    projected = tf.nn.conv2d(pooled, projection, strides=[1, 1, 1, 1], padding='SAME')
    return base + projected
def conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a stride-1, SAME-padded convolution with a ``size`` x ``size`` kernel.

    The input is always ``sum_op(inputs)``. Only ``size`` 1 and 3 are
    accepted; any other value raises ``Exception``.
    """
    # sum_op runs before the size check, as in the original ordering.
    features = sum_op(inputs)
    if size not in (1, 3):
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.conv2d(features, kernel, strides=[1, 1, 1, 1], padding='SAME')
def twice_conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply two stacked stride-1 convolutions with [1, size] and [size, 1] kernels.

    The first convolution outputs ``int(out_ch/2)`` channels and the second
    ``out_ch``. The input is always ``sum_op(inputs)``. Only ``size`` 3 and
    7 are accepted; any other value raises ``Exception``.
    """
    features = sum_op(inputs)
    if size not in (3, 7):
        raise Exception("Unknown filter size: %d." % size)
    mid_ch = int(out_ch/2)
    first_kernel = weight_variable([1, size, in_ch, mid_ch])
    hidden = tf.nn.conv2d(features, first_kernel, strides=[1, 1, 1, 1], padding='SAME')
    second_kernel = weight_variable([size, 1, mid_ch, out_ch])
    return tf.nn.conv2d(hidden, second_kernel, strides=[1, 1, 1, 1], padding='SAME')
def dilated_conv(inputs, size=3, in_ch=-1, out_ch=-1):
    """Apply a SAME-padded atrous convolution with rate 2 and a 3 x 3 kernel.

    The input is always ``sum_op(inputs)``. ``size`` must be 3; any other
    value raises ``Exception``.
    """
    features = sum_op(inputs)
    if size != 3:
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.atrous_conv2d(features, kernel, rate=2, padding='SAME')
def separable_conv(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a stride-1, SAME-padded separable convolution.

    Uses a ``size`` x ``size`` depthwise filter with channel multiplier 1
    followed by a 1 x 1 pointwise filter to ``out_ch`` channels. The input
    is always ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are accepted;
    any other value raises ``Exception``.
    """
    features = sum_op(inputs)
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    channel_multiplier = 1
    depthwise = weight_variable([size, size, in_ch, channel_multiplier])
    pointwise = weight_variable([1, 1, channel_multiplier*in_ch, out_ch])
    return tf.nn.separable_conv2d(
        features, depthwise, pointwise, strides=[1, 1, 1, 1], padding='SAME')
def avg_pool(inputs, size=-1):
    """Average-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    The input is always ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are
    accepted; any other value raises ``Exception``.
    """
    features = sum_op(inputs)
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.avg_pool(features, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(inputs, size=-1):
    """Max-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    The input is always ``sum_op(inputs)``. Only ``size`` 3, 5 and 7 are
    accepted; any other value raises ``Exception``.
    """
    features = sum_op(inputs)
    if size not in (3, 5, 7):
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.max_pool(features, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def post_process(inputs, ch_size=-1):
    """Add a fresh bias of length ``ch_size`` to ``inputs[0][0]`` and apply ReLU."""
    features = inputs[0][0]
    return tf.nn.relu(features + bias_variable([ch_size]))
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
codeDir: ../../../tuners/random_nas_tuner
classFileName: random_nas_tuner.py
className: RandomNASTuner
trial:
command: python3 mnist-oneshot.py
codeDir: .
gpuNum: 0
nasMode: oneshot_mode
"""A deep MNIST classifier using convolutional layers."""
import argparse
import logging
import math
import tempfile
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import operators as op
# Not referenced in the part of this script that is visible here.
FLAGS = None
# Module-level logger used by the logger.debug calls below.
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
    '''
    TensorFlow graph for an MNIST classifier whose convolution and pooling
    layers are declared as an NNI search space.

    __init__ stores the hyper-parameters and creates the input placeholders;
    build_network() adds the layers, the loss, the training op and the
    accuracy op.
    '''
    def __init__(self,
                 channel_1_num,
                 channel_2_num,
                 conv_size,
                 hidden_size,
                 pool_size,
                 learning_rate,
                 x_dim=784,
                 y_dim=10):
        '''
        Store the hyper-parameters and create the graph placeholders.

        channel_1_num, channel_2_num: channel counts handed to the first and
            second group of candidate ops in build_network().
        conv_size, pool_size: stored on the instance; build_network() in this
            class does not read them.
        hidden_size: width of the first fully connected layer.
        learning_rate: passed to tf.train.AdamOptimizer.
        x_dim: length of one flattened input image (default 784).
        y_dim: number of output classes (default 10).
        '''
        self.channel_1_num = channel_1_num
        self.channel_2_num = channel_2_num
        self.conv_size = conv_size
        self.hidden_size = hidden_size
        self.pool_size = pool_size
        self.learning_rate = learning_rate
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
        self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # Both are assigned by build_network().
        self.train_step = None
        self.accuracy = None

    def build_network(self):
        '''
        Build the network graph and declare its neural architecture search space.

        Sets self.train_step and self.accuracy.
        '''
        # Reshape to use within a convolutional neural net.
        # Last dimension is for "features" - there is only one here, since images are
        # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
        with tf.name_scope('reshape'):
            try:
                # Side length of the square image; int() truncates, so a
                # non-square x_dim is not detected here.
                input_dim = int(math.sqrt(self.x_dim))
            except:
                print(
                    'input dim cannot be sqrt and reshape. input dim: ' + str(self.x_dim))
                logger.debug(
                    'input dim cannot be sqrt and reshape. input dim: %s', str(self.x_dim))
                raise
            x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])

        # NNI annotation string (the experiment configs set useAnnotation: true).
        # It declares six mutable layers chained as
        # conv1 -> post1 -> pool1 -> conv2 -> post2 -> pool2.
        # pool2_out, used right after the string, is only named here as a
        # layer_output, so this code presumably relies on NNI rewriting the
        # annotation before the trial runs -- confirm against the NNI docs.
        # The string body is kept verbatim, hence the flush-left lines.
        """@nni.mutable_layers(
{
layer_choice: [op.conv2d(size=1, in_ch=1, out_ch=self.channel_1_num),
op.conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=3, in_ch=1, out_ch=self.channel_1_num),
op.twice_conv2d(size=7, in_ch=1, out_ch=self.channel_1_num),
op.dilated_conv(in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=3, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=5, in_ch=1, out_ch=self.channel_1_num),
op.separable_conv(size=7, in_ch=1, out_ch=self.channel_1_num)],
fixed_inputs: [x_image],
layer_output: conv1_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_1_num)],
fixed_inputs: [conv1_out],
layer_output: post1_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post1_out],
layer_output: pool1_out
},
{
layer_choice: [op.conv2d(size=1, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.twice_conv2d(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.dilated_conv(in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=3, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=5, in_ch=self.channel_1_num, out_ch=self.channel_2_num),
op.separable_conv(size=7, in_ch=self.channel_1_num, out_ch=self.channel_2_num)],
fixed_inputs: [pool1_out],
optional_inputs: [post1_out],
optional_input_size: [0, 1],
layer_output: conv2_out
},
{
layer_choice: [op.post_process(ch_size=self.channel_2_num)],
fixed_inputs: [conv2_out],
layer_output: post2_out
},
{
layer_choice: [op.max_pool(size=3),
op.max_pool(size=5),
op.max_pool(size=7),
op.avg_pool(size=3),
op.avg_pool(size=5),
op.avg_pool(size=7)],
fixed_inputs: [post2_out],
optional_inputs: [post1_out, pool1_out],
optional_input_size: [0, 1],
layer_output: pool2_out
}
)"""

        # Fully connected layer 1 -- flatten pool2_out and map it to
        # hidden_size features. The spatial size is read from pool2_out's
        # static shape (asserted square) instead of being assumed.
        last_dim_list = pool2_out.get_shape().as_list()
        assert(last_dim_list[1] == last_dim_list[2])
        last_dim = last_dim_list[1]
        with tf.name_scope('fc1'):
            w_fc1 = op.weight_variable(
                [last_dim * last_dim * self.channel_2_num, self.hidden_size])
            b_fc1 = op.bias_variable([self.hidden_size])

            h_pool2_flat = tf.reshape(
                pool2_out, [-1, last_dim * last_dim * self.channel_2_num])
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

        # Dropout - controls the complexity of the model, prevents co-adaptation of features.
        with tf.name_scope('dropout'):
            h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)

        # Map the hidden_size features to y_dim classes (one per digit by default).
        with tf.name_scope('fc2'):
            w_fc2 = op.weight_variable([self.hidden_size, self.y_dim])
            b_fc2 = op.bias_variable([self.y_dim])
            y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2

        with tf.name_scope('loss'):
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
        with tf.name_scope('adam_optimizer'):
            self.train_step = tf.train.AdamOptimizer(
                self.learning_rate).minimize(cross_entropy)

        with tf.name_scope('accuracy'):
            # Fraction of samples whose arg-max logit matches the one-hot label.
            correct_prediction = tf.equal(
                tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32))
def download_mnist_retry(data_dir, max_num_retries=20):
    """Load MNIST from ``data_dir`` with one-hot labels, retrying on collisions.

    ``input_data.read_data_sets`` is attempted up to ``max_num_retries`` times.
    Only ``tf.errors.AlreadyExistsError`` triggers a retry (after a one second
    pause); any other error propagates immediately.
    NOTE(review): presumably AlreadyExistsError shows up when several trials
    fetch into the same directory at once -- confirm.

    Returns the dataset object produced by ``read_data_sets``.
    Raises ``Exception`` when every attempt ended in AlreadyExistsError.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # Wait briefly before the next attempt.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def main(params):
    '''
    Main function, build mnist network, run and send result to NNI.

    params is a mapping that must provide the keys read below:
    'data_dir', 'channel_1_num', 'channel_2_num', 'conv_size', 'hidden_size',
    'pool_size', 'learning_rate', 'batch_num', 'batch_size' and 'dropout_rate'.

    Test accuracy is evaluated every 100 training batches and once more after
    the last batch; both evaluations are followed by an NNI annotation string.
    '''
    # Import data
    mnist = download_mnist_retry(params['data_dir'])
    print('Mnist download data done.')
    logger.debug('Mnist download data done.')

    # Create the model
    # Build the graph for the deep net
    mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
                                 channel_2_num=params['channel_2_num'],
                                 conv_size=params['conv_size'],
                                 hidden_size=params['hidden_size'],
                                 pool_size=params['pool_size'],
                                 learning_rate=params['learning_rate'])
    mnist_network.build_network()
    logger.debug('Mnist build network done.')

    # Write log
    graph_location = tempfile.mkdtemp()
    logger.debug('Saving graph to: %s', graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    try:
        train_writer.add_graph(tf.get_default_graph())
    finally:
        # The writer is not used again; close it so the graph event is flushed
        # to disk and the underlying file handle is released.
        train_writer.close()

    # The evaluation feed is identical for every accuracy check (dropout is
    # disabled by keeping every unit), so build it once.
    test_feed = {mnist_network.images: mnist.test.images,
                 mnist_network.labels: mnist.test.labels,
                 mnist_network.keep_prob: 1.0}

    test_acc = 0.0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(params['batch_num']):
            batch = mnist.train.next_batch(params['batch_size'])
            mnist_network.train_step.run(feed_dict={mnist_network.images: batch[0],
                                                    mnist_network.labels: batch[1],
                                                    mnist_network.keep_prob: 1 - params['dropout_rate']}
                                         )

            if i % 100 == 0:
                test_acc = mnist_network.accuracy.eval(feed_dict=test_feed)

                # NNI annotation marker: the bare string must stay exactly as is.
                """@nni.report_intermediate_result(test_acc)"""
                logger.debug('test accuracy %g', test_acc)
                logger.debug('Pipe send intermediate result done.')

        test_acc = mnist_network.accuracy.eval(feed_dict=test_feed)

        # NNI annotation marker: the bare string must stay exactly as is.
        """@nni.report_final_result(test_acc)"""
        logger.debug('Final result is %g', test_acc)
        logger.debug('Send final result done.')
def get_params():
    '''
    Parse the trial's command-line options and return them as a namespace.

    Options that the parser does not recognise are ignored rather than
    rejected, because parse_known_args is used.
    '''
    # (flag, add_argument keyword arguments) in registration order.
    option_specs = (
        ("--data_dir", {"type": str, "default": '/tmp/tensorflow/mnist/input_data', "help": "data directory"}),
        ("--dropout_rate", {"type": float, "default": 0.5, "help": "dropout rate"}),
        ("--channel_1_num", {"type": int, "default": 32}),
        ("--channel_2_num", {"type": int, "default": 64}),
        ("--conv_size", {"type": int, "default": 5}),
        ("--pool_size", {"type": int, "default": 2}),
        ("--hidden_size", {"type": int, "default": 1024}),
        ("--learning_rate", {"type": float, "default": 1e-4}),
        ("--batch_num", {"type": int, "default": 2000}),
        ("--batch_size", {"type": int, "default": 32}),
    )

    parser = argparse.ArgumentParser()
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)

    known_args, _unrecognised = parser.parse_known_args()
    return known_args
if __name__ == '__main__':
    # Script entry point: run the trial with the parsed command-line options.
    # Any failure is written to the log with its traceback and then re-raised,
    # so the process still terminates with the original error.
    try:
        main(vars(get_params()))
    except Exception as err:
        logger.exception(err)
        raise
import tensorflow as tf
import math
def weight_variable(shape):
    """Return a tf.Variable of ``shape`` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a tf.Variable of ``shape`` with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def sum_op(inputs):
    """Fold the optional inputs into the fixed input by element-wise addition.

    ``inputs[0][0]`` is the fixed tensor and ``inputs[1]`` the list of optional
    tensors. The optional tensors are concatenated along axis 3, average-pooled
    with a window and stride of ceil(optional_side / fixed_side), mapped through
    a 1x1 convolution to the fixed tensor's axis-3 size, and added to the fixed
    tensor.

    Both tensors must have equal sizes on axes 1 and 2 (checked with asserts).
    NOTE(review): this reads as NHWC layout with square feature maps -- confirm.
    """
    fixed_input = inputs[0][0]
    optional_input = tf.concat(inputs[1], axis=3)

    fixed_dims = fixed_input.get_shape().as_list()
    optional_dims = optional_input.get_shape().as_list()
    assert fixed_dims[1] == fixed_dims[2]
    assert optional_dims[1] == optional_dims[2]

    # Shrink the optional feature map towards the fixed input's spatial size.
    window = math.ceil(optional_dims[1] / fixed_dims[1])
    pool_geometry = [1, window, window, 1]
    pooled = tf.nn.avg_pool(optional_input, ksize=pool_geometry, strides=pool_geometry, padding='SAME')

    # 1x1 convolution so the axis-3 size matches the fixed input before adding.
    projection = weight_variable([1, 1, optional_dims[3], fixed_dims[3]])
    projected = tf.nn.conv2d(pooled, projection, strides=[1, 1, 1, 1], padding='SAME')
    return fixed_input + projected
def conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a ``size`` x ``size`` convolution with stride 1 and SAME padding.

    The layer input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    the result of ``sum_op(inputs)``. Only sizes 1 and 3 are accepted; any
    other value raises ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in [1, 3]:
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.conv2d(x_input, kernel, strides=[1, 1, 1, 1], padding='SAME')
def twice_conv2d(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a 1 x ``size`` convolution followed by a ``size`` x 1 convolution.

    The first convolution maps ``in_ch`` to ``int(out_ch / 2)`` channels and
    the second maps that to ``out_ch``; both use stride 1 and SAME padding.
    The layer input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only sizes 3 and 7 are accepted; any other value
    raises ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in [3, 7]:
        raise Exception("Unknown filter size: %d." % size)

    unit_strides = [1, 1, 1, 1]
    row_kernel = weight_variable([1, size, in_ch, int(out_ch/2)])
    row_out = tf.nn.conv2d(x_input, row_kernel, strides=unit_strides, padding='SAME')
    column_kernel = weight_variable([size, 1, int(out_ch/2), out_ch])
    return tf.nn.conv2d(row_out, column_kernel, strides=unit_strides, padding='SAME')
def dilated_conv(inputs, size=3, in_ch=-1, out_ch=-1):
    """Apply a 3 x 3 atrous (dilated) convolution with rate 2 and SAME padding.

    The layer input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. ``size`` must be 3; any other value raises
    ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size != 3:
        raise Exception("Unknown filter size: %d." % size)
    kernel = weight_variable([size, size, in_ch, out_ch])
    return tf.nn.atrous_conv2d(x_input, kernel, rate=2, padding='SAME')
def separable_conv(inputs, size=-1, in_ch=-1, out_ch=-1):
    """Apply a depthwise-separable convolution with stride 1 and SAME padding.

    A ``size`` x ``size`` depthwise filter (channel multiplier 1) is followed
    by a 1 x 1 pointwise filter producing ``out_ch`` channels. The layer input
    is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only sizes 3, 5 and 7 are accepted; any other value
    raises ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in [3, 5, 7]:
        raise Exception("Unknown filter size: %d." % size)

    channel_multiplier = 1
    depthwise_filter = weight_variable([size, size, in_ch, channel_multiplier])
    pointwise_filter = weight_variable([1, 1, channel_multiplier*in_ch, out_ch])
    return tf.nn.separable_conv2d(
        x_input, depthwise_filter, pointwise_filter, strides=[1, 1, 1, 1], padding='SAME')
def avg_pool(inputs, size=-1):
    """Average-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    Because the stride is 1, the pooling window slides one position at a time.
    The layer input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only sizes 3, 5 and 7 are accepted; any other value
    raises ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in [3, 5, 7]:
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.avg_pool(x_input, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(inputs, size=-1):
    """Max-pool with a ``size`` x ``size`` window, stride 1 and SAME padding.

    Because the stride is 1, the pooling window slides one position at a time.
    The layer input is ``inputs[0][0]`` when ``inputs[1]`` is empty, otherwise
    ``sum_op(inputs)``. Only sizes 3, 5 and 7 are accepted; any other value
    raises ``Exception``.
    """
    x_input = sum_op(inputs) if inputs[1] else inputs[0][0]
    if size not in [3, 5, 7]:
        raise Exception("Unknown filter size: %d." % size)
    window = [1, size, size, 1]
    return tf.nn.max_pool(x_input, ksize=window, strides=[1, 1, 1, 1], padding='SAME')
def post_process(inputs, ch_size=-1):
    """Add a bias of length ``ch_size`` to ``inputs[0][0]`` and apply ReLU.

    Only the first fixed input is used; optional inputs are ignored.
    """
    biased = inputs[0][0] + bias_variable([ch_size])
    return tf.nn.relu(biased)
...@@ -52,7 +52,7 @@ export namespace ValidationSchemas { ...@@ -52,7 +52,7 @@ export namespace ValidationSchemas {
virtualCluster: joi.string(), virtualCluster: joi.string(),
shmMB: joi.number(), shmMB: joi.number(),
authFile: joi.string(), authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
worker: joi.object({ worker: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
......
...@@ -77,8 +77,6 @@ class PAITrainingService implements TrainingService { ...@@ -77,8 +77,6 @@ class PAITrainingService implements TrainingService {
private versionCheck: boolean = true; private versionCheck: boolean = true;
private logCollection: string; private logCollection: string;
private isMultiPhase: boolean = false; private isMultiPhase: boolean = false;
private hdfsCodeDir?: string;
private hdfsOutputDir?: string;
constructor() { constructor() {
this.log = getLogger(); this.log = getLogger();
...@@ -154,13 +152,13 @@ class PAITrainingService implements TrainingService { ...@@ -154,13 +152,13 @@ class PAITrainingService implements TrainingService {
//TODO: use HDFS working folder instead //TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
this.hdfsCodeDir = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
this.hdfsOutputDir = unixPathJoin(this.hdfsCodeDir, 'nnioutput'); const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput');
const hdfsLogPath : string = String.Format( const hdfsLogPath : string = String.Format(
PAI_LOG_PATH_FORMAT, PAI_LOG_PATH_FORMAT,
this.paiClusterConfig.host, this.paiClusterConfig.host,
this.hdfsOutputDir hdfsOutputDir
); );
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
...@@ -365,14 +363,6 @@ class PAITrainingService implements TrainingService { ...@@ -365,14 +363,6 @@ class PAITrainingService implements TrainingService {
throw new Error('PAI token is not initialized'); throw new Error('PAI token is not initialized');
} }
if (this.hdfsCodeDir === undefined) {
throw new Error('hdfsCodeDir is not initialized');
}
if (this.hdfsOutputDir === undefined) {
throw new Error('hdfsOutputDir is not initialized');
}
if (this.paiRestServerPort === undefined) { if (this.paiRestServerPort === undefined) {
const restServer: PAIJobRestServer = component.get(PAIJobRestServer); const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
this.paiRestServerPort = restServer.clusterRestServerPort; this.paiRestServerPort = restServer.clusterRestServerPort;
...@@ -401,7 +391,8 @@ class PAITrainingService implements TrainingService { ...@@ -401,7 +391,8 @@ class PAITrainingService implements TrainingService {
trialForm.hyperParameters.value, { encoding: 'utf8' } trialForm.hyperParameters.value, { encoding: 'utf8' }
); );
} }
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput');
// tslint:disable-next-line: strict-boolean-expressions // tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
const version: string = this.versionCheck ? await getVersion() : ''; const version: string = this.versionCheck ? await getVersion() : '';
...@@ -417,7 +408,7 @@ class PAITrainingService implements TrainingService { ...@@ -417,7 +408,7 @@ class PAITrainingService implements TrainingService {
this.paiTrialConfig.command, this.paiTrialConfig.command,
nniManagerIp, nniManagerIp,
this.paiRestServerPort, this.paiRestServerPort,
this.hdfsOutputDir, hdfsOutputDir,
this.paiClusterConfig.host, this.paiClusterConfig.host,
this.paiClusterConfig.userName, this.paiClusterConfig.userName,
HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName),
...@@ -452,7 +443,7 @@ class PAITrainingService implements TrainingService { ...@@ -452,7 +443,7 @@ class PAITrainingService implements TrainingService {
// Docker image // Docker image
this.paiTrialConfig.image, this.paiTrialConfig.image,
// codeDir // codeDir
`$PAI_DEFAULT_FS_URI${this.hdfsCodeDir}`, `$PAI_DEFAULT_FS_URI${hdfsCodeDir}`,
// PAI Task roles // PAI Task roles
paiTaskRoles, paiTaskRoles,
// Add Virutal Cluster // Add Virutal Cluster
...@@ -463,9 +454,9 @@ class PAITrainingService implements TrainingService { ...@@ -463,9 +454,9 @@ class PAITrainingService implements TrainingService {
// Step 2. Upload code files in codeDir onto HDFS // Step 2. Upload code files in codeDir onto HDFS
try { try {
await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, this.hdfsCodeDir, this.hdfsClient); await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient);
} catch (error) { } catch (error) {
this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${this.hdfsCodeDir} failed, error is ${error}`); this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`);
trialJobDetail.status = 'FAILED'; trialJobDetail.status = 'FAILED';
deferred.resolve(true); deferred.resolve(true);
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
from .trial import * from .trial import *
from .smartparam import * from .smartparam import *
from .nas_utils import reload_tensorflow_variables from .nas_utils import training_update
class NoMoreTrialError(Exception): class NoMoreTrialError(Exception):
def __init__(self,ErrorInfo): def __init__(self,ErrorInfo):
......
...@@ -97,9 +97,8 @@ class GridSearchTuner(Tuner): ...@@ -97,9 +97,8 @@ class GridSearchTuner(Tuner):
def _parse_quniform(self, param_value): def _parse_quniform(self, param_value):
'''parse type of quniform parameter and return a list''' '''parse type of quniform parameter and return a list'''
low, high, interval = param_value[0], param_value[1], param_value[2] low, high, q = param_value[0], param_value[1], param_value[2]
count = int(np.floor((high - low) / interval)) + 1 return np.clip(np.arange(np.round(low/q), np.round(high/q)+1) * q, low, high)
return [low + interval * i for i in range(count)]
def _parse_randint(self, param_value): def _parse_randint(self, param_value):
'''parse type of randint parameter and return a list''' '''parse type of randint parameter and return a list'''
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment