Commit 252f36f8 authored by Deshui Yu's avatar Deshui Yu

NNI dogfood version 1

parent 781cea26
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(str_input):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
'''
Remove "a|an|the"
'''
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
'''
Remove unnecessary whitespace.
'''
return ' '.join(text.split())
def remove_punc(text):
'''
Remove punctuation.
'''
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
'''
Convert the string to lower case.
'''
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(str_input))))
def f1_score(prediction, ground_truth):
'''
Calculate the f1 score.
'''
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
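# Worked example of the F1 computation above (illustrative only):
# f1_score('The cat sat.', 'a cat sat on the mat')
# -> normalized tokens ['cat', 'sat'] vs. ['cat', 'sat', 'on', 'mat']
# -> precision = 2/2, recall = 2/4, F1 = 2 * 1.0 * 0.5 / 1.5 ~= 0.67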
def exact_match_score(prediction, ground_truth):
'''
Return whether the normalized prediction exactly matches the normalized ground truth.
'''
return normalize_answer(prediction) == normalize_answer(ground_truth)
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
'''
Return the maximum metric value over all ground truths.
'''
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def _evaluate(dataset, predictions):
'''
Evaluate predictions against the dataset; return exact match and F1 percentages.
'''
f1 = exact_match = total = 0
count = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
#print(message, file=sys.stderr)
count += 1
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
print('total', total, 'exact_match', exact_match, 'unanswered_questions', count)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
def evaluate(data_file, pred_file):
'''
Evaluate the prediction file against the data file; return the exact match score.
'''
expected_version = '1.1'
with open(data_file) as dataset_file:
dataset_json = json.load(dataset_file)
if dataset_json['version'] != expected_version:
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(pred_file) as prediction_file:
predictions = json.load(prediction_file)
# print(json.dumps(evaluate(dataset, predictions)))
result = _evaluate(dataset, predictions)
# print('em:', result['exact_match'], 'f1:', result['f1'])
return result['exact_match']
def evaluate_with_predictions(data_file, predictions):
'''
Evaluate with in-memory predictions.
'''
expected_version = '1.1'
with open(data_file) as dataset_file:
dataset_json = json.load(dataset_file)
if dataset_json['version'] != expected_version:
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
result = _evaluate(dataset, predictions)
return result['exact_match']
if __name__ == '__main__':
EXPECT_VERSION = '1.1'
parser = argparse.ArgumentParser(
description='Evaluation for SQuAD ' + EXPECT_VERSION)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
print(evaluate(args.dataset_file, args.prediction_file))
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Graph is a custom-defined class; this module contains classes and functions related to graphs.
'''
import copy
import json
import random
from enum import Enum, unique
@unique
class LayerType(Enum):
'''
Layer type enumeration.
'''
attention = 0
self_attention = 1
rnn = 2
input = 3
output = 4
class Layer(object):
'''
Layer class, which contains the information of a single layer in the graph.
'''
def __init__(self, graph_type, input=None, output=None, size=None):
self.input = input if input is not None else []
self.output = output if output is not None else []
self.graph_type = graph_type
self.is_delete = False
self.size = size
if graph_type == LayerType.attention.value:
self.input_size = 2
self.output_size = 1
elif graph_type == LayerType.rnn.value:
self.input_size = 1
self.output_size = 1
elif graph_type == LayerType.self_attention.value:
self.input_size = 1
self.output_size = 1
elif graph_type == LayerType.input.value:
self.input_size = 0
self.output_size = 1
elif graph_type == LayerType.output.value:
self.input_size = 1
self.output_size = 0
else:
print(graph_type)
def set_size(self, graph_id, size):
'''
Set the size propagated from upstream layer graph_id; return False if an output layer sees a mismatched size.
'''
if self.graph_type == LayerType.attention.value:
if self.input[0] == graph_id:
self.size = size
if self.graph_type == LayerType.rnn.value:
self.size = size
if self.graph_type == LayerType.self_attention.value:
self.size = size
if self.graph_type == LayerType.output.value:
if self.size != size:
return False
return True
def clear_size(self):
'''
Clear the size of attention, RNN and self-attention layers.
'''
if self.graph_type in (LayerType.attention.value,
LayerType.rnn.value, LayerType.self_attention.value):
self.size = None
def __str__(self):
return 'input:' + str(self.input) + ' output:' + str(self.output) + ' type:' + str(
self.graph_type) + ' is_delete:' + str(self.is_delete) + ' size:' + str(self.size)
def graph_dumps(graph):
'''
Dump the graph.
'''
return json.dumps(graph, default=lambda obj: obj.__dict__)
def graph_loads(graph_json):
'''
Load a graph from its parsed JSON dict.
'''
layers = []
for layer in graph_json['layers']:
layer_info = Layer(layer['type'], layer['input'], layer['output'], layer['size'])
layer_info.is_delete = layer['is_delete']
layers.append(layer_info)
graph = Graph(graph_json['max_layer_num'], [], [], [])
graph.layers = layers
return graph
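# Note on the serialization pair above: graph_dumps returns a JSON string while
# graph_loads expects an already-parsed dict, so a round trip looks like:
# graph = graph_loads(json.loads(graph_dumps(graph)))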
class Graph(object):
'''
Custom graph class.
'''
def __init__(self, max_layer_num, input, output, hide):
self.layers = []
self.max_layer_num = max_layer_num
for layer in input:
self.layers.append(layer)
for layer in output:
self.layers.append(layer)
if hide is not None:
for layer in hide:
self.layers.append(layer)
assert self.is_legal()
def is_topology(self, layers=None):
'''
Validate the topology; return the topological order (levels separated by '|') or False.
'''
if layers is None:
layers = self.layers
layers_nodle = []
result = []
for i, layer in enumerate(layers):
if layer.is_delete is False:
layers_nodle.append(i)
while True:
flag_break = True
layers_toremove = []
for layer1 in layers_nodle:
flag_arrive = True
for layer2 in layers[layer1].input:
if layer2 in layers_nodle:
flag_arrive = False
if flag_arrive is True:
for layer2 in layers[layer1].output:
# Size mismatch
if layers[layer2].set_size(layer1, layers[layer1].size) is False:
return False
layers_toremove.append(layer1)
result.append(layer1)
flag_break = False
for layer in layers_toremove:
layers_nodle.remove(layer)
result.append('|')
if flag_break:
break
# There is a loop in the graph, or some layers are unreachable
if layers_nodle:
return False
return result
def layer_num(self, layers=None):
'''
Return the number of hidden layers.
'''
if layers is None:
layers = self.layers
layer_num = 0
for layer in layers:
if layer.is_delete is False and layer.graph_type != LayerType.input.value\
and layer.graph_type != LayerType.output.value:
layer_num += 1
return layer_num
def is_legal(self, layers=None):
'''
Judge whether the layer configuration is legal.
'''
if layers is None:
layers = self.layers
for layer in layers:
if layer.is_delete is False:
if len(layer.input) != layer.input_size:
return False
if len(layer.output) < layer.output_size:
return False
# layer_num <= max_layer_num
if self.layer_num(layers) > self.max_layer_num:
return False
# There is a loop in the graph, or some layers are unreachable
if self.is_topology(layers) is False:
return False
return True
def mutation(self, only_add=False):
'''
Mutate the graph by adding or deleting a layer.
'''
types = []
if self.layer_num() < self.max_layer_num:
types.append(0)
types.append(1)
if self.layer_num() > 5 and only_add is False:
types.append(2)
types.append(3)
# 0 : add a layer , delete a edge
# 1 : add a layer , change a edge
# 2 : delete a layer, delete a edge
# 3 : delete a layer, change a edge
graph_type = random.choice(types)
layer_type = random.choice([LayerType.attention.value,\
LayerType.self_attention.value, LayerType.rnn.value])
layers = copy.deepcopy(self.layers)
cnt_try = 0
while True:
layers_in = []
layers_out = []
layers_del = []
for i, layer in enumerate(layers):
if layer.is_delete is False:
if layer.graph_type != LayerType.output.value:
layers_in.append(i)
if layer.graph_type != LayerType.input.value:
layers_out.append(i)
if layer.graph_type != LayerType.output.value\
and layer.graph_type != LayerType.input.value:
layers_del.append(i)
if graph_type <= 1:
new_id = len(layers)
out = random.choice(layers_out)
input = []
output = [out]
pos = random.randint(0, len(layers[out].input) - 1)
last_in = layers[out].input[pos]
layers[out].input[pos] = new_id
if graph_type == 0:
layers[last_in].output.remove(out)
if graph_type == 1:
layers[last_in].output.remove(out)
layers[last_in].output.append(new_id)
input = [last_in]
lay = Layer(graph_type=layer_type, input=input, output=output)
while len(input) < lay.input_size:
layer1 = random.choice(layers_in)
input.append(layer1)
layers[layer1].output.append(new_id)
lay.input = input
layers.append(lay)
else:
layer1 = random.choice(layers_del)
for layer2 in layers[layer1].output:
layers[layer2].input.remove(layer1)
if graph_type == 2:
random_in = random.choice(layers_in)
else:
random_in = random.choice(layers[layer1].input)
layers[layer2].input.append(random_in)
layers[random_in].output.append(layer2)
for layer2 in layers[layer1].input:
layers[layer2].output.remove(layer1)
layers[layer1].is_delete = True
if self.is_legal(layers):
self.layers = layers
break
else:
layers = copy.deepcopy(self.layers)
cnt_try += 1
def __str__(self):
info = ""
for l_id, layer in enumerate(self.layers):
if layer.is_delete is False:
info += 'id:%d ' % l_id + str(layer) + '\n'
return info
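# Hedged usage sketch (not part of the original file): build a minimal legal
# graph with two inputs (0, 1), two outputs (2, 3) and two RNN layers (4, 5),
# then apply one additive mutation step.
# layers = [Layer(LayerType.input.value, input=[], output=[4]),
#           Layer(LayerType.input.value, input=[], output=[5]),
#           Layer(LayerType.output.value, input=[4], output=[]),
#           Layer(LayerType.output.value, input=[5], output=[]),
#           Layer(LayerType.rnn.value, input=[0], output=[2]),
#           Layer(LayerType.rnn.value, input=[1], output=[3])]
# graph = Graph(max_layer_num=10, input=layers[:2], output=layers[2:4], hide=layers[4:])
# graph.mutation(only_add=True)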
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import tensorflow as tf
from rnn import XGRUCell
from util import dropout
from graph import LayerType
def normalize(inputs,
epsilon=1e-8,
scope="ln"):
'''Applies layer normalization.
Args:
inputs: A tensor with 2 or more dimensions, where the first dimension has
`batch_size`.
epsilon: A small float added to the variance to prevent division by zero.
scope: Optional scope for `variable_scope`.
Returns:
A tensor with the same shape and dtype as `inputs`.
'''
with tf.variable_scope(scope):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]
mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.Variable(tf.zeros(params_shape))
gamma = tf.Variable(tf.ones(params_shape))
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta
return outputs
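# The block above is standard layer normalization over the last axis:
# y = gamma * (x - mean) / sqrt(variance + epsilon) + beta,
# with gamma initialized to ones and beta to zeros.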
def multihead_attention(queries,
keys,
scope="multihead_attention",
num_units=None,
num_heads=4,
dropout_rate=0,
is_training=True,
causality=False):
'''Applies multihead attention.
Args:
queries: A 3d tensor with shape of [N, T_q, C_q].
keys: A 3d tensor with shape of [N, T_k, C_k].
num_units: A scalar. Attention size.
dropout_rate: A floating point number.
is_training: Boolean. Controls the dropout mechanism.
causality: Boolean. If true, units that reference the future are masked.
num_heads: An int. Number of heads.
scope: Optional scope for `variable_scope`.
Returns:
A 3d tensor with shape of (N, T_q, C).
'''
with tf.variable_scope(scope):
# Set the fall back option for num_units
if num_units is None:
num_units = queries.get_shape().as_list()[-1]
Q_ = []
K_ = []
V_ = []
for _ in range(num_heads):
Q = tf.layers.dense(queries, num_units / num_heads,
activation=tf.nn.relu) # (N, T_q, C)
K = tf.layers.dense(keys, num_units / num_heads,
activation=tf.nn.relu) # (N, T_k, C)
V = tf.layers.dense(keys, num_units / num_heads,
activation=tf.nn.relu) # (N, T_k, C)
Q_.append(Q)
K_.append(K)
V_.append(V)
# Split and concat
Q_ = tf.concat(Q_, axis=0) # (h*N, T_q, C/h)
K_ = tf.concat(K_, axis=0) # (h*N, T_k, C/h)
V_ = tf.concat(V_, axis=0) # (h*N, T_k, C/h)
# Multiplication
outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
# Scale
outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
# Key Masking
key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k)
key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
key_masks = tf.tile(tf.expand_dims(key_masks, 1),
[1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
outputs = tf.where(tf.equal(key_masks, 0), paddings,
outputs) # (h*N, T_q, T_k)
# Causality = Future blinding
if causality:
diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
tril = tf.contrib.linalg.LinearOperatorTriL(
diag_vals).to_dense() # (T_q, T_k)
masks = tf.tile(tf.expand_dims(tril, 0),
[tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
outputs = tf.where(tf.equal(masks, 0), paddings,
outputs) # (h*N, T_q, T_k)
# Activation
outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
# Query Masking
query_masks = tf.sign(
tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q)
query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
query_masks = tf.tile(tf.expand_dims(
query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
outputs *= query_masks # broadcasting. (N, T_q, C)
# Dropouts
outputs = dropout(outputs, dropout_rate, is_training)
# Weighted sum
outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
# Restore shape
outputs = tf.concat(tf.split(outputs, num_heads,
axis=0), axis=2) # (N, T_q, C)
# Residual connection
if queries.get_shape().as_list()[-1] == num_units:
outputs += queries
# Normalize
outputs = normalize(outputs, scope=scope) # (N, T_q, C)
return outputs
def positional_encoding(inputs,
num_units=None,
zero_pad=True,
scale=True,
scope="positional_encoding",
reuse=None):
'''
Return sinusoidal positional embedding.
'''
Shape = tf.shape(inputs)
N = Shape[0]
T = Shape[1]
num_units = Shape[2]
with tf.variable_scope(scope, reuse=reuse):
position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])
# First part of the PE function: sin and cos argument
# Second part, apply the cosine to even columns and sin to odds.
X = tf.expand_dims(tf.cast(tf.range(T), tf.float32), axis=1)
Y = tf.expand_dims(
tf.cast(10000 ** -(2 * tf.range(num_units) / num_units), tf.float32), axis=0)
h1 = tf.cast((tf.range(num_units) + 1) % 2, tf.float32)
h2 = tf.cast((tf.range(num_units) % 2), tf.float32)
position_enc = tf.multiply(X, Y)
position_enc = tf.sin(position_enc) * tf.multiply(tf.ones_like(X), h1) + \
tf.cos(position_enc) * tf.multiply(tf.ones_like(X), h2)
# Convert to a tensor
lookup_table = position_enc
if zero_pad:
lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
lookup_table[1:, :]), 0)
outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
if scale:
outputs = outputs * tf.sqrt(tf.cast(num_units, tf.float32))
return outputs
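# The code above is a variant of the Transformer sinusoidal encoding: column k
# uses angular frequency 10000^(-2k/d), with sin on even columns (mask h1) and
# cos on odd columns (mask h2), i.e. PE(pos, k) = sin_or_cos(pos * 10000^(-2k/d)).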
def feedforward(inputs,
num_units,
scope="multihead_attention"):
'''Point-wise feed forward net.
Args:
inputs: A 3d tensor with shape of [N, T, C].
num_units: A list of two integers.
scope: Optional scope for `variable_scope`.
Returns:
A 3d tensor with the same shape and dtype as inputs
'''
with tf.variable_scope(scope):
# Inner layer
params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
"activation": tf.nn.relu, "use_bias": True}
outputs = tf.layers.conv1d(**params)
# Readout layer
params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
"activation": None, "use_bias": True}
outputs = tf.layers.conv1d(**params)
# Residual connection
outputs += inputs
# Normalize
outputs = normalize(outputs)
return outputs
def rnn(input_states, sequence_lengths, dropout_rate, is_training, num_units):
'''
Bidirectional XGRUCell RNN over batch-major inputs; returns the batch-major
concatenation of the forward and backward states.
'''
layer_cnt = 1
states = []
xs = tf.transpose(input_states, perm=[1, 0, 2])
for i in range(0, layer_cnt):
xs = dropout(xs, dropout_rate, is_training)
with tf.variable_scope('layer_' + str(i)):
cell_fw = XGRUCell(num_units)
cell_bw = XGRUCell(num_units)
outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=cell_fw,
cell_bw=cell_bw,
dtype=tf.float32,
sequence_length=sequence_lengths,
inputs=xs,
time_major=True)
y_lr, y_rl = outputs
xs = tf.concat([y_lr, y_rl], 2)
states.append(xs)
return tf.transpose(dropout(tf.concat(states, axis=2),
dropout_rate,
is_training), perm=[1, 0, 2])
def graph_to_network(input1,
input2,
input1_lengths,
input2_lengths,
graph,
dropout_rate,
is_training,
num_heads=1,
rnn_units=256):
'''
Assemble the TensorFlow network described by `graph`, wiring attention,
self-attention and RNN layers in topological order, and return the tensors
feeding the two output layers.
'''
topology = graph.is_topology()
layers = dict()
layers_sequence_lengths = dict()
num_units = input1.get_shape().as_list()[-1]
layers[0] = input1*tf.sqrt(tf.cast(num_units, tf.float32)) + \
positional_encoding(input1, scale=False, zero_pad=False)
layers[1] = input2*tf.sqrt(tf.cast(num_units, tf.float32))
layers[0] = dropout(layers[0], dropout_rate, is_training)
layers[1] = dropout(layers[1], dropout_rate, is_training)
layers_sequence_lengths[0] = input1_lengths
layers_sequence_lengths[1] = input2_lengths
for _, topo_i in enumerate(topology):
if topo_i == '|':
continue
if graph.layers[topo_i].graph_type == LayerType.input.value:
continue
elif graph.layers[topo_i].graph_type == LayerType.attention.value:
with tf.variable_scope('attention_%d' % topo_i):
layer = multihead_attention(layers[graph.layers[topo_i].input[0]],
layers[graph.layers[topo_i].input[1]],
scope="multihead_attention%d" % topo_i,
dropout_rate=dropout_rate,
is_training=is_training,
num_heads=num_heads,
num_units=rnn_units * 2)
layer = feedforward(layer, scope="feedforward%d" % topo_i,
num_units=[rnn_units * 2 * 4, rnn_units * 2])
layers[topo_i] = layer
layers_sequence_lengths[topo_i] = layers_sequence_lengths[
graph.layers[topo_i].input[0]]
elif graph.layers[topo_i].graph_type == LayerType.self_attention.value:
with tf.variable_scope('self-attention_%d' % topo_i):
layer = multihead_attention(layers[graph.layers[topo_i].input[0]],
layers[graph.layers[topo_i].input[0]],
scope="multihead_attention%d" % topo_i,
dropout_rate=dropout_rate,
is_training=is_training,
num_heads=num_heads,
num_units=rnn_units * 2)
layer = feedforward(layer, scope="feedforward%d" % topo_i,
num_units=[rnn_units * 2 * 4, rnn_units * 2])
layers[topo_i] = layer
layers_sequence_lengths[topo_i] = layers_sequence_lengths[
graph.layers[topo_i].input[0]]
elif graph.layers[topo_i].graph_type == LayerType.rnn.value:
with tf.variable_scope('rnn_%d' % topo_i):
layer = rnn(layers[graph.layers[topo_i].input[0]],
layers_sequence_lengths[graph.layers[topo_i].input[0]],
dropout_rate,
is_training,
rnn_units)
layers[topo_i] = layer
layers_sequence_lengths[topo_i] = layers_sequence_lengths[
graph.layers[topo_i].input[0]]
elif graph.layers[topo_i].graph_type == LayerType.output.value:
layers[topo_i] = layers[graph.layers[topo_i].input[0]]
if layers[topo_i].get_shape().as_list()[-1] != rnn_units * 1 * 2:
with tf.variable_scope('add_dense'):
layers[topo_i] = tf.layers.dense(
layers[topo_i], units=rnn_units*2)
return layers[2], layers[3]
## How to download data
1. Download "dev-v1.1.json" and "train-v1.1.json" from https://rajpurkar.github.io/SQuAD-explorer/
2. Download "glove.840B.300d.txt" from https://nlp.stanford.edu/projects/glove/
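A minimal download sketch in Python (the exact dataset URLs are assumptions; verify them on the pages above):

```python
import urllib.request

# Hypothetical direct links; check the SQuAD explorer page for the real ones.
base = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
for name in ('dev-v1.1.json', 'train-v1.1.json'):
    urllib.request.urlretrieve(base + name, name)
```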
## How to submit this job
1. Run "$NNI_ROOT_DIR/auto_run.py" as described in "$NNI_ROOT_DIR/README-AUTO.md".
2. Use the docker image openpai.azurecr.io/nni_v0.0.1, which ships a CPU version of TensorFlow.
3. This model does not need search_space.json.
tensorflow==1.4.0
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import tensorflow as tf
from tensorflow.python.ops.rnn_cell_impl import RNNCell
class GRU:
'''
Gated recurrent unit (GRU) implemented with explicit parameter matrices.
'''
def __init__(self, name, input_dim, hidden_dim):
self.name = '/'.join([name, 'gru'])
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.w_matrix = None
self.U = None
self.bias = None
def define_params(self):
'''
Define parameters.
'''
input_dim = self.input_dim
hidden_dim = self.hidden_dim
prefix = self.name
self.w_matrix = tf.Variable(tf.random_normal([input_dim, 3 * hidden_dim], stddev=0.1),
name='/'.join([prefix, 'W']))
self.U = tf.Variable(tf.random_normal([hidden_dim, 3 * hidden_dim], stddev=0.1),
name='/'.join([prefix, 'U']))
self.bias = tf.Variable(tf.random_normal([1, 3 * hidden_dim], stddev=0.1),
name='/'.join([prefix, 'b']))
return self
def build(self, x, h, mask=None):
'''
Apply a single GRU step to input x and previous state h.
'''
xw = tf.split(tf.matmul(x, self.w_matrix) + self.bias, 3, 1)
hu = tf.split(tf.matmul(h, self.U), 3, 1)
r = tf.sigmoid(xw[0] + hu[0])
z = tf.sigmoid(xw[1] + hu[1])
h1 = tf.tanh(xw[2] + r * hu[2])
next_h = h1 * (1 - z) + h * z
if mask is not None:
next_h = next_h * mask + h * (1 - mask)
return next_h
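# The step above implements the standard GRU equations (bias folded into xw):
# r = sigmoid(x W_r + h U_r), z = sigmoid(x W_z + h U_z),
# h_tilde = tanh(x W_h + r * (h U_h)), next_h = (1 - z) * h_tilde + z * h,
# with the optional mask holding the previous state at padded positions.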
def build_sequence(self, xs, masks, init, is_left_to_right):
'''
Unroll the GRU over a sequence, left-to-right or right-to-left.
'''
states = []
last = init
if is_left_to_right:
for i, xs_i in enumerate(xs):
h = self.build(xs_i, last, masks[i])
states.append(h)
last = h
else:
for i in range(len(xs) - 1, -1, -1):
h = self.build(xs[i], last, masks[i])
states.insert(0, h)
last = h
return states
class XGRUCell(RNNCell):
'''
GRU cell with explicit W/U/b parameters, usable with tf.nn.bidirectional_dynamic_rnn.
'''
def __init__(self, hidden_dim, reuse=None):
super(XGRUCell, self).__init__(_reuse=reuse)
self._num_units = hidden_dim
self._activation = tf.tanh
@property
def state_size(self):
return self._num_units
@property
def output_size(self):
return self._num_units
def call(self, inputs, state):
input_dim = inputs.get_shape()[-1]
assert input_dim is not None, "input dimension must be defined"
W = tf.get_variable(
name="W", shape=[input_dim, 3 * self._num_units], dtype=tf.float32)
U = tf.get_variable(
name='U', shape=[self._num_units, 3 * self._num_units], dtype=tf.float32)
b = tf.get_variable(
name='b', shape=[1, 3 * self._num_units], dtype=tf.float32)
xw = tf.split(tf.matmul(inputs, W) + b, 3, 1)
hu = tf.split(tf.matmul(state, U), 3, 1)
r = tf.sigmoid(xw[0] + hu[0])
z = tf.sigmoid(xw[1] + hu[1])
h1 = self._activation(xw[2] + r * hu[2])
next_h = h1 * (1 - z) + state * z
return next_h, next_h
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Train the network composed of RNN and attention layers.
'''
import tensorflow as tf
from attention import DotAttention
from rnn import XGRUCell
from util import dropout
from graph_to_tf import graph_to_network
class GAGConfig:
'''
Hyper-parameter configuration for the GAG model.
'''
def __init__(self):
self.batch_size = 128
self.dropout = 0.1
self.char_vcb_size = 1371
self.max_char_length = 20
self.char_embed_dim = 100
self.max_query_length = 40
self.max_passage_length = 800
self.att_is_vanilla = True
self.att_need_padding = False
self.att_is_id = False
self.ptr_dim = 70
self.learning_rate = 0.1
self.labelsmoothing = 0.1
self.num_heads = 1
self.rnn_units = 256
class GAG:
'''
Question-answering model whose encoder network is generated from a Graph description.
'''
def __init__(self, cfg, embed, graph):
self.cfg = cfg
self.embed = embed
self.graph = graph
self.query_word = None
self.query_mask = None
self.query_lengths = None
self.passage_word = None
self.passage_mask = None
self.passage_lengths = None
self.answer_begin = None
self.answer_end = None
self.query_char_ids = None
self.query_char_lengths = None
self.passage_char_ids = None
self.passage_char_lengths = None
self.passage_states = None
self.query_states = None
self.query_init = None
self.begin_prob = None
self.end_prob = None
self.loss = None
self.train_op = None
def build_net(self, is_training):
'''
Build the training or evaluation network.
'''
cfg = self.cfg
with tf.device('/cpu:0'):
word_embed = tf.get_variable(
name='word_embed', initializer=self.embed, dtype=tf.float32, trainable=False)
char_embed = tf.get_variable(name='char_embed',
shape=[cfg.char_vcb_size,
cfg.char_embed_dim],
dtype=tf.float32)
# [query_length, batch_size]
self.query_word = tf.placeholder(dtype=tf.int32,
shape=[None, None],
name='query_word')
self.query_mask = tf.placeholder(dtype=tf.float32,
shape=[None, None],
name='query_mask')
# [batch_size]
self.query_lengths = tf.placeholder(
dtype=tf.int32, shape=[None], name='query_lengths')
# [passage_length, batch_size]
self.passage_word = tf.placeholder(
dtype=tf.int32, shape=[None, None], name='passage_word')
self.passage_mask = tf.placeholder(
dtype=tf.float32, shape=[None, None], name='passage_mask')
# [batch_size]
self.passage_lengths = tf.placeholder(
dtype=tf.int32, shape=[None], name='passage_lengths')
if is_training:
self.answer_begin = tf.placeholder(
dtype=tf.int32, shape=[None], name='answer_begin')
self.answer_end = tf.placeholder(
dtype=tf.int32, shape=[None], name='answer_end')
self.query_char_ids = tf.placeholder(dtype=tf.int32,
shape=[
self.cfg.max_char_length, None, None],
name='query_char_ids')
# sequence_length, batch_size
self.query_char_lengths = tf.placeholder(
dtype=tf.int32, shape=[None, None], name='query_char_lengths')
self.passage_char_ids = tf.placeholder(dtype=tf.int32,
shape=[
self.cfg.max_char_length, None, None],
name='passage_char_ids')
# sequence_length, batch_size
self.passage_char_lengths = tf.placeholder(dtype=tf.int32,
shape=[None, None],
name='passage_char_lengths')
query_char_states = self.build_char_states(char_embed=char_embed,
is_training=is_training,
reuse=False,
char_ids=self.query_char_ids,
char_lengths=self.query_char_lengths)
passage_char_states = self.build_char_states(char_embed=char_embed,
is_training=is_training,
reuse=True,
char_ids=self.passage_char_ids,
char_lengths=self.passage_char_lengths)
with tf.variable_scope("encoding") as scope:
query_states = tf.concat([tf.nn.embedding_lookup(
word_embed, self.query_word), query_char_states], axis=2)
scope.reuse_variables()
passage_states = tf.concat([tf.nn.embedding_lookup(
word_embed, self.passage_word), passage_char_states], axis=2)
passage_states = tf.transpose(passage_states, perm=[1, 0, 2])
query_states = tf.transpose(query_states, perm=[1, 0, 2])
self.passage_states = passage_states
self.query_states = query_states
output, output2 = graph_to_network(passage_states, query_states,
self.passage_lengths, self.query_lengths,
self.graph, self.cfg.dropout,
is_training, num_heads=cfg.num_heads,
rnn_units=cfg.rnn_units)
passage_att_mask = self.passage_mask
batch_size_x = tf.shape(self.query_lengths)
answer_h = tf.zeros(
tf.concat([batch_size_x, tf.constant([cfg.ptr_dim], dtype=tf.int32)], axis=0))
answer_context = tf.reduce_mean(output2, axis=1)
query_init_w = tf.get_variable(
'query_init_w', shape=[output2.get_shape().as_list()[-1], cfg.ptr_dim])
self.query_init = query_init_w
answer_context = tf.matmul(answer_context, query_init_w)
output = tf.transpose(output, perm=[1, 0, 2])
with tf.variable_scope('answer_ptr_layer'):
ptr_att = DotAttention('ptr',
hidden_dim=cfg.ptr_dim,
is_vanilla=self.cfg.att_is_vanilla,
is_identity_transform=self.cfg.att_is_id,
need_padding=self.cfg.att_need_padding)
answer_pre_compute = ptr_att.get_pre_compute(output)
ptr_gru = XGRUCell(hidden_dim=cfg.ptr_dim)
begin_prob, begin_logits = ptr_att.get_prob(output, answer_context, passage_att_mask,
answer_pre_compute, True)
att_state = ptr_att.get_att(output, begin_prob)
(_, answer_h) = ptr_gru.call(inputs=att_state, state=answer_h)
answer_context = answer_h
end_prob, end_logits = ptr_att.get_prob(output, answer_context,
passage_att_mask, answer_pre_compute,
True)
self.begin_prob = tf.transpose(begin_prob, perm=[1, 0])
self.end_prob = tf.transpose(end_prob, perm=[1, 0])
begin_logits = tf.transpose(begin_logits, perm=[1, 0])
end_logits = tf.transpose(end_logits, perm=[1, 0])
if is_training:
def label_smoothing(inputs, masks, epsilon=0.1):
epsilon = cfg.labelsmoothing
num_of_channel = tf.shape(inputs)[-1] # number of channels
inputs = tf.cast(inputs, tf.float32)
return (((1 - epsilon) * inputs) + (epsilon /
tf.cast(num_of_channel, tf.float32))) * masks
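# Worked example of the smoothing above (epsilon = 0.1, 4 classes):
# one-hot [1, 0, 0, 0] -> [0.925, 0.025, 0.025, 0.025], then masked.
# Note the epsilon argument is immediately overridden by cfg.labelsmoothing.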
cost1 = tf.reduce_mean(
tf.losses.softmax_cross_entropy(label_smoothing(
tf.one_hot(self.answer_begin,
depth=tf.shape(self.passage_word)[0]),
tf.transpose(self.passage_mask, perm=[1, 0])), begin_logits))
cost2 = tf.reduce_mean(
tf.losses.softmax_cross_entropy(
label_smoothing(tf.one_hot(self.answer_end,
depth=tf.shape(self.passage_word)[0]),
tf.transpose(self.passage_mask, perm=[1, 0])), end_logits))
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
l2_loss = tf.reduce_sum(reg_ws)
loss = cost1 + cost2 + l2_loss
self.loss = loss
optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate)
self.train_op = optimizer.minimize(self.loss)
return tf.stack([self.begin_prob, self.end_prob])
def build_char_states(self, char_embed, is_training, reuse, char_ids, char_lengths):
'''
Encode characters with a bidirectional GRU and return per-token character states.
'''
max_char_length = self.cfg.max_char_length
inputs = dropout(tf.nn.embedding_lookup(char_embed, char_ids),
self.cfg.dropout, is_training)
inputs = tf.reshape(
inputs, shape=[max_char_length, -1, self.cfg.char_embed_dim])
char_lengths = tf.reshape(char_lengths, shape=[-1])
with tf.variable_scope('char_encoding', reuse=reuse):
cell_fw = XGRUCell(hidden_dim=self.cfg.char_embed_dim)
cell_bw = XGRUCell(hidden_dim=self.cfg.char_embed_dim)
_, (left_right, right_left) = tf.nn.bidirectional_dynamic_rnn(
cell_fw=cell_fw,
cell_bw=cell_bw,
sequence_length=char_lengths,
inputs=inputs,
time_major=True,
dtype=tf.float32
)
left_right = tf.reshape(left_right, shape=[-1, self.cfg.char_embed_dim])
right_left = tf.reshape(right_left, shape=[-1, self.cfg.char_embed_dim])
states = tf.concat([left_right, right_left], axis=1)
out_shape = tf.shape(char_ids)[1:3]
out_shape = tf.concat([out_shape, tf.constant(
value=[self.cfg.char_embed_dim * 2], dtype=tf.int32)], axis=0)
return tf.reshape(states, shape=out_shape)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
import logging
logger = logging.getLogger('ga_squad')
try:
import argparse
import heapq
import json
import numpy as np
import pickle
import graph
from util import Timer
import nni
import data
import evaluate
from train_model import *
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
except Exception:
logger.exception('Catch exception in trial.py.')
raise
def get_config():
'''
Get config from the argument parser.
'''
parser = argparse.ArgumentParser(
description='This program is using genetic algorithm to search architecture for SQuAD.')
parser.add_argument('--input_file', type=str,
default='./dev-v1.1.json', help='input file')
parser.add_argument('--dev_file', type=str,
default='./dev-v1.1.json', help='dev file')
parser.add_argument('--embedding_file', type=str,
default='./glove.840B.300d.txt', help='embedding file')
parser.add_argument('--root_path', default='./data/',
type=str, help='Root path of models')
parser.add_argument('--batch_size', type=int, default=2, help='batch size')
parser.add_argument('--save_path', type=str,
default='./save', help='save path dir')
parser.add_argument('--learning_rate', type=float, default=0.0001,
help='learning rate; set to half the original value when reloading data and retraining')
parser.add_argument('--max_epoch', type=int, default=30)
parser.add_argument('--dropout_rate', type=float,
default=0.1, help='dropout_rate')
parser.add_argument('--labelsmoothing', type=float,
default=0.1, help='labelsmoothing')
parser.add_argument('--num_heads', type=int, default=1, help='num_heads')
parser.add_argument('--rnn_units', type=int, default=256, help='rnn_units')
args = parser.parse_args()
return args
def get_id(word_dict, word):
'''
Return word id.
'''
if word in word_dict:
return word_dict[word]
return word_dict['<unk>']
def load_embedding(path):
'''
Return the embedding dict loaded from the given file path.
'''
embedding_dict = {}
with open(path, 'r', encoding='utf-8') as file:
pairs = [line.strip('\r\n').split() for line in file.readlines()]
for pair in pairs:
embedding_dict[pair[0]] = [float(x) for x in pair[1:]]
logger.debug('embedding_dict size: %d', len(embedding_dict))
return embedding_dict
class MaxQueue:
'''
Bounded priority queue that keeps the `capacity` largest items pushed.
'''
def __init__(self, capacity):
assert capacity > 0, 'queue size must be larger than 0'
self._capacity = capacity
self._entries = []
@property
def entries(self):
return self._entries
@property
def capacity(self):
return self._capacity
@property
def size(self):
return len(self._entries)
def clear(self):
self._entries = []
def push(self, item):
if self.size < self.capacity:
heapq.heappush(self.entries, item)
else:
heapq.heappushpop(self.entries, item)
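# Illustration of the queue semantics (assumed usage): with capacity 3, pushing
# 5, 1, 4, 2, 3 leaves the three largest items {3, 4, 5} in `entries`
# (heap order, smallest first).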
def find_best_answer_span(left_prob, right_prob, passage_length, max_answer_length):
'''
Return the highest-probability (prob, begin, end) answer span.
'''
left = 0
right = 0
max_prob = left_prob[0] * right_prob[0]
for i in range(0, passage_length):
left_p = left_prob[i]
for j in range(i, min(i + max_answer_length, passage_length)):
total_prob = left_p * right_prob[j]
if max_prob < total_prob:
left, right, max_prob = i, j, total_prob
return [(max_prob, left, right)]
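# The scan above maximizes left_prob[i] * right_prob[j] over spans with
# i <= j < i + max_answer_length, i.e. answers of at most max_answer_length tokens.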
def write_prediction(path, position1_result, position2_result):
import codecs
with codecs.open(path, 'w', encoding='utf8') as file:
batch_num = len(position1_result)
for i in range(batch_num):
position1_batch = position1_result[i]
position2_batch = position2_result[i]
for j in range(position1_batch.shape[0]):
file.write(str(position1_batch[j]) +
'\t' + str(position2_batch[j]) + '\n')
def find_kbest_answer_span(k, left_prob, right_prob, passage_length, max_answer_length):
if k == 1:
return find_best_answer_span(left_prob, right_prob, passage_length, max_answer_length)
queue = MaxQueue(k)
for i in range(0, passage_length):
left_p = left_prob[i]
for j in range(i, min(i + max_answer_length, passage_length)):
total_prob = left_p * right_prob[j]
queue.push((total_prob, i, j))
return list(sorted(queue.entries, key=lambda x: -x[0]))
def run_epoch(batches, answer_net, is_training):
'''
Run one epoch of training or evaluation over the given batches.
'''
if not is_training:
position1_result = []
position2_result = []
contexts = []
ids = []
loss_sum = 0
timer = Timer()
count = 0
for batch in batches:
used = timer.get_elapsed(False)
count += 1
qps = batch['qp_pairs']
question_tokens = [qp['question_tokens'] for qp in qps]
passage_tokens = [qp['passage_tokens'] for qp in qps]
context = [(qp['passage'], qp['passage_tokens']) for qp in qps]
sample_id = [qp['id'] for qp in qps]
_, query, query_mask, query_lengths = data.get_word_input(
data=question_tokens, word_dict=word_vcb, embed=embed, embed_dim=cfg.word_embed_dim)
_, passage, passage_mask, passage_lengths = data.get_word_input(
data=passage_tokens, word_dict=word_vcb, embed=embed, embed_dim=cfg.word_embed_dim)
query_char, query_char_lengths = data.get_char_input(
data=question_tokens, char_dict=char_vcb, max_char_length=cfg.max_char_length)
passage_char, passage_char_lengths = data.get_char_input(
data=passage_tokens, char_dict=char_vcb, max_char_length=cfg.max_char_length)
if is_training:
answer_begin, answer_end = data.get_answer_begin_end(qps)
if is_training:
feed_dict = {answer_net.query_word: query,
answer_net.query_mask: query_mask,
answer_net.query_lengths: query_lengths,
answer_net.passage_word: passage,
answer_net.passage_mask: passage_mask,
answer_net.passage_lengths: passage_lengths,
answer_net.query_char_ids: query_char,
answer_net.query_char_lengths: query_char_lengths,
answer_net.passage_char_ids: passage_char,
answer_net.passage_char_lengths: passage_char_lengths,
answer_net.answer_begin: answer_begin,
answer_net.answer_end: answer_end}
loss, _, = sess.run(
[answer_net.loss, answer_net.train_op], feed_dict=feed_dict)
if count % 100 == 0:
logger.debug('%d %g expect:%g, loss:%g' %
(count, used, used / count * len(batches), loss))
loss_sum += loss
else:
feed_dict = {answer_net.query_word: query,
answer_net.query_mask: query_mask,
answer_net.query_lengths: query_lengths,
answer_net.passage_word: passage,
answer_net.passage_mask: passage_mask,
answer_net.passage_lengths: passage_lengths,
answer_net.query_char_ids: query_char,
answer_net.query_char_lengths: query_char_lengths,
answer_net.passage_char_ids: passage_char,
answer_net.passage_char_lengths: passage_char_lengths}
position1, position2 = sess.run(
[answer_net.begin_prob, answer_net.end_prob], feed_dict=feed_dict)
position1_result += position1.tolist()
position2_result += position2.tolist()
contexts += context
ids = np.concatenate((ids, sample_id))
if count % 100 == 0:
logger.debug('%d %g expect:%g' %
(count, used, used / count * len(batches)))
# NOTE: evaluation stops after 100 batches, apparently a dogfood shortcut.
if count % 100 == 0:
break
loss = loss_sum / len(batches)
if is_training:
return loss
return loss, position1_result, position2_result, ids, contexts
def generate_predict_json(position1_result, position2_result, ids, passage_tokens):
'''
Generate the answer JSON from span predictions.
'''
predict_len = len(position1_result)
logger.debug('total prediction num is %s', str(predict_len))
answers = {}
for i in range(predict_len):
sample_id = ids[i]
passage, tokens = passage_tokens[i]
kbest = find_best_answer_span(
position1_result[i], position2_result[i], len(tokens), 23)
_, start, end = kbest[0]
answer = passage[tokens[start]['char_begin']:tokens[end]['char_end']]
answers[sample_id] = answer
logger.debug('generate predict done.')
return answers
def generate_data(path, tokenizer, char_vcb, word_vcb, is_training=False):
'''
Generate question-passage pairs and build the character/word vocabularies.
'''
global root_path
qp_pairs = data.load_from_file(path=path, is_training=is_training)
tokenized_sent = 0
# qp_pairs = qp_pairs[:1000]
for qp_pair in qp_pairs:
tokenized_sent += 1
data.tokenize(qp_pair, tokenizer, is_training)
for word in qp_pair['question_tokens']:
word_vcb.add(word['word'])
for char in word['word']:
char_vcb.add(char)
for word in qp_pair['passage_tokens']:
word_vcb.add(word['word'])
for char in word['word']:
char_vcb.add(char)
max_query_length = max(len(x['question_tokens']) for x in qp_pairs)
max_passage_length = max(len(x['passage_tokens']) for x in qp_pairs)
#min_passage_length = min(len(x['passage_tokens']) for x in qp_pairs)
cfg.max_query_length = max_query_length
cfg.max_passage_length = max_passage_length
return qp_pairs
def train_with_graph(graph, qp_pairs, dev_qp_pairs):
'''
Train a network from a specific graph.
'''
global sess
with tf.Graph().as_default():
train_model = GAG(cfg, embed, graph)
train_model.build_net(is_training=True)
tf.get_variable_scope().reuse_variables()
dev_model = GAG(cfg, embed, graph)
dev_model.build_net(is_training=False)
with tf.Session() as sess:
logger.debug('init variables')
init = tf.global_variables_initializer()
sess.run(init)
# writer = tf.summary.FileWriter('%s/graph/'%execution_path, sess.graph)
logger.debug('assign to graph')
saver = tf.train.Saver()
train_loss = None
bestacc = 0
patience = 5
patience_increase = 2
improvement_threshold = 0.995
for epoch in range(max_epoch):
logger.debug('begin to train')
train_batches = data.get_batches(qp_pairs, cfg.batch_size)
train_loss = run_epoch(train_batches, train_model, True)
logger.debug('epoch ' + str(epoch) +
' loss: ' + str(train_loss))
dev_batches = list(data.get_batches(
dev_qp_pairs, cfg.batch_size))
_, position1, position2, ids, contexts = run_epoch(
dev_batches, dev_model, False)
answers = generate_predict_json(
position1, position2, ids, contexts)
if save_path is not None:
with open(save_path + 'epoch%d.prediction' % epoch, 'w') as file:
json.dump(answers, file)
else:
answers = json.dumps(answers)
answers = json.loads(answers)
iter = epoch + 1
acc = evaluate.evaluate_with_predictions(
args.dev_file, answers)
logger.debug('Send intermediate acc: %s', str(acc))
nni.report_intermediate_result(acc)
logger.debug('Send intermediate result done.')
if acc > bestacc:
if acc * improvement_threshold > bestacc:
patience = max(patience, iter * patience_increase)
bestacc = acc
if save_path is not None:
saver.save(sess, save_path + 'epoch%d.model' % epoch)
with open(save_path + 'epoch%d.score' % epoch, 'wb') as file:
pickle.dump(
(position1, position2, ids, contexts), file)
logger.debug('epoch %d acc %g bestacc %g' %
(epoch, acc, bestacc))
if patience <= iter:
break
logger.debug('save done.')
return train_loss, bestacc
embed = None
char_vcb = None
tokenizer = None
word_vcb = None
def load_data():
'''
Load and tokenize train/dev data, build vocabularies and the embedding table.
'''
global embed, char_vcb, tokenizer, word_vcb
logger.debug('tokenize data')
tokenizer = data.WhitespaceTokenizer()
char_set = set()
word_set = set()
logger.debug('generate train data')
qp_pairs = generate_data(input_file, tokenizer,
char_set, word_set, is_training=True)
logger.debug('generate dev data')
dev_qp_pairs = generate_data(
dev_file, tokenizer, char_set, word_set, is_training=False)
logger.debug('generate data done.')
char_vcb = {char: sample_id for sample_id, char in enumerate(char_set)}
word_vcb = {word: sample_id for sample_id, word in enumerate(word_set)}
timer.start()
logger.debug('read embedding table')
cfg.word_embed_dim = 300
embed = np.zeros((len(word_vcb), cfg.word_embed_dim), dtype=np.float32)
embedding = load_embedding(args.embedding_file)
for word, sample_id in word_vcb.items():
if word in embedding:
embed[sample_id] = embedding[word]
# add UNK into dict
unk = np.zeros((1, cfg.word_embed_dim), dtype=np.float32)
embed = np.concatenate((unk, embed), axis=0)
word_vcb = {key: value + 1 for key, value in word_vcb.items()}
return qp_pairs, dev_qp_pairs
if __name__ == '__main__':
try:
args = get_config()
root_path = os.path.expanduser(args.root_path)
input_file = os.path.expanduser(args.input_file)
dev_file = os.path.expanduser(args.dev_file)
save_path = None
max_epoch = args.max_epoch
cfg = GAGConfig()
cfg.batch_size = args.batch_size
cfg.learning_rate = float(args.learning_rate)
cfg.dropout = args.dropout_rate
cfg.rnn_units = args.rnn_units
cfg.labelsmoothing = args.labelsmoothing
cfg.num_heads = args.num_heads
timer = Timer()
qp_pairs, dev_qp_pairs = load_data()
logger.debug('Init finish.')
original_params = nni.get_parameters()
'''
with open('data.json') as f:
original_params = json.load(f)
'''
try:
graph = graph.graph_loads(original_params)
except Exception:
logger.exception('Failed to load graph from received parameters.')
raise
train_loss, best_acc = train_with_graph(graph, qp_pairs, dev_qp_pairs)
logger.debug('Send best acc: %s', str(best_acc))
nni.report_final_result(best_acc)
logger.debug('Send final result done')
except:
logger.exception('Catch exception in trial.py.')
raise
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Util Module
'''
import time
import tensorflow as tf
def shape(tensor):
'''
Get shape of variable.
Return type is tuple.
'''
temp_s = tensor.get_shape()
return tuple([temp_s[i].value for i in range(0, len(temp_s))])
def get_variable(name, temp_s):
'''
Create a zero-initialized variable with the given name and shape.
'''
return tf.Variable(tf.zeros(temp_s), name=name)
def dropout(tensor, drop_prob, is_training):
'''
Apply dropout during training; return the tensor unchanged otherwise.
'''
if not is_training:
return tensor
return tf.nn.dropout(tensor, 1.0 - drop_prob)
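# Note: tf.nn.dropout takes a keep probability, so drop_prob = 0.1 keeps each
# activation with probability 0.9 (scaled by 1/0.9) during training.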
class Timer:
'''
Timer for measuring elapsed wall-clock time.
'''
def __init__(self):
self.__start = time.time()
def start(self):
'''
Start (or restart) the timer.
'''
self.__start = time.time()
def get_elapsed(self, restart=True):
'''
Return seconds elapsed since start; restart the timer unless restart is False.
'''
end = time.time()
span = end - self.__start
if restart:
self.__start = end
return span
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
#choice: TPE, Random, Anneal, Evolution
tunerName: TPE
#choice: Maximize, Minimize
optimizationMode: Maximize
trial:
trialCommand: python3 mnist.py
trialCodeDir: /usr/share/nni/examples/trials/mnist-annotation
trialGpuNum: 0
"""A deep MNIST classifier using convolutional layers."""
import logging
import math
import tempfile
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
FLAGS = None
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
'''
MnistNetwork is for initializing and building the basic network for mnist.
'''
def __init__(self,
channel_1_num,
channel_2_num,
conv_size,
hidden_size,
pool_size,
learning_rate,
x_dim=784,
y_dim=10):
self.channel_1_num = channel_1_num
self.channel_2_num = channel_2_num
"""@nni.variable(nni.choice(2, 3, 5, 7),name=self.conv_size)"""
self.conv_size = conv_size
"""@nni.variable(nni.choice(124, 512, 1024), name=self.hidden_size)"""
self.hidden_size = hidden_size
self.pool_size = pool_size
"""@nni.variable(nni.uniform(0.0001, 0.1), name=self.learning_rate)"""
self.learning_rate = learning_rate
self.x_dim = x_dim
self.y_dim = y_dim
self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.train_step = None
self.accuracy = None
def build_network(self):
'''
Build the network for mnist.
'''
# Reshape to use within a convolutional neural net.
# Last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
with tf.name_scope('reshape'):
try:
input_dim = int(math.sqrt(self.x_dim))
except Exception:
print(
'input dim cannot be sqrt and reshape. input dim: ' + str(self.x_dim))
logger.debug(
'input dim cannot be sqrt and reshape. input dim: %s', str(self.x_dim))
raise
x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])
# First convolutional layer - maps one grayscale image to 32 feature maps.
with tf.name_scope('conv1'):
w_conv1 = weight_variable(
[self.conv_size, self.conv_size, 1, self.channel_1_num])
b_conv1 = bias_variable([self.channel_1_num])
"""@nni.function_choice(tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1), tf.nn.sigmoid(conv2d(x_image, w_conv1) + b_conv1), tf.nn.tanh(conv2d(x_image, w_conv1) + b_conv1), name=tf.nn.relu)"""
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
"""@nni.function_choice(max_pool(h_conv1, self.pool_size), avg_pool(h_conv1, self.pool_size), name=max_pool)"""
h_pool1 = max_pool(h_conv1, self.pool_size)
# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
w_conv2 = weight_variable([self.conv_size, self.conv_size,
self.channel_1_num, self.channel_2_num])
b_conv2 = bias_variable([self.channel_2_num])
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
# Second pooling layer.
with tf.name_scope('pool2'):
h_pool2 = max_pool(h_conv2, self.pool_size)
# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
last_dim = int(input_dim / (self.pool_size * self.pool_size))
with tf.name_scope('fc1'):
w_fc1 = weight_variable(
[last_dim * last_dim * self.channel_2_num, self.hidden_size])
b_fc1 = bias_variable([self.hidden_size])
h_pool2_flat = tf.reshape(
h_pool2, [-1, last_dim * last_dim * self.channel_2_num])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
# Dropout - controls the complexity of the model, prevents co-adaptation of features.
with tf.name_scope('dropout'):
h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)
# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
w_fc2 = weight_variable([self.hidden_size, self.y_dim])
b_fc2 = bias_variable([self.y_dim])
y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
with tf.name_scope('loss'):
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
with tf.name_scope('adam_optimizer'):
self.train_step = tf.train.AdamOptimizer(
self.learning_rate).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(
tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
self.accuracy = tf.reduce_mean(
tf.cast(correct_prediction, tf.float32))
def conv2d(x_input, w_matrix):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x_input, w_matrix, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(x_input, pool_size):
"""max_pool downsamples a feature map by 2X."""
return tf.nn.max_pool(x_input, ksize=[1, pool_size, pool_size, 1],
strides=[1, pool_size, pool_size, 1], padding='SAME')
def avg_pool(x_input, pool_size):
return tf.nn.avg_pool(x_input, ksize=[1, pool_size, pool_size, 1],
strides=[1, pool_size, pool_size, 1], padding='SAME')
def weight_variable(shape):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def main(params):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
print('Mnist download data done.')
logger.debug('Mnist download data done.')
# Create the model
# Build the graph for the deep net
mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
channel_2_num=params['channel_2_num'],
conv_size=params['conv_size'],
hidden_size=params['hidden_size'],
pool_size=params['pool_size'],
learning_rate=params['learning_rate'])
mnist_network.build_network()
    logger.debug('MNIST network built.')
# Write log
graph_location = tempfile.mkdtemp()
logger.debug('Saving graph to: %s', graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())
test_acc = 0.0
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
"""@nni.variable(nni.choice(50, 250, 500), name=batch_num)"""
batch_num = params['batch_num']
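        # NOTE: batch_num is used both as the number of training iterations
        # and as the batch size passed to next_batch() below.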
for i in range(batch_num):
batch = mnist.train.next_batch(batch_num)
"""@nni.variable(nni.choice(1, 5), name=dropout_rate)"""
dropout_rate = params['dropout_rate']
mnist_network.train_step.run(feed_dict={mnist_network.images: batch[0],
mnist_network.labels: batch[1],
mnist_network.keep_prob: dropout_rate}
)
if i % 100 == 0:
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
"""@nni.report_intermediate_result(test_acc)"""
logger.debug('test accuracy %g', test_acc)
logger.debug('Pipe send intermediate result done.')
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
"""@nni.report_final_result(test_acc)"""
logger.debug('Final result is %g', test_acc)
logger.debug('Send final result done.')
def generate_default_params():
'''
Generate default parameters for mnist network.
'''
params = {
'data_dir': '/tmp/tensorflow/mnist/input_data',
'dropout_rate': 0.5,
'channel_1_num': 32,
'channel_2_num': 64,
'conv_size': 5,
'pool_size': 2,
'hidden_size': 1024,
'learning_rate': 1e-4,
'batch_num': 200}
return params
if __name__ == '__main__':
try:
        main(generate_default_params())
except Exception as exception:
logger.exception(exception)
raise
authorName: default
experimentName: example_mnist-keras
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: /usr/share/nni/examples/trials/mnist-keras/search_space.json
#choice: true, false
useAnnotation: false
tuner:
#choice: TPE, Random, Anneal, Evolution
tunerName: TPE
#choice: Maximize, Minimize
optimizationMode: Maximize
trial:
trialCommand: python3 mnist-keras.py
trialCodeDir: /usr/share/nni/examples/trials/mnist-keras
trialGpuNum: 0
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
import logging
import os
import keras
import numpy as np
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.datasets import mnist
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from keras.models import Sequential
import nni
LOG = logging.getLogger('mnist_keras')
K.set_image_data_format('channels_last')
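# NNI sets the NNI_OUTPUT_DIR environment variable for every trial;
# the trial's TensorBoard logs are written there.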
TENSORBOARD_DIR = os.environ['NNI_OUTPUT_DIR']
H, W = 28, 28
NUM_CLASSES = 10
def create_mnist_model(hyper_params, input_shape=(H, W, 1), num_classes=NUM_CLASSES):
'''
Create simple convolutional model
'''
layers = [
Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
Conv2D(64, (3, 3), activation='relu'),
MaxPooling2D(pool_size=(2, 2)),
Flatten(),
Dense(100, activation='relu'),
Dense(num_classes, activation='softmax')
]
model = Sequential(layers)
if hyper_params['optimizer'] == 'Adam':
optimizer = keras.optimizers.Adam(lr=hyper_params['learning_rate'])
else:
optimizer = keras.optimizers.SGD(lr=hyper_params['learning_rate'], momentum=0.9)
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=optimizer, metrics=['accuracy'])
return model
def load_mnist_data(args):
'''
Load MNIST dataset
'''
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (np.expand_dims(x_train, -1).astype(np.float) / 255.)[:args.num_train]
x_test = (np.expand_dims(x_test, -1).astype(np.float) / 255.)[:args.num_test]
y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)[:args.num_train]
y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)[:args.num_test]
LOG.debug('x_train shape: %s', (x_train.shape,))
LOG.debug('x_test shape: %s', (x_test.shape,))
return x_train, y_train, x_test, y_test
class SendMetrics(keras.callbacks.Callback):
'''
Keras callback to send metrics to NNI framework
'''
    def on_epoch_end(self, epoch, logs=None):
        '''
        Run at the end of each epoch
        '''
        logs = logs or {}
        LOG.debug(logs)
        nni.report_intermediate_result(logs)
def train(args, params):
'''
Train model
'''
x_train, y_train, x_test, y_test = load_mnist_data(args)
model = create_mnist_model(params)
model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs, verbose=1,
validation_data=(x_test, y_test), callbacks=[SendMetrics(), TensorBoard(log_dir=TENSORBOARD_DIR)])
_, acc = model.evaluate(x_test, y_test, verbose=0)
    LOG.debug('Final result is: %g', acc)
nni.report_final_result(acc)
def generate_default_params():
'''
Generate default hyper parameters
'''
return {
'optimizer': 'Adam',
'learning_rate': 0.001
}
if __name__ == '__main__':
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--batch_size", type=int, default=200, help="batch size", required=False)
PARSER.add_argument("--epochs", type=int, default=10, help="Train epochs", required=False)
PARSER.add_argument("--num_train", type=int, default=60000, help="Number of train samples to be used, maximum 60000", required=False)
PARSER.add_argument("--num_test", type=int, default=10000, help="Number of test samples to be used, maximum 10000", required=False)
ARGS, UNKNOWN = PARSER.parse_known_args()
try:
# get parameters from tuner
RECEIVED_PARAMS = nni.get_parameters()
LOG.debug(RECEIVED_PARAMS)
PARAMS = generate_default_params()
PARAMS.update(RECEIVED_PARAMS)
# train
train(ARGS, PARAMS)
except Exception as e:
LOG.exception(e)
raise
{
"optimizer":{"_type":"choice","_value":["Adam", "SGD"]},
"learning_rate":{"_type":"choice","_value":[0.0001, 0.001, 0.002, 0.005, 0.01]}
}
authorName: default
experimentName: example_mnist-smartparam
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
#choice: TPE, Random, Anneal, Evolution
tunerName: TPE
#choice: Maximize, Minimize
optimizationMode: Maximize
trial:
trialCommand: python3 mnist.py
trialCodeDir: /usr/share/nni/examples/trials/mnist-smartparam
trialGpuNum: 0
"""A deep MNIST classifier using convolutional layers."""
import logging
import math
import tempfile
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import nni
FLAGS = None
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
'''
    MnistNetwork initializes and builds the basic network for MNIST.
'''
def __init__(self,
channel_1_num,
channel_2_num,
pool_size,
x_dim=784,
y_dim=10):
self.channel_1_num = channel_1_num
self.channel_2_num = channel_2_num
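        # Smart parameters: with useAnnotation enabled, NNI replaces these
        # nni.choice / nni.uniform calls with values sampled by the tuner.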
self.conv_size = nni.choice(2, 3, 5, 7, name='conv-size')
self.hidden_size = nni.choice(124, 512, 1024) # example: without name
self.pool_size = pool_size
self.learning_rate = nni.uniform(0.0001, 0.1, name='learning_rate')
self.x_dim = x_dim
self.y_dim = y_dim
self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.train_step = None
self.accuracy = None
def build_network(self):
'''
        Build the network for MNIST
'''
# Reshape to use within a convolutional neural net.
# Last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
with tf.name_scope('reshape'):
try:
input_dim = int(math.sqrt(self.x_dim))
except:
                print(
                    'input dim cannot be square-rooted for reshape. input dim: ' + str(self.x_dim))
                logger.debug(
                    'input dim cannot be square-rooted for reshape. input dim: %s', str(self.x_dim))
raise
x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])
# First convolutional layer - maps one grayscale image to 32 feature maps.
with tf.name_scope('conv1'):
w_conv1 = weight_variable(
[self.conv_size, self.conv_size, 1, self.channel_1_num])
b_conv1 = bias_variable([self.channel_1_num])
h_conv1 = nni.function_choice(
lambda: tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1),
lambda: tf.nn.sigmoid(conv2d(x_image, w_conv1) + b_conv1),
lambda: tf.nn.tanh(conv2d(x_image, w_conv1) + b_conv1)
) # example: without name
# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
            h_pool1 = nni.function_choice(
                lambda: max_pool(h_conv1, self.pool_size),
                lambda: avg_pool(h_conv1, self.pool_size),
                name='h_pool1')
# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
w_conv2 = weight_variable([self.conv_size, self.conv_size,
self.channel_1_num, self.channel_2_num])
b_conv2 = bias_variable([self.channel_2_num])
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
# Second pooling layer.
with tf.name_scope('pool2'): # example: another style
h_pool2 = max_pool(h_conv2, self.pool_size)
        # Fully connected layer 1 -- after 2 rounds of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
last_dim = int(input_dim / (self.pool_size * self.pool_size))
with tf.name_scope('fc1'):
w_fc1 = weight_variable(
[last_dim * last_dim * self.channel_2_num, self.hidden_size])
b_fc1 = bias_variable([self.hidden_size])
h_pool2_flat = tf.reshape(
h_pool2, [-1, last_dim * last_dim * self.channel_2_num])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
# Dropout - controls the complexity of the model, prevents co-adaptation of features.
with tf.name_scope('dropout'):
h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)
# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
w_fc2 = weight_variable([self.hidden_size, self.y_dim])
b_fc2 = bias_variable([self.y_dim])
y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
with tf.name_scope('loss'):
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
with tf.name_scope('adam_optimizer'):
self.train_step = tf.train.AdamOptimizer(
self.learning_rate).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(
tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
self.accuracy = tf.reduce_mean(
tf.cast(correct_prediction, tf.float32))
def conv2d(x_input, w_matrix):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x_input, w_matrix, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(x_input, pool_size):
"""max_pool downsamples a feature map by 2X."""
return tf.nn.max_pool(x_input, ksize=[1, pool_size, pool_size, 1],
strides=[1, pool_size, pool_size, 1], padding='SAME')
def avg_pool(x_input, pool_size):
return tf.nn.avg_pool(x_input, ksize=[1, pool_size, pool_size, 1],
strides=[1, pool_size, pool_size, 1], padding='SAME')
def weight_variable(shape):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def main(params):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
    print('MNIST data download done.')
    logger.debug('MNIST data download done.')
# Create the model
# Build the graph for the deep net
mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
channel_2_num=params['channel_2_num'],
pool_size=params['pool_size'])
mnist_network.build_network()
    logger.debug('MNIST network built.')
# Write log
graph_location = tempfile.mkdtemp()
logger.debug('Saving graph to: %s', graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())
test_acc = 0.0
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
batch_num = nni.choice(50, 250, 500, name='batch_num')
for i in range(batch_num):
batch = mnist.train.next_batch(batch_num)
            dropout_rate = nni.choice(0.5, 1.0, name='dropout_rate')  # keep probability, must lie in (0, 1]
mnist_network.train_step.run(feed_dict={mnist_network.images: batch[0],
mnist_network.labels: batch[1],
mnist_network.keep_prob: dropout_rate}
)
if i % 100 == 0:
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
nni.report_intermediate_result(test_acc)
logger.debug('test accuracy %g', test_acc)
logger.debug('Pipe send intermediate result done.')
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
nni.report_final_result(test_acc)
logger.debug('Final result is %g', test_acc)
logger.debug('Send final result done.')
def generate_default_params():
'''
Generate default parameters for mnist network.
'''
params = {
'data_dir': '/tmp/tensorflow/mnist/input_data',
'channel_1_num': 32,
'channel_2_num': 64,
'pool_size': 2}
return params
if __name__ == '__main__':
try:
        main(generate_default_params())
except Exception as exception:
logger.exception(exception)
raise
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: /usr/share/nni/examples/trials/mnist/search_space.json
#choice: true, false
useAnnotation: false
tuner:
#choice: TPE, Random, Anneal, Evolution
tunerName: TPE
#choice: Maximize, Minimize
optimizationMode: Maximize
trial:
trialCommand: python3 mnist.py
trialCodeDir: /usr/share/nni/examples/trials/mnist
trialGpuNum: 0
"""A deep MNIST classifier using convolutional layers."""
import logging
import math
import tempfile
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import nni
FLAGS = None
logger = logging.getLogger('mnist_AutoML')
class MnistNetwork(object):
'''
    MnistNetwork initializes and builds the basic network for MNIST.
'''
def __init__(self,
channel_1_num,
channel_2_num,
conv_size,
hidden_size,
pool_size,
learning_rate,
x_dim=784,
y_dim=10):
self.channel_1_num = channel_1_num
self.channel_2_num = channel_2_num
self.conv_size = conv_size
self.hidden_size = hidden_size
self.pool_size = pool_size
self.learning_rate = learning_rate
self.x_dim = x_dim
self.y_dim = y_dim
self.images = tf.placeholder(tf.float32, [None, self.x_dim], name='input_x')
self.labels = tf.placeholder(tf.float32, [None, self.y_dim], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.train_step = None
self.accuracy = None
def build_network(self):
'''
        Build the network for MNIST
'''
# Reshape to use within a convolutional neural net.
# Last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
with tf.name_scope('reshape'):
try:
input_dim = int(math.sqrt(self.x_dim))
except:
                print(
                    'input dim cannot be square-rooted for reshape. input dim: ' + str(self.x_dim))
                logger.debug(
                    'input dim cannot be square-rooted for reshape. input dim: %s', str(self.x_dim))
raise
x_image = tf.reshape(self.images, [-1, input_dim, input_dim, 1])
# First convolutional layer - maps one grayscale image to 32 feature maps.
with tf.name_scope('conv1'):
w_conv1 = weight_variable(
[self.conv_size, self.conv_size, 1, self.channel_1_num])
b_conv1 = bias_variable([self.channel_1_num])
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
# Pooling layer - downsamples by 2X.
with tf.name_scope('pool1'):
h_pool1 = max_pool(h_conv1, self.pool_size)
# Second convolutional layer -- maps 32 feature maps to 64.
with tf.name_scope('conv2'):
w_conv2 = weight_variable([self.conv_size, self.conv_size,
self.channel_1_num, self.channel_2_num])
b_conv2 = bias_variable([self.channel_2_num])
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
# Second pooling layer.
with tf.name_scope('pool2'):
h_pool2 = max_pool(h_conv2, self.pool_size)
        # Fully connected layer 1 -- after 2 rounds of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
last_dim = int(input_dim / (self.pool_size * self.pool_size))
with tf.name_scope('fc1'):
w_fc1 = weight_variable(
[last_dim * last_dim * self.channel_2_num, self.hidden_size])
b_fc1 = bias_variable([self.hidden_size])
h_pool2_flat = tf.reshape(
h_pool2, [-1, last_dim * last_dim * self.channel_2_num])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
# Dropout - controls the complexity of the model, prevents co-adaptation of features.
with tf.name_scope('dropout'):
h_fc1_drop = tf.nn.dropout(h_fc1, self.keep_prob)
# Map the 1024 features to 10 classes, one for each digit
with tf.name_scope('fc2'):
w_fc2 = weight_variable([self.hidden_size, self.y_dim])
b_fc2 = bias_variable([self.y_dim])
y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
with tf.name_scope('loss'):
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=y_conv))
with tf.name_scope('adam_optimizer'):
self.train_step = tf.train.AdamOptimizer(
self.learning_rate).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(
tf.argmax(y_conv, 1), tf.argmax(self.labels, 1))
self.accuracy = tf.reduce_mean(
tf.cast(correct_prediction, tf.float32))
def conv2d(x_input, w_matrix):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x_input, w_matrix, strides=[1, 1, 1, 1], padding='SAME')
def max_pool(x_input, pool_size):
"""max_pool downsamples a feature map by 2X."""
return tf.nn.max_pool(x_input, ksize=[1, pool_size, pool_size, 1],
strides=[1, pool_size, pool_size, 1], padding='SAME')
def weight_variable(shape):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def main(params):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
    print('MNIST data download done.')
    logger.debug('MNIST data download done.')
# Create the model
# Build the graph for the deep net
mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
channel_2_num=params['channel_2_num'],
conv_size=params['conv_size'],
hidden_size=params['hidden_size'],
pool_size=params['pool_size'],
learning_rate=params['learning_rate'])
mnist_network.build_network()
    logger.debug('MNIST network built.')
# Write log
graph_location = tempfile.mkdtemp()
logger.debug('Saving graph to: %s', graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())
test_acc = 0.0
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(params['batch_num']):
batch = mnist.train.next_batch(params['batch_num'])
mnist_network.train_step.run(feed_dict={mnist_network.images: batch[0],
mnist_network.labels: batch[1],
mnist_network.keep_prob: params['dropout_rate']}
)
if i % 100 == 0:
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
nni.report_intermediate_result(test_acc)
logger.debug('test accuracy %g', test_acc)
logger.debug('Pipe send intermediate result done.')
test_acc = mnist_network.accuracy.eval(
feed_dict={mnist_network.images: mnist.test.images,
mnist_network.labels: mnist.test.labels,
mnist_network.keep_prob: 1.0})
nni.report_final_result(test_acc)
logger.debug('Final result is %g', test_acc)
logger.debug('Send final result done.')
def generate_default_params():
'''
Generate default parameters for mnist network.
'''
params = {
'data_dir': '/tmp/tensorflow/mnist/input_data',
'dropout_rate': 0.5,
'channel_1_num': 32,
'channel_2_num': 64,
'conv_size': 5,
'pool_size': 2,
'hidden_size': 1024,
'learning_rate': 1e-4,
'batch_num': 200}
return params
if __name__ == '__main__':
try:
        # get parameters from tuner
RCV_PARAMS = nni.get_parameters()
logger.debug(RCV_PARAMS)
# run
params = generate_default_params()
params.update(RCV_PARAMS)
main(params)
except Exception as exception:
logger.exception(exception)
raise
{
"dropout_rate":{"_type":"uniform","_value":[0.1,0.5]},
"conv_size":{"_type":"choice","_value":[2,3,5,7]},
"hidden_size":{"_type":"choice","_value":[124, 512, 1024]},
"learning_rate":{"_type":"uniform","_value":[0.0001, 0.1]}
}
# Customized Tuner for Experts
*A Tuner receives results from Trials as a metric that evaluates the performance of a specific parameter/architecture configuration. The Tuner then sends the next hyper-parameter or architecture configuration to a Trial.*
So, to implement a customized Tuner, a user only needs to:
**1) Inherit from the base Tuner class**
```python
from nni.tuner import Tuner
class CustomizedTuner(Tuner):
def __init__(self, ...):
...
```
**2) Implement the receive_trial_result function**
```python
from nni.tuner import Tuner
class CustomizedTuner(Tuner):
def __init__(self, ...):
...
def receive_trial_result(self, parameter_id, parameters, reward):
'''
Record an observation of the objective function
'''
        # your code implements here.
...
```
**3) Implement the generate_parameters function**
```python
from nni.tuner import Tuner
class CustomizedTuner(Tuner):
def __init__(self, ...):
...
def receive_trial_result(self, parameter_id, parameters, reward):
'''
Record an observation of the objective function
parameter_id: int
parameters: object created by 'generate_parameters()'
reward: object reported by trial
'''
# your code implements here.
...
def generate_parameters(self, parameter_id):
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
'''
# your code implements here.
...
```
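Putting **2)** and **3)** together, a minimal random-search Tuner might look like the sketch below. This is only an illustration under assumed conventions; the two-parameter search space here is hypothetical, and a real Tuner would derive it from the experiment's search space file.
```python
import random
from nni.tuner import Tuner
class RandomSearchTuner(Tuner):
    '''
    Minimal sketch of a customized Tuner that samples parameters at random.
    '''
    def __init__(self):
        # hypothetical two-parameter search space, for illustration only
        self.space = {
            'optimizer': ['Adam', 'SGD'],
            'learning_rate': [0.0001, 0.001, 0.01]
        }
        self.history = {}
    def generate_parameters(self, parameter_id):
        # return a serializable object; the NNI SDK packages it as JSON
        return {key: random.choice(values) for key, values in self.space.items()}
    def receive_trial_result(self, parameter_id, parameters, reward):
        # record the observation; a smarter Tuner would use it to guide sampling
        self.history[parameter_id] = (parameters, reward)
```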
**4) Write a script to run the Tuner**
```python
import argparse
# assuming CustomizedTuner is defined in customized_tuner.py
from customized_tuner import CustomizedTuner
def main():
    parser = argparse.ArgumentParser(description='parse command line parameters.')
    # parse your tuner args here.
    ...
    FLAGS, unparsed = parser.parse_known_args()
    tuner = CustomizedTuner(...)
    tuner.run()
if __name__ == '__main__':
    main()
```
Please note for **2)** and **3)**: the parameter configuration returned by the ```generate_parameters``` function will be packaged as a JSON object by the NNI SDK, and the SDK will unpack that JSON object on the other side, so the Trial receives exactly the same configuration from the Tuner.
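For instance, with a hypothetical configuration, the round trip looks like this (using the standard ```json``` module only for illustration):
```python
import json
# a configuration that generate_parameters() might return
params = {'optimizer': 'Adam', 'learning_rate': 0.001}
# the NNI SDK packages it as a JSON object for transport...
packed = json.dumps(params)
# ...and unpacks it on the trial side, so the Trial receives
# exactly the same configuration
assert json.loads(packed) == params
```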
Users can override the ```run``` function in the ```CustomizedTuner``` class to control the process logic of the Tuner, such as how requests from Trials are handled.
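For example, assuming the base class's ```run``` implements the default request-handling loop, an override might wrap it with custom setup (the ```prepare``` hook below is hypothetical):
```python
from nni.tuner import Tuner
class CustomizedTuner(Tuner):
    ...
    def run(self):
        # hypothetical hook: prepare custom state before serving requests
        self.prepare()
        # then delegate to the SDK's default request-handling loop
        super().run()
```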
```receive_trial_result``` receives ```parameter_id```, ```parameters```, and ```reward``` as input. The ```reward``` object the Tuner receives is exactly the same reward that the Trial sent.
For more detailed examples, see:
> * [evolution-tuner](https://msrasrg.visualstudio.com/NeuralNetworkIntelligenceOpenSource/_git/Default?path=%2Fsrc%2Fsdk%2Fpynni%2Fnni%2Fevolution_tuner&version=GBmaster)
> * [hyperopt-tuner](https://msrasrg.visualstudio.com/NeuralNetworkIntelligenceOpenSource/_git/Default?path=%2Fsrc%2Fsdk%2Fpynni%2Fnni%2Fhyperopt_tuner&version=GBmaster)
> * [ga-customer-tuner](https://msrasrg.visualstudio.com/NeuralNetworkIntelligenceOpenSource/_git/Default?path=%2Fexamples%2Ftuners%2Fga_customer_tuner&version=GBmaster)