Commit 241e3254 authored by Sinan Tan, committed by xuehui
Browse files

Fix some pylint warnings for SQuAD QA model.

parent 5e01504d
......@@ -19,6 +19,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Data processing script for the QA model.
'''
import csv
import json
from random import shuffle
......@@ -73,19 +77,19 @@ def load_from_file(path, fmt=None, is_training=True):
for doc in data:
for paragraph in doc['paragraphs']:
passage = paragraph['context']
for qa in paragraph['qas']:
question = qa['question']
id = qa['id']
for qa_pair in paragraph['qas']:
question = qa_pair['question']
qa_id = qa_pair['id']
if not is_training:
qp_pairs.append(
{'passage': passage, 'question': question, 'id': id})
{'passage': passage, 'question': question, 'id': qa_id})
else:
for answer in qa['answers']:
for answer in qa_pair['answers']:
answer_begin = int(answer['answer_start'])
answer_end = answer_begin + len(answer['text'])
qp_pairs.append({'passage': passage,
'question': question,
'id': id,
'id': qa_id,
'answer_begin': answer_begin,
'answer_end': answer_end})
else:
......@@ -121,21 +125,21 @@ def collect_vocab(qp_pairs):
Build the vocab from corpus.
'''
vocab = set()
for qp in qp_pairs:
for word in qp['question_tokens']:
for qp_pair in qp_pairs:
for word in qp_pair['question_tokens']:
vocab.add(word['word'])
for word in qp['passage_tokens']:
for word in qp_pair['passage_tokens']:
vocab.add(word['word'])
return vocab
def shuffle_step(l, step):
def shuffle_step(entries, step):
'''
Shuffle the step
'''
answer = []
for i in range(0, len(l), step):
sub = l[i:i+step]
for i in range(0, len(entries), step):
sub = entries[i:i+step]
shuffle(sub)
answer += sub
return answer
......@@ -163,13 +167,13 @@ def get_char_input(data, char_dict, max_char_length):
char_id = np.zeros((max_char_length, sequence_length,
batch_size), dtype=np.int32)
char_lengths = np.zeros((sequence_length, batch_size), dtype=np.float32)
for b in range(0, min(len(data), batch_size)):
d = data[b]
for s in range(0, min(len(d), sequence_length)):
word = d[s]['word']
char_lengths[s, b] = min(len(word), max_char_length)
for batch_idx in range(0, min(len(data), batch_size)):
batch_data = data[batch_idx]
for sample_idx in range(0, min(len(batch_data), sequence_length)):
word = batch_data[sample_idx]['word']
char_lengths[sample_idx, batch_idx] = min(len(word), max_char_length)
for i in range(0, min(len(word), max_char_length)):
char_id[i, s, b] = get_id(char_dict, word[i])
char_id[i, sample_idx, batch_idx] = get_id(char_dict, word[i])
return char_id, char_lengths
......@@ -180,26 +184,26 @@ def get_word_input(data, word_dict, embed, embed_dim):
batch_size = len(data)
max_sequence_length = max(len(d) for d in data)
sequence_length = max_sequence_length
t = np.zeros((max_sequence_length, batch_size,
embed_dim), dtype=np.float32)
word_input = np.zeros((max_sequence_length, batch_size,
embed_dim), dtype=np.float32)
ids = np.zeros((sequence_length, batch_size), dtype=np.int32)
masks = np.zeros((sequence_length, batch_size), dtype=np.float32)
lengths = np.zeros([batch_size], dtype=np.int32)
for b in range(0, min(len(data), batch_size)):
d = data[b]
for batch_idx in range(0, min(len(data), batch_size)):
batch_data = data[batch_idx]
lengths[b] = len(d)
lengths[batch_idx] = len(batch_data)
for s in range(0, min(len(d), sequence_length)):
word = d[s]['word'].lower()
for sample_idx in range(0, min(len(batch_data), sequence_length)):
word = batch_data[sample_idx]['word'].lower()
if word in word_dict.keys():
t[s, b] = embed[word_dict[word]]
ids[s, b] = word_dict[word]
masks[s, b] = 1
word_input[sample_idx, batch_idx] = embed[word_dict[word]]
ids[sample_idx, batch_idx] = word_dict[word]
masks[sample_idx, batch_idx] = 1
t = np.reshape(t, (-1, embed_dim))
return t, ids, masks, lengths
word_input = np.reshape(word_input, (-1, embed_dim))
return word_input, ids, masks, lengths
def get_word_index(tokens, char_index):
......
......@@ -19,6 +19,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Evaluation scripts for QA model.
'''
from __future__ import print_function
from collections import Counter
import string
......@@ -68,8 +72,8 @@ def f1_score(prediction, ground_truth):
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
f1_result = (2 * precision * recall) / (precision + recall)
return f1_result
def exact_match_score(prediction, ground_truth):
'''
......@@ -91,28 +95,25 @@ def _evaluate(dataset, predictions):
'''
Evaluate function.
'''
f1 = exact_match = total = 0
f1_result = exact_match = total = 0
count = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
for qa_pair in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
#print(message, file=sys.stderr)
if qa_pair['id'] not in predictions:
count += 1
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
ground_truths = list(map(lambda x: x['text'], qa_pair['answers']))
prediction = predictions[qa_pair['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_result += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
print('total', total, 'exact_match', exact_match, 'unanswer_question ', count)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
f1_result = 100.0 * f1_result / total
return {'exact_match': exact_match, 'f1': f1_result}
def evaluate(data_file, pred_file):
'''
......
......@@ -43,8 +43,8 @@ class Layer(object):
'''
Layer class, which contains the information of graph.
'''
def __init__(self, graph_type, input=None, output=None, size=None):
self.input = input if input is not None else []
def __init__(self, graph_type, inputs=None, output=None, size=None):
self.input = inputs if inputs is not None else []
self.output = output if output is not None else []
self.graph_type = graph_type
self.is_delete = False
......@@ -117,11 +117,11 @@ class Graph(object):
'''
Customed Graph class.
'''
def __init__(self, max_layer_num, input, output, hide):
def __init__(self, max_layer_num, inputs, output, hide):
self.layers = []
self.max_layer_num = max_layer_num
for layer in input:
for layer in inputs:
self.layers.append(layer)
for layer in output:
self.layers.append(layer)
......@@ -240,7 +240,7 @@ class Graph(object):
if graph_type <= 1:
new_id = len(layers)
out = random.choice(layers_out)
input = []
inputs = []
output = [out]
pos = random.randint(0, len(layers[out].input) - 1)
last_in = layers[out].input[pos]
......@@ -250,13 +250,13 @@ class Graph(object):
if graph_type == 1:
layers[last_in].output.remove(out)
layers[last_in].output.append(new_id)
input = [last_in]
lay = Layer(graph_type=layer_type, input=input, output=output)
while len(input) < lay.input_size:
inputs = [last_in]
lay = Layer(graph_type=layer_type, inputs=inputs, output=output)
while len(inputs) < lay.input_size:
layer1 = random.choice(layers_in)
input.append(layer1)
inputs.append(layer1)
layers[layer1].output.append(new_id)
lay.input = input
lay.input = inputs
layers.append(lay)
else:
layer1 = random.choice(layers_del)
......
......@@ -32,6 +32,7 @@ from graph_to_tf import graph_to_network
class GAGConfig:
"""The class for model hyper-parameter configuration."""
def __init__(self):
self.batch_size = 128
......@@ -56,6 +57,7 @@ class GAGConfig:
class GAG:
"""The class for the computation graph based QA model."""
def __init__(self, cfg, embed, graph):
self.cfg = cfg
self.embed = embed
......@@ -83,6 +85,7 @@ class GAG:
def build_net(self, is_training):
"""Build the whole neural network for the QA model."""
cfg = self.cfg
with tf.device('/cpu:0'):
word_embed = tf.get_variable(
......@@ -202,6 +205,7 @@ class GAG:
if is_training:
def label_smoothing(inputs, masks, epsilon=0.1):
"""Modify target for label smoothing."""
epsilon = cfg.labelsmoothing
num_of_channel = tf.shape(inputs)[-1] # number of channels
inputs = tf.cast(inputs, tf.float32)
......@@ -229,6 +233,7 @@ class GAG:
return tf.stack([self.begin_prob, self.end_prob])
def build_char_states(self, char_embed, is_training, reuse, char_ids, char_lengths):
"""Build char embedding network for the QA model."""
max_char_length = self.cfg.max_char_length
inputs = dropout(tf.nn.embedding_lookup(char_embed, char_ids),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment