Fix some pylint warnings for SQuAD QA model.

241e3254 · Sinan Tan · xuehui · 5e01504d · 241e3254 · 241e3254
Commit 241e3254, authored Sep 14, 2018 by Sinan Tan; committed by xuehui on Sep 14, 2018.
4 changed files
--- a/examples/trials/ga_squad/data.py
+++ b/examples/trials/ga_squad/data.py
@@ -19,6 +19,10 @@
 # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+'''
+Data processing script for the QA model.
+'''
+
 import csv
 import json
 from random import shuffle
@@ -73,19 +77,19 @@ def load_from_file(path, fmt=None, is_training=True):
            for doc in data:
                for paragraph in doc['paragraphs']:
                    passage = paragraph['context']
-                    for qa in paragraph['qas']:
-                        question = qa['question']
-                        id = qa['id']
+                    for qa_pair in paragraph['qas']:
+                        question = qa_pair['question']
+                        qa_id = qa_pair['id']
                        if not is_training:
                            qp_pairs.append(
-                                {'passage': passage, 'question': question, 'id': id})
+                                {'passage': passage, 'question': question, 'id': qa_id})
                        else:
-                            for answer in qa['answers']:
+                            for answer in qa_pair['answers']:
                                answer_begin = int(answer['answer_start'])
                                answer_end = answer_begin + len(answer['text'])
                                qp_pairs.append({'passage': passage,
                                                 'question': question,
-                                                 'id': id,
+                                                 'id': qa_id,
                                                 'answer_begin': answer_begin,
                                                 'answer_end': answer_end})
    else:
@@ -121,21 +125,21 @@ def collect_vocab(qp_pairs):
    Build the vocab from corpus.
    '''
    vocab = set()
-    for qp in qp_pairs:
-        for word in qp['question_tokens']:
+    for qp_pair in qp_pairs:
+        for word in qp_pair['question_tokens']:
            vocab.add(word['word'])
-        for word in qp['passage_tokens']:
+        for word in qp_pair['passage_tokens']:
            vocab.add(word['word'])
    return vocab


-def shuffle_step(l, step):
+def shuffle_step(entries, step):
    '''
    Shuffle the step
    '''
    answer = []
-    for i in range(0, len(l), step):
-        sub = l[i:i+step]
+    for i in range(0, len(entries), step):
+        sub = entries[i:i+step]
        shuffle(sub)
        answer += sub
    return answer
@@ -163,13 +167,13 @@ def get_char_input(data, char_dict, max_char_length):
    char_id = np.zeros((max_char_length, sequence_length,
                        batch_size), dtype=np.int32)
    char_lengths = np.zeros((sequence_length, batch_size), dtype=np.float32)
-    for b in range(0, min(len(data), batch_size)):
-        d = data[b]
-        for s in range(0, min(len(d), sequence_length)):
-            word = d[s]['word']
-            char_lengths[s, b] = min(len(word), max_char_length)
+    for batch_idx in range(0, min(len(data), batch_size)):
+        batch_data = data[batch_idx]
+        for sample_idx in range(0, min(len(batch_data), sequence_length)):
+            word = batch_data[sample_idx]['word']
+            char_lengths[sample_idx, batch_idx] = min(len(word), max_char_length)
            for i in range(0, min(len(word), max_char_length)):
-                char_id[i, s, b] = get_id(char_dict, word[i])
+                char_id[i, sample_idx, batch_idx] = get_id(char_dict, word[i])
    return char_id, char_lengths


@@ -180,26 +184,26 @@ def get_word_input(data, word_dict, embed, embed_dim):
    batch_size = len(data)
    max_sequence_length = max(len(d) for d in data)
    sequence_length = max_sequence_length
-    t = np.zeros((max_sequence_length, batch_size,
-                  embed_dim), dtype=np.float32)
+    word_input = np.zeros((max_sequence_length, batch_size,
+                           embed_dim), dtype=np.float32)
    ids = np.zeros((sequence_length, batch_size), dtype=np.int32)
    masks = np.zeros((sequence_length, batch_size), dtype=np.float32)
    lengths = np.zeros([batch_size], dtype=np.int32)

-    for b in range(0, min(len(data), batch_size)):
-        d = data[b]
+    for batch_idx in range(0, min(len(data), batch_size)):
+        batch_data = data[batch_idx]

-        lengths[b] = len(d)
+        lengths[batch_idx] = len(batch_data)

-        for s in range(0, min(len(d), sequence_length)):
-            word = d[s]['word'].lower()
+        for sample_idx in range(0, min(len(batch_data), sequence_length)):
+            word = batch_data[sample_idx]['word'].lower()
            if word in word_dict.keys():
-                t[s, b] = embed[word_dict[word]]
-                ids[s, b] = word_dict[word]
-            masks[s, b] = 1
+                word_input[sample_idx, batch_idx] = embed[word_dict[word]]
+                ids[sample_idx, batch_idx] = word_dict[word]
+            masks[sample_idx, batch_idx] = 1

-    t = np.reshape(t, (-1, embed_dim))
-    return t, ids, masks, lengths
+    word_input = np.reshape(word_input, (-1, embed_dim))
+    return word_input, ids, masks, lengths


 def get_word_index(tokens, char_index):

--- a/examples/trials/ga_squad/evaluate.py
+++ b/examples/trials/ga_squad/evaluate.py
@@ -19,6 +19,10 @@
 # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+'''
+Evaluation scripts for QA model.
+'''
+
 from __future__ import print_function
 from collections import Counter
 import string
@@ -68,8 +72,8 @@ def f1_score(prediction, ground_truth):
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
-    f1 = (2 * precision * recall) / (precision + recall)
-    return f1
+    f1_result = (2 * precision * recall) / (precision + recall)
+    return f1_result

 def exact_match_score(prediction, ground_truth):
    '''
@@ -91,28 +95,25 @@ def _evaluate(dataset, predictions):
    '''
    Evaluate function.
    '''
-    f1 = exact_match = total = 0
+    f1_result = exact_match = total = 0
    count = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
-            for qa in paragraph['qas']:
+            for qa_pair in paragraph['qas']:
                total += 1
-                if qa['id'] not in predictions:
-                    message = 'Unanswered question ' + qa['id'] + \
-                              ' will receive score 0.'
-                    #print(message, file=sys.stderr)
+                if qa_pair['id'] not in predictions:
                    count += 1
                    continue
-                ground_truths = list(map(lambda x: x['text'], qa['answers']))
-                prediction = predictions[qa['id']]
+                ground_truths = list(map(lambda x: x['text'], qa_pair['answers']))
+                prediction = predictions[qa_pair['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
-                f1 += metric_max_over_ground_truths(
+                f1_result += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    print('total', total, 'exact_match', exact_match, 'unanswer_question ', count)
    exact_match = 100.0 * exact_match / total
-    f1 = 100.0 * f1 / total
-    return {'exact_match': exact_match, 'f1': f1}
+    f1_result = 100.0 * f1_result / total
+    return {'exact_match': exact_match, 'f1': f1_result}

 def evaluate(data_file, pred_file):
    '''

--- a/examples/trials/ga_squad/graph.py
+++ b/examples/trials/ga_squad/graph.py
@@ -43,8 +43,8 @@ class Layer(object):
    '''
    Layer class, which contains the information of graph.
    '''
-    def __init__(self, graph_type, input=None, output=None, size=None):
-        self.input = input if input is not None else []
+    def __init__(self, graph_type, inputs=None, output=None, size=None):
+        self.input = inputs if inputs is not None else []
        self.output = output if output is not None else []
        self.graph_type = graph_type
        self.is_delete = False
@@ -117,11 +117,11 @@ class Graph(object):
    '''
    Customed Graph class.
    '''
-    def __init__(self, max_layer_num, input, output, hide):
+    def __init__(self, max_layer_num, inputs, output, hide):
        self.layers = []
        self.max_layer_num = max_layer_num

-        for layer in input:
+        for layer in inputs:
            self.layers.append(layer)
        for layer in output:
            self.layers.append(layer)
@@ -240,7 +240,7 @@ class Graph(object):
            if graph_type <= 1:
                new_id = len(layers)
                out = random.choice(layers_out)
-                input = []
+                inputs = []
                output = [out]
                pos = random.randint(0, len(layers[out].input) - 1)
                last_in = layers[out].input[pos]
@@ -250,13 +250,13 @@ class Graph(object):
                if graph_type == 1:
                    layers[last_in].output.remove(out)
                    layers[last_in].output.append(new_id)
-                    input = [last_in]
-                lay = Layer(graph_type=layer_type, input=input, output=output)
-                while len(input) < lay.input_size:
+                    inputs = [last_in]
+                lay = Layer(graph_type=layer_type, inputs=inputs, output=output)
+                while len(inputs) < lay.input_size:
                    layer1 = random.choice(layers_in)
-                    input.append(layer1)
+                    inputs.append(layer1)
                    layers[layer1].output.append(new_id)
-                lay.input = input
+                lay.input = inputs
                layers.append(lay)
            else:
                layer1 = random.choice(layers_del)

--- a/examples/trials/ga_squad/train_model.py
+++ b/examples/trials/ga_squad/train_model.py
@@ -32,6 +32,7 @@ from graph_to_tf import graph_to_network


 class GAGConfig:
+    """The class for model hyper-parameter configuration."""
    def __init__(self):
        self.batch_size = 128

@@ -56,6 +57,7 @@ class GAGConfig:


 class GAG:
+    """The class for the computation graph based QA model."""
    def __init__(self, cfg, embed, graph):
        self.cfg = cfg
        self.embed = embed
@@ -83,6 +85,7 @@ class GAG:


    def build_net(self, is_training):
+        """Build the whole neural network for the QA model."""
        cfg = self.cfg
        with tf.device('/cpu:0'):
            word_embed = tf.get_variable(
@@ -202,6 +205,7 @@ class GAG:

        if is_training:
            def label_smoothing(inputs, masks, epsilon=0.1):
+                """Modify target for label smoothing."""
                epsilon = cfg.labelsmoothing
                num_of_channel = tf.shape(inputs)[-1]  # number of channels
                inputs = tf.cast(inputs, tf.float32)
@@ -229,6 +233,7 @@ class GAG:
        return tf.stack([self.begin_prob, self.end_prob])

    def build_char_states(self, char_embed, is_training, reuse, char_ids, char_lengths):
+        """Build char embedding network for the QA model."""
        max_char_length = self.cfg.max_char_length

        inputs = dropout(tf.nn.embedding_lookup(char_embed, char_ids),