Commit c6d9d539 authored by Grégory Châtel

Simplifying code for easier understanding.

parent 793262e8
@@ -196,9 +196,7 @@ class ColaProcessor(DataProcessor):
 def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
 
-    label_map = {}
-    for (i, label) in enumerate(label_list):
-        label_map[label] = i
+    label_map = {label : i for i, label in enumerate(label_list)}
 
     features = []
     for (ex_index, example) in enumerate(examples):
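The first hunk replaces the explicit loop that builds label_map with a dict comprehension. As a quick sanity check (toy label list chosen for illustration, not taken from the repo), both forms produce the same mapping:

    label_list = ["0", "1"]  # toy values in the style of CoLA's string labels

    label_map_loop = {}
    for (i, label) in enumerate(label_list):
        label_map_loop[label] = i

    label_map = {label: i for i, label in enumerate(label_list)}
    assert label_map == label_map_loop == {"0": 0, "1": 1}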
@@ -207,8 +205,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         tokens_b = None
         if example.text_b:
             tokens_b = tokenizer.tokenize(example.text_b)
-
-        if tokens_b:
             # Modifies `tokens_a` and `tokens_b` in place so that the total
             # length is less than the specified length.
             # Account for [CLS], [SEP], [SEP] with "- 3"
@@ -216,7 +212,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         else:
             # Account for [CLS] and [SEP] with "- 2"
             if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
+                tokens_a = tokens_a[:(max_seq_length - 2)]
 
         # The convention in BERT is:
         # (a) For sequence pairs:
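The "- 3" and "- 2" in the comments budget for the special tokens added later. A small worked check with made-up lengths (values are illustrative only):

    max_seq_length = 8  # made-up value for illustration

    # Pair input: [CLS] a1 a2 [SEP] b1 b2 [SEP] adds 3 special tokens, hence "- 3".
    tokens_a, tokens_b = ["a1", "a2"], ["b1", "b2"]
    assert len(tokens_a) + len(tokens_b) <= max_seq_length - 3

    # Single input: [CLS] a1 a2 [SEP] adds 2 special tokens, hence "- 2".
    assert len(tokens_a) <= max_seq_length - 2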
@@ -236,22 +232,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         # For classification tasks, the first vector (corresponding to [CLS]) is
         # used as as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
-        tokens = []
-        segment_ids = []
-        tokens.append("[CLS]")
-        segment_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            segment_ids.append(0)
-        tokens.append("[SEP]")
-        segment_ids.append(0)
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
 
         if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
 
         input_ids = tokenizer.convert_tokens_to_ids(tokens)
 
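This hunk is the core of the commit: the append-based loops become list concatenations. A minimal sketch (toy token lists, no tokenizer needed) confirming the two constructions produce identical tokens and segment_ids:

    tokens_a, tokens_b = ["hello", "world"], ["good", "bye"]  # toy token lists

    # Old style: explicit appends.
    old_tokens, old_segment_ids = ["[CLS]"], [0]
    for token in tokens_a:
        old_tokens.append(token)
        old_segment_ids.append(0)
    old_tokens.append("[SEP]")
    old_segment_ids.append(0)
    if tokens_b:
        for token in tokens_b:
            old_tokens.append(token)
            old_segment_ids.append(1)
        old_tokens.append("[SEP]")
        old_segment_ids.append(1)

    # New style from the hunk above: list concatenation.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    assert tokens == old_tokens
    assert segment_ids == old_segment_ids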
@@ -260,10 +246,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         input_mask = [1] * len(input_ids)
 
         # Zero-pad up to the sequence length.
-        while len(input_ids) < max_seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            segment_ids.append(0)
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
 
         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length
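The last hunk computes the zero padding once instead of growing all three lists in a while loop. A short sketch with made-up ids (not real tokenizer output) showing the padded lists still satisfy the length asserts:

    max_seq_length = 10  # made-up value
    input_ids = [101, 7592, 2088, 102]  # toy ids for illustration

    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)

    # Build the zero padding once and extend all three lists with it.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length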