Commit a4086c5d authored by thomwolf's avatar thomwolf
parents 088ad458 8bd6b235
......@@ -4,26 +4,72 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# TensorFlow code"
"# Comparing TensorFlow (original) and PyTorch models\n",
"\n",
"We use this small notebook to test the conversion of the model's weights and to make sure both the TensorFlow and PyTorch are coherent. In particular, we compare the weights of the last layer on a simple example (in `input.txt`).\n",
"\n",
"To run this notebook, please make sure that your Python environment has both TensorFlow and PyTorch.\n",
"You should follow the instructions in the `README.md` and make sure that you have:\n",
"- the original TensorFlow implementation\n",
"- the `BERT-base, Uncased` model\n",
"- run the script `convert_tf_checkpoint_to_pytorch.py` to convert the weights to PyTorch\n",
"\n",
"Please modify the relative paths accordingly (at the beggining of Sections 1 and 2)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1/ TensorFlow code"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"original_tf_inplem_dir = \"../bert/\"\n",
"model_dir = \"../uncased_L-12_H-768_A-12/\"\n",
"\n",
"vocab_file = model_dir + \"vocab.txt\"\n",
"bert_config_file = model_dir + \"bert_config.json\"\n",
"init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
"\n",
"input_file = \"input.txt\"\n",
"max_seq_length = 128"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:37.498678Z",
"start_time": "2018-11-03T02:09:36.366672Z"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(original_tf_inplem_dir)\n",
"\n",
"from extract_features import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:37.621865Z",
......@@ -45,13 +91,6 @@
}
],
"source": [
"data_dir=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/data/glue_data/MRPC/\"\n",
"vocab_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/vocab.txt\"\n",
"bert_config_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_config.json\"\n",
"init_checkpoint=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_model.ckpt\"\n",
"max_seq_length=128\n",
"input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
"\n",
"layer_indexes = list(range(12))\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"tokenizer = tokenization.FullTokenizer(\n",
......@@ -67,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:40.831618Z",
......@@ -79,15 +118,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x1289c1a60>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c242470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
"INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
......@@ -123,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.413197Z",
......@@ -135,7 +174,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr, running initialization to predict.\n",
"INFO:tensorflow:Calling model_fn.\n",
"INFO:tensorflow:Running infer on CPU\n",
"INFO:tensorflow:Done calling model_fn.\n",
......@@ -186,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.460128Z",
......@@ -211,7 +250,7 @@
"(128, 768)"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
......@@ -227,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.498637Z",
......@@ -243,12 +282,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyTorch code"
"## 2/ PyTorch code"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.660303Z",
......@@ -263,12 +302,22 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"init_checkpoint_pt = \"../pytorch_model/uncased_L-12_H-768_A-12/pytorch_model.bin\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:48.292135Z",
"start_time": "2018-11-03T02:09:46.661921Z"
},
"scrolled": true
},
"outputs": [
{
......@@ -569,14 +618,12 @@
")"
]
},
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
"\n",
"device = torch.device(\"cpu\")\n",
"model = extract_features_pytorch.BertModel(bert_config)\n",
"model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
......@@ -585,7 +632,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:48.332982Z",
......@@ -892,7 +939,7 @@
")"
]
},
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
......@@ -912,7 +959,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:54.371188Z",
......@@ -1000,7 +1047,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:57.139854Z",
......@@ -1026,7 +1073,7 @@
"(128, 768)"
]
},
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1043,7 +1090,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:59.000058Z",
......@@ -1068,7 +1115,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:59.462123Z",
......@@ -1090,9 +1137,16 @@
"print(tensorflow_outputs[1].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3/ Comparing the standard deviation on the last layer of both models"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:10:00.014784Z",
......@@ -1106,7 +1160,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:10:09.582557Z",
......@@ -1127,7 +1181,7 @@
"4.1671223e-07"
]
},
"execution_count": 24,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1137,21 +1191,14 @@
"print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
"np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python [conda env:bert]",
"display_name": "Python 3",
"language": "python",
"name": "conda-env-bert-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
......@@ -1163,7 +1210,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
"version": "3.6.5"
},
"toc": {
"colors": {
......
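For reference, a minimal runnable sketch of the comparison the notebook performs at the end (Section 3). The arrays here are hypothetical stand-ins for the per-layer activations extracted in Sections 1 (TensorFlow) and 2 (PyTorch), each of shape (max_seq_length, hidden) = (128, 768):

import numpy as np

# Hypothetical stand-ins for the activations extracted in the notebook above;
# in the real run they come from the TF estimator and the converted PyTorch model.
tensorflow_outputs = [np.random.randn(128, 768) for _ in range(12)]
pytorch_outputs = [out + 1e-7 * np.random.randn(128, 768) for out in tensorflow_outputs]

for i in range(len(tensorflow_outputs)):
    tf_arr = np.array(tensorflow_outputs[i])
    pt_arr = np.array(pytorch_outputs[i])
    # Root-mean-square difference; a value around 1e-7, as in the last cell of
    # the notebook, means the two implementations agree up to float precision.
    print(i, tf_arr.shape, np.sqrt(np.mean((tf_arr - pt_arr) ** 2.0)))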
This diff is collapsed.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import tokenization
import tensorflow as tf
import argparse
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True,
help="Input raw text file (or comma-separated list of files).")
parser.add_argument("--output_file", default=None, type=str, required=True,
help="Output TF example file (or comma-separated list of files).")
parser.add_argument("--vocab_file", default=None, type=str, required=True,
help="The vocabulary file that the BERT model was trained on.")
## Other parameters
parser.add_argument("--do_lower_case", default=True, action='store_true',
help="Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
parser.add_argument("--max_seq_length", default=128, type=int, help="Maximum sequence length.")
parser.add_argument("--max_predictions_per_seq", default=20, type=int,
help="Maximum number of masked LM predictions per sequence.")
parser.add_argument("--random_seed", default=12345, type=int, help="Random seed for data generation.")
parser.add_argument("--dupe_factor", default=10, type=int,
help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--masked_lm_prob", default=0.15, type=float, help="Masked LM probability.")
parser.add_argument("--short_seq_prob", default=0.1, type=float,
help="Probability of creating sequences which are shorter than the maximum length.")
args = parser.parse_args()
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_files):
"""Create TF example files from `TrainingInstance`s."""
writers = []
for output_file in output_files:
writers.append(tf.python_io.TFRecordWriter(output_file))
writer_index = 0
total_written = 0
for (inst_index, instance) in enumerate(instances):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
features["next_sentence_labels"] = create_int_feature([next_sentence_label])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writers[writer_index].write(tf_example.SerializeToString())
writer_index = (writer_index + 1) % len(writers)
total_written += 1
if inst_index < 20:
tf.logging.info("*** Example ***")
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in instance.tokens]))
for feature_name in features.keys():
feature = features[feature_name]
values = []
if feature.int64_list.value:
values = feature.int64_list.value
elif feature.float_list.value:
values = feature.float_list.value
tf.logging.info(
"%s: %s" % (feature_name, " ".join([str(x) for x in values])))
for writer in writers:
writer.close()
tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return feature
def create_float_feature(values):
feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return feature
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictis for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
masked_lm = collections.namedtuple("masked_lm", ["index", "label"]) # pylint: disable=invalid-name
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(masked_lm(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
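# Worked example (illustrative numbers, not from the source): with
# masked_lm_prob=0.15 and a 128-token sequence, num_to_predict is
# min(max_predictions_per_seq, max(1, round(128 * 0.15))) = min(20, 19) = 19
# positions. Each chosen position becomes "[MASK]" 80% of the time, keeps its
# original token 10% of the time, and is replaced by a random vocabulary word
# the remaining 10% of the time.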
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
input_files = []
for input_pattern in args.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
tf.logging.info("*** Reading from input files ***")
for input_file in input_files:
tf.logging.info(" %s", input_file)
rng = random.Random(args.random_seed)
instances = create_training_instances(
input_files, tokenizer, args.max_seq_length, args.dupe_factor,
args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
rng)
output_files = args.output_file.split(",")
tf.logging.info("*** Writing to output files ***")
for output_file in output_files:
tf.logging.info(" %s", output_file)
write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_files)
if __name__ == "__main__":
tf.app.run()
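To make the data flow concrete, here is a minimal sketch of the fixed-length padding that write_instance_to_example_files applies to every instance, followed by an illustrative invocation. All paths and token ids below are hypothetical, not from the source:

# Illustrative invocation (hypothetical paths):
#   python create_pretraining_data.py \
#     --input_file=./corpus.txt --output_file=./tf_examples.tfrecord \
#     --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt \
#     --max_seq_length=128 --max_predictions_per_seq=20 \
#     --masked_lm_prob=0.15 --dupe_factor=10
# corpus.txt format (per the comments in create_training_instances):
# one sentence per line, with blank lines separating documents.

input_ids = [101, 2023, 2003, 102]  # hypothetical ids for [CLS] this is [SEP]
input_mask = [1] * len(input_ids)   # 1 marks real tokens
segment_ids = [0] * len(input_ids)  # single-segment example
max_seq_length = 8                  # shortened from 128 for readability

while len(input_ids) < max_seq_length:
    input_ids.append(0)    # pad token id
    input_mask.append(0)   # padding positions are masked out
    segment_ids.append(0)

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
print(input_ids)  # [101, 2023, 2003, 102, 0, 0, 0, 0]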
......@@ -249,6 +249,9 @@ def main():
if args.init_checkpoint is not None:
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
......
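The hunk above adds multi-GPU data parallelism to the classifier's main(). A minimal sketch of the pattern, with a Linear module as a hypothetical stand-in for the BERT classifier built earlier in the function:

import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

model = nn.Linear(10, 2)  # stand-in for the model built above
model.to(device)
if n_gpu > 1:
    # Replicates the module on each visible GPU and splits the batch
    # dimension across the replicas during forward().
    model = nn.DataParallel(model)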
......@@ -502,7 +502,7 @@ class BertForQuestionAnswering(nn.Module):
def compute_loss(logits, positions):
max_position = positions.max().item()
one_hot = torch.FloatTensor(batch_size, max(max_position, seq_length) +1).zero_()
one_hot = one_hot.scatter(1, positions.cpu(), 1) # Second argument need to be LongTensor and not cuda.LongTensor
one_hot = one_hot.scatter_(1, positions.cpu(), 1) # Second argument need to be LongTensor and not cuda.LongTensor
one_hot = one_hot[:, :seq_length].to(input_ids.device)
log_probs = nn.functional.log_softmax(logits, dim = -1).view(batch_size, seq_length)
loss = -torch.mean(torch.sum(one_hot*log_probs, dim = -1))
......
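The fix above switches to the in-place scatter_. A minimal sketch of how the one-hot target is built from the gold span positions (shapes are illustrative):

import torch

batch_size, seq_length = 2, 5
positions = torch.tensor([[1], [3]])  # gold start (or end) index per example

one_hot = torch.zeros(batch_size, seq_length)
# scatter_ writes 1.0 in place along dim=1 at the given indices; the index
# tensor must be a (CPU) LongTensor, as the inline comment in the diff notes.
one_hot.scatter_(1, positions, 1.0)
print(one_hot)  # tensor([[0., 1., 0., 0., 0.], [0., 0., 0., 1., 0.]])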
......@@ -482,6 +482,9 @@ def main():
if args.init_checkpoint is not None:
model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = torch.nn.DataParallel(model)
no_decay = ['bias', 'gamma', 'beta']
optimizer_parameters = [
......@@ -518,7 +521,7 @@ def main():
model.train()
nb_tr_examples = 0
for epoch in trange(args.num_train_epochs, desc="Epoch"):
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Iteration"):
input_ids = input_ids.to(device)
input_mask = input_mask.float().to(device)
......
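The int() cast added above is needed because tqdm's trange, like range, requires an integer bound, while num_train_epochs is presumably parsed as a float on the command line. A one-liner sketch:

from tqdm import trange

num_train_epochs = 3.0  # hypothetical value, e.g. parsed with type=float
for epoch in trange(int(num_train_epochs), desc="Epoch"):
    pass  # training loop body elided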
This diff is collapsed.
......@@ -23,7 +23,7 @@ import logging
import json
import math
import os
import tokenization
import tokenization_pytorch
import six
import argparse
......@@ -62,9 +62,9 @@ class SquadExample(object):
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += "qas_id: %s" % (tokenization_pytorch.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
tokenization_pytorch.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
......@@ -153,7 +153,7 @@ def read_squad_examples(input_file, is_training):
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
tokenization_pytorch.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
......@@ -287,7 +287,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
[tokenization_pytorch.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join(
["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
logger.info("token_is_max_context: %s" % " ".join([
......@@ -303,7 +303,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info(
"answer: %s" % (tokenization.printable_text(answer_text)))
"answer: %s" % (tokenization_pytorch.printable_text(answer_text)))
features.append(
InputFeatures(
......@@ -579,7 +579,7 @@ def get_final_text(pred_text, orig_text, do_lower_case):
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tokenizer = tokenization_pytorch.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
......@@ -780,7 +780,7 @@ def main():
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
tokenizer = tokenization.FullTokenizer(
tokenizer = tokenization_pytorch.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
train_examples = None
......@@ -795,6 +795,9 @@ def main():
if args.init_checkpoint is not None:
model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = torch.nn.DataParallel(model)
optimizer = BERTAdam([{'params': [p for n, p in model.named_parameters() if n != 'bias'], 'l2': 0.01},
{'params': [p for n, p in model.named_parameters() if n == 'bias'], 'l2': 0.}
......
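The context above groups parameters so that biases receive no L2 penalty. A hedged sketch of the same per-group weight-decay pattern, using the substring-based no_decay grouping from the classifier hunk earlier and torch.optim.Adam as a stand-in for the repo's custom BERTAdam:

import torch
from torch import nn

model = nn.Linear(10, 2)  # hypothetical stand-in for the BERT model
no_decay = ['bias', 'gamma', 'beta']
grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = torch.optim.Adam(grouped_parameters, lr=3e-5)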
......@@ -21,7 +21,7 @@ from __future__ import print_function
import collections
import random
import tokenization
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -23,8 +23,8 @@ import collections
import json
import re
import modeling
import tokenization
from tensorflow_code import modeling
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -21,7 +21,7 @@ import json
import random
import re
import modeling
from tensorflow_code import modeling
import six
import tensorflow as tf
......
......@@ -16,7 +16,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import optimization
from tensorflow_code import optimization
import tensorflow as tf
......
......@@ -20,9 +20,9 @@ from __future__ import print_function
import csv
import os
import modeling
import optimization
import tokenization
from tensorflow_code import modeling
from tensorflow_code import optimization
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -19,8 +19,8 @@ from __future__ import division
from __future__ import print_function
import os
import modeling
import optimization
from tensorflow_code import modeling
from tensorflow_code import optimization
import tensorflow as tf
flags = tf.flags
......
......@@ -22,9 +22,9 @@ import collections
import json
import math
import os
import modeling
import optimization
import tokenization
from tensorflow_code import modeling
from tensorflow_code import optimization
from tensorflow_code import tokenization
import six
import tensorflow as tf
......
......@@ -19,7 +19,7 @@ from __future__ import print_function
import os
import tempfile
import tokenization
from tensorflow_code import tokenization
import tensorflow as tf
......