Commit a4086c5d authored by thomwolf's avatar thomwolf
parents 088ad458 8bd6b235
......@@ -4,26 +4,72 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# TensorFlow code"
"# Comparing TensorFlow (original) and PyTorch models\n",
"\n",
"We use this small notebook to test the conversion of the model's weights and to make sure both the TensorFlow and PyTorch are coherent. In particular, we compare the weights of the last layer on a simple example (in `input.txt`).\n",
"\n",
"To run this notebook, please make sure that your Python environment has both TensorFlow and PyTorch.\n",
"You should follow the instructions in the `README.md` and make sure that you have:\n",
"- the original TensorFlow implementation\n",
"- the `BERT-base, Uncased` model\n",
"- run the script `convert_tf_checkpoint_to_pytorch.py` to convert the weights to PyTorch\n",
"\n",
"Please modify the relative paths accordingly (at the beggining of Sections 1 and 2)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1/ TensorFlow code"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"original_tf_inplem_dir = \"../bert/\"\n",
"model_dir = \"../uncased_L-12_H-768_A-12/\"\n",
"\n",
"vocab_file = model_dir + \"vocab.txt\"\n",
"bert_config_file = model_dir + \"bert_config.json\"\n",
"init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
"\n",
"input_file = \"input.txt\"\n",
"max_seq_length = 128"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:37.498678Z",
"start_time": "2018-11-03T02:09:36.366672Z"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(original_tf_inplem_dir)\n",
"\n",
"from extract_features import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:37.621865Z",
......@@ -45,13 +91,6 @@
}
],
"source": [
"data_dir=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/data/glue_data/MRPC/\"\n",
"vocab_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/vocab.txt\"\n",
"bert_config_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_config.json\"\n",
"init_checkpoint=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_model.ckpt\"\n",
"max_seq_length=128\n",
"input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
"\n",
"layer_indexes = list(range(12))\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"tokenizer = tokenization.FullTokenizer(\n",
......@@ -67,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:40.831618Z",
......@@ -79,15 +118,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x1289c1a60>) includes params argument, but params are not passed to Estimator.\n",
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr\n",
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
"graph_options {\n",
" rewrite_options {\n",
" meta_optimizer_iterations: ONE\n",
" }\n",
"}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c242470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
"INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
......@@ -123,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.413197Z",
......@@ -135,7 +174,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr, running initialization to predict.\n",
"INFO:tensorflow:Calling model_fn.\n",
"INFO:tensorflow:Running infer on CPU\n",
"INFO:tensorflow:Done calling model_fn.\n",
......@@ -186,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.460128Z",
......@@ -211,7 +250,7 @@
"(128, 768)"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
......@@ -227,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.498637Z",
......@@ -243,12 +282,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# PyTorch code"
"## 2/ PyTorch code"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:46.660303Z",
......@@ -263,12 +302,22 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"init_checkpoint_pt = \"../pytorch_model/uncased_L-12_H-768_A-12/pytorch_model.bin\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:48.292135Z",
"start_time": "2018-11-03T02:09:46.661921Z"
},
"scrolled": true
},
"outputs": [
{
......@@ -569,14 +618,12 @@
")"
]
},
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
"\n",
"device = torch.device(\"cpu\")\n",
"model = extract_features_pytorch.BertModel(bert_config)\n",
"model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
......@@ -585,7 +632,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:48.332982Z",
......@@ -892,7 +939,7 @@
")"
]
},
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
......@@ -912,7 +959,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:54.371188Z",
......@@ -1000,7 +1047,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:57.139854Z",
......@@ -1026,7 +1073,7 @@
"(128, 768)"
]
},
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1043,7 +1090,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:59.000058Z",
......@@ -1068,7 +1115,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:09:59.462123Z",
......@@ -1090,9 +1137,16 @@
"print(tensorflow_outputs[1].shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3/ Comparing the standard deviation on the last layer of both models"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:10:00.014784Z",
......@@ -1106,7 +1160,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2018-11-03T02:10:09.582557Z",
......@@ -1127,7 +1181,7 @@
"4.1671223e-07"
]
},
"execution_count": 24,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1137,21 +1191,14 @@
"print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
"np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python [conda env:bert]",
"display_name": "Python 3",
"language": "python",
"name": "conda-env-bert-py"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
......@@ -1163,7 +1210,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
"version": "3.6.5"
},
"toc": {
"colors": {
......
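For reference, a minimal runnable sketch of the comparison the notebook performs at the end (Section 3). The arrays here are hypothetical stand-ins for the per-layer activations extracted in Sections 1 (TensorFlow) and 2 (PyTorch), each of shape (max_seq_length, hidden) = (128, 768):

import numpy as np

# Hypothetical stand-ins for the activations extracted in the notebook above;
# in the real run they come from the TF estimator and the converted PyTorch model.
tensorflow_outputs = [np.random.randn(128, 768) for _ in range(12)]
pytorch_outputs = [out + 1e-7 * np.random.randn(128, 768) for out in tensorflow_outputs]

for i in range(len(tensorflow_outputs)):
    tf_arr = np.array(tensorflow_outputs[i])
    pt_arr = np.array(pytorch_outputs[i])
    # Root-mean-square difference; a value around 1e-7, as in the last cell of
    # the notebook, means the two implementations agree up to float precision.
    print(i, tf_arr.shape, np.sqrt(np.mean((tf_arr - pt_arr) ** 2.0)))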
This diff is collapsed.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import tokenization
import tensorflow as tf
import argparse
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True,
help="Input raw text file (or comma-separated list of files).")
parser.add_argument("--output_file", default=None, type=str, required=True,
help="Output TF example file (or comma-separated list of files).")
parser.add_argument("--vocab_file", default=None, type=str, required=True,
help="The vocabulary file that the BERT model was trained on.")
## Other parameters
parser.add_argument("--do_lower_case", default=True, action='store_true',
help="Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
parser.add_argument("--max_seq_length", default=128, type=int, help="Maximum sequence length.")
parser.add_argument("--max_predictions_per_seq", default=20, type=int,
help="Maximum number of masked LM predictions per sequence.")
parser.add_argument("--random_seed", default=12345, type=int, help="Random seed for data generation.")
parser.add_argument("--dupe_factor", default=10, type=int,
help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--masked_lm_prob", default=0.15, type=float, help="Masked LM probability.")
parser.add_argument("--short_seq_prob", default=0.1, type=float,
help="Probability of creating sequences which are shorter than the maximum length.")
args = parser.parse_args()
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_files):
"""Create TF example files from `TrainingInstance`s."""
writers = []
for output_file in output_files:
writers.append(tf.python_io.TFRecordWriter(output_file))
writer_index = 0
total_written = 0
for (inst_index, instance) in enumerate(instances):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
features["next_sentence_labels"] = create_int_feature([next_sentence_label])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writers[writer_index].write(tf_example.SerializeToString())
writer_index = (writer_index + 1) % len(writers)
total_written += 1
if inst_index < 20:
tf.logging.info("*** Example ***")
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in instance.tokens]))
for feature_name in features.keys():
feature = features[feature_name]
values = []
if feature.int64_list.value:
values = feature.int64_list.value
elif feature.float_list.value:
values = feature.float_list.value
tf.logging.info(
"%s: %s" % (feature_name, " ".join([str(x) for x in values])))
for writer in writers:
writer.close()
tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return feature
def create_float_feature(values):
feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return feature
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictis for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
masked_lm = collections.namedtuple("masked_lm", ["index", "label"]) # pylint: disable=invalid-name
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(masked_lm(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
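# Worked example (illustrative numbers, not from the source): with
# masked_lm_prob=0.15 and a 128-token sequence, num_to_predict is
# min(max_predictions_per_seq, max(1, round(128 * 0.15))) = min(20, 19) = 19
# positions. Each chosen position becomes "[MASK]" 80% of the time, keeps its
# original token 10% of the time, and is replaced by a random vocabulary word
# the remaining 10% of the time.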
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
input_files = []
for input_pattern in args.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
tf.logging.info("*** Reading from input files ***")
for input_file in input_files:
tf.logging.info(" %s", input_file)
rng = random.Random(args.random_seed)
instances = create_training_instances(
input_files, tokenizer, args.max_seq_length, args.dupe_factor,
args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
rng)
output_files = args.output_file.split(",")
tf.logging.info("*** Writing to output files ***")
for output_file in output_files:
tf.logging.info(" %s", output_file)
write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_files)
if __name__ == "__main__":
tf.app.run()
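To make the data flow concrete, here is a minimal sketch of the fixed-length padding that write_instance_to_example_files applies to every instance, followed by an illustrative invocation. All paths and token ids below are hypothetical, not from the source:

# Illustrative invocation (hypothetical paths):
#   python create_pretraining_data.py \
#     --input_file=./corpus.txt --output_file=./tf_examples.tfrecord \
#     --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt \
#     --max_seq_length=128 --max_predictions_per_seq=20 \
#     --masked_lm_prob=0.15 --dupe_factor=10
# corpus.txt format (per the comments in create_training_instances):
# one sentence per line, with blank lines separating documents.

input_ids = [101, 2023, 2003, 102]  # hypothetical ids for [CLS] this is [SEP]
input_mask = [1] * len(input_ids)   # 1 marks real tokens
segment_ids = [0] * len(input_ids)  # single-segment example
max_seq_length = 8                  # shortened from 128 for readability

while len(input_ids) < max_seq_length:
    input_ids.append(0)    # pad token id
    input_mask.append(0)   # padding positions are masked out
    segment_ids.append(0)

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
print(input_ids)  # [101, 2023, 2003, 102, 0, 0, 0, 0]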
......@@ -249,6 +249,9 @@ def main():
if args.init_checkpoint is not None:
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
......
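The hunk above adds multi-GPU data parallelism to the classifier's main(). A minimal sketch of the pattern, with a Linear module as a hypothetical stand-in for the BERT classifier built earlier in the function:

import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

model = nn.Linear(10, 2)  # stand-in for the model built above
model.to(device)
if n_gpu > 1:
    # Replicates the module on each visible GPU and splits the batch
    # dimension across the replicas during forward().
    model = nn.DataParallel(model)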
......@@ -502,7 +502,7 @@ class BertForQuestionAnswering(nn.Module):
def compute_loss(logits, positions):
max_position = positions.max().item()
one_hot = torch.FloatTensor(batch_size, max(max_position, seq_length) +1).zero_()
one_hot = one_hot.scatter(1, positions.cpu(), 1) # Second argument need to be LongTensor and not cuda.LongTensor
one_hot = one_hot.scatter_(1, positions.cpu(), 1) # Second argument need to be LongTensor and not cuda.LongTensor
one_hot = one_hot[:, :seq_length].to(input_ids.device)
log_probs = nn.functional.log_softmax(logits, dim = -1).view(batch_size, seq_length)
loss = -torch.mean(torch.sum(one_hot*log_probs, dim = -1))
......
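The fix above switches to the in-place scatter_. A minimal sketch of how the one-hot target is built from the gold span positions (shapes are illustrative):

import torch

batch_size, seq_length = 2, 5
positions = torch.tensor([[1], [3]])  # gold start (or end) index per example

one_hot = torch.zeros(batch_size, seq_length)
# scatter_ writes 1.0 in place along dim=1 at the given indices; the index
# tensor must be a (CPU) LongTensor, as the inline comment in the diff notes.
one_hot.scatter_(1, positions, 1.0)
print(one_hot)  # tensor([[0., 1., 0., 0., 0.], [0., 0., 0., 1., 0.]])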
......@@ -482,6 +482,9 @@ def main():
if args.init_checkpoint is not None:
model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = torch.nn.DataParallel(model)
no_decay = ['bias', 'gamma', 'beta']
optimizer_parameters = [
......@@ -518,7 +521,7 @@ def main():
model.train()
nb_tr_examples = 0
for epoch in trange(args.num_train_epochs, desc="Epoch"):
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Iteration"):
input_ids = input_ids.to(device)
input_mask = input_mask.float().to(device)
......
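The int() cast added above is needed because tqdm's trange, like range, requires an integer bound, while num_train_epochs is presumably parsed as a float on the command line. A one-liner sketch:

from tqdm import trange

num_train_epochs = 3.0  # hypothetical value, e.g. parsed with type=float
for epoch in trange(int(num_train_epochs), desc="Epoch"):
    pass  # training loop body elided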
This diff is collapsed.
......@@ -23,7 +23,7 @@ import logging
import json
import math
import os
import tokenization
import tokenization_pytorch
import six
import argparse
......@@ -62,9 +62,9 @@ class SquadExample(object):
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += "qas_id: %s" % (tokenization_pytorch.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
tokenization_pytorch.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
......@@ -153,7 +153,7 @@ def read_squad_examples(input_file, is_training):
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
tokenization_pytorch.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
......@@ -287,7 +287,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
[tokenization_pytorch.printable_text(x) for x in tokens]))
logger.info("token_to_orig_map: %s" % " ".join(
["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
logger.info("token_is_max_context: %s" % " ".join([
......@@ -303,7 +303,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info(
"answer: %s" % (tokenization.printable_text(answer_text)))
"answer: %s" % (tokenization_pytorch.printable_text(answer_text)))
features.append(
InputFeatures(
......@@ -579,7 +579,7 @@ def get_final_text(pred_text, orig_text, do_lower_case):
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tokenizer = tokenization_pytorch.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
......@@ -780,7 +780,7 @@ def main():
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
tokenizer = tokenization.FullTokenizer(
tokenizer = tokenization_pytorch.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
train_examples = None
......@@ -795,6 +795,9 @@ def main():
if args.init_checkpoint is not None:
model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)
if n_gpu > 1:
model = torch.nn.DataParallel(model)
optimizer = BERTAdam([{'params': [p for n, p in model.named_parameters() if n != 'bias'], 'l2': 0.01},
{'params': [p for n, p in model.named_parameters() if n == 'bias'], 'l2': 0.}
......
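The context above groups parameters so that biases receive no L2 penalty. A hedged sketch of the same per-group weight-decay pattern, using the substring-based no_decay grouping from the classifier hunk earlier and torch.optim.Adam as a stand-in for the repo's custom BERTAdam:

import torch
from torch import nn

model = nn.Linear(10, 2)  # hypothetical stand-in for the BERT model
no_decay = ['bias', 'gamma', 'beta']
grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = torch.optim.Adam(grouped_parameters, lr=3e-5)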
......@@ -21,7 +21,7 @@ from __future__ import print_function
import collections
import random
import tokenization
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -23,8 +23,8 @@ import collections
import json
import re
import modeling
import tokenization
from tensorflow_code import modeling
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -21,7 +21,7 @@ import json
import random
import re
import modeling
from tensorflow_code import modeling
import six
import tensorflow as tf
......
......@@ -16,7 +16,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import optimization
from tensorflow_code import optimization
import tensorflow as tf
......
......@@ -20,9 +20,9 @@ from __future__ import print_function
import csv
import os
import modeling
import optimization
import tokenization
from tensorflow_code import modeling
from tensorflow_code import optimization
from tensorflow_code import tokenization
import tensorflow as tf
flags = tf.flags
......
......@@ -19,8 +19,8 @@ from __future__ import division
from __future__ import print_function
import os
import modeling
import optimization
from tensorflow_code import modeling
from tensorflow_code import optimization
import tensorflow as tf
flags = tf.flags
......
......@@ -22,9 +22,9 @@ import collections
import json
import math
import os
import modeling
import optimization
import tokenization
from tensorflow_code import modeling
from tensorflow_code import optimization
from tensorflow_code import tokenization
import six
import tensorflow as tf
......
......@@ -19,7 +19,7 @@ from __future__ import print_function
import os
import tempfile
import tokenization
from tensorflow_code import tokenization
import tensorflow as tf
......