初始化仓库

e5ca7e62 · hepj987 · e5ca7e62 · e5ca7e62 · e5ca7e62 · e5ca7e62
Commit e5ca7e62 authored Jul 17, 2023 by hepj987
20 changed files
--- a/data/SquadDownloader.py
+++ b/data/SquadDownloader.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+
+class SquadDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path + '/squad'
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        if not os.path.exists(self.save_path + '/v1.1'):
+            os.makedirs(self.save_path + '/v1.1')
+
+        if not os.path.exists(self.save_path + '/v2.0'):
+            os.makedirs(self.save_path + '/v2.0')
+
+        self.download_urls = {
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
+            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
+            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
+        }
+
+    def download(self):
+        for item in self.download_urls:
+            url = item
+            file = self.download_urls[item]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + file):
+                print('** Download file already exists, skipping download')
+            else:
+                response = urllib.request.urlopen(url)
+                with open(self.save_path + '/' + file, "wb") as handle:
+                    handle.write(response.read())
+
+
--- a/data/TextSharding.py
+++ b/data/TextSharding.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from itertools import islice
+
+import multiprocessing
+import statistics
+
+class Sharding:
+    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
+        assert len(input_files) > 0, 'The input file list must contain at least one file.'
+        assert n_training_shards > 0, 'There must be at least one output shard.'
+        assert n_test_shards > 0, 'There must be at least one output shard.'
+
+        self.n_training_shards = n_training_shards
+        self.n_test_shards = n_test_shards
+        self.fraction_test_set = fraction_test_set
+
+        self.input_files = input_files
+
+        self.output_name_prefix = output_name_prefix
+        self.output_training_identifier = '_training'
+        self.output_test_identifier = '_test'
+        self.output_file_extension = '.txt'
+
+        self.articles = {}    # key: integer identifier, value: list of articles
+        self.sentences = {}    # key: integer identifier, value: list of sentences
+        self.output_training_files = {}    # key: filename, value: list of articles to go into file
+        self.output_test_files = {}  # key: filename, value: list of articles to go into file
+
+        self.init_output_files()
+
+
+    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
+    def load_articles(self):
+        print('Start: Loading Articles')
+
+        global_article_count = 0
+        for input_file in self.input_files:
+            print('input file:', input_file)
+            with open(input_file, mode='r', newline='\n') as f:
+                for i, line in enumerate(f):
+                    if line.strip():
+                        self.articles[global_article_count] = line.rstrip()
+                        global_article_count += 1
+
+        print('End: Loading Articles: There are', len(self.articles), 'articles.')
+
+
+    def segment_articles_into_sentences(self, segmenter):
+        print('Start: Sentence Segmentation')
+        if len(self.articles) is 0:
+            self.load_articles()
+
+        assert len(self.articles) is not 0, 'Please check that input files are present and contain data.'
+
+        # TODO: WIP: multiprocessing (create independent ranges and spawn processes)
+        use_multiprocessing = 'serial'
+
+        def chunks(data, size=len(self.articles)):
+            it = iter(data)
+            for i in range(0, len(data), size):
+                yield {k: data[k] for k in islice(it, size)}
+
+        if use_multiprocessing == 'manager':
+            manager = multiprocessing.Manager()
+            return_dict = manager.dict()
+            jobs = []
+            n_processes = 7    # in addition to the main process, total = n_proc+1
+
+            def work(articles, return_dict):
+                sentences = {}
+                for i, article in enumerate(articles):
+                    sentences[i] = segmenter.segment_string(articles[article])
+
+                    if i % 5000 == 0:
+                        print('Segmenting article', i)
+
+                return_dict.update(sentences)
+
+            for item in chunks(self.articles, len(self.articles)):
+                p = multiprocessing.Process(target=work, args=(item, return_dict))
+
+                # Busy wait
+                while len(jobs) >= n_processes:
+                    pass
+
+                jobs.append(p)
+                p.start()
+
+            for proc in jobs:
+                proc.join()
+
+        elif use_multiprocessing == 'queue':
+            work_queue = multiprocessing.Queue()
+            jobs = []
+
+            for item in chunks(self.articles, len(self.articles)):
+                pass
+
+        else:    # serial option
+            for i, article in enumerate(self.articles):
+                self.sentences[i] = segmenter.segment_string(self.articles[article])
+
+                if i % 5000 == 0:
+                    print('Segmenting article', i)
+
+        print('End: Sentence Segmentation')
+
+
+    def init_output_files(self):
+        print('Start: Init Output Files')
+        assert len(self.output_training_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
+        assert len(self.output_test_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
+
+        for i in range(self.n_training_shards):
+            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
+            self.output_training_files[name] = []
+
+        for i in range(self.n_test_shards):
+            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
+            self.output_test_files[name] = []
+
+        print('End: Init Output Files')
+
+
+    def get_sentences_per_shard(self, shard):
+        result = 0
+        for article_id in shard:
+            result += len(self.sentences[article_id])
+
+        return result
+
+
+    def distribute_articles_over_shards(self):
+        print('Start: Distribute Articles Over Shards')
+        assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'
+
+        # Create dictionary with - key: sentence count per article, value: article id number
+        sentence_counts = defaultdict(lambda: [])
+
+        max_sentences = 0
+        total_sentences = 0
+
+        for article_id in self.sentences:
+            current_length = len(self.sentences[article_id])
+            sentence_counts[current_length].append(article_id)
+            max_sentences = max(max_sentences, current_length)
+            total_sentences += current_length
+
+        n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
+        nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
+        nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards
+
+        consumed_article_set = set({})
+        unused_article_set = set(self.articles.keys())
+
+        # Make first pass and add one article worth of lines per file
+        for file in self.output_training_files:
+            current_article_id = sentence_counts[max_sentences][-1]
+            sentence_counts[max_sentences].pop(-1)
+            self.output_training_files[file].append(current_article_id)
+            consumed_article_set.add(current_article_id)
+            unused_article_set.remove(current_article_id)
+
+            # Maintain the max sentence count
+            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                max_sentences -= 1
+
+            if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
+                nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
+                print('Warning: A single article contains more than the nominal number of sentences per training shard.')
+
+        for file in self.output_test_files:
+            current_article_id = sentence_counts[max_sentences][-1]
+            sentence_counts[max_sentences].pop(-1)
+            self.output_test_files[file].append(current_article_id)
+            consumed_article_set.add(current_article_id)
+            unused_article_set.remove(current_article_id)
+
+            # Maintain the max sentence count
+            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                max_sentences -= 1
+
+            if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
+                nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
+                print('Warning: A single article contains more than the nominal number of sentences per test shard.')
+
+        training_counts = []
+        test_counts = []
+
+        for shard in self.output_training_files:
+            training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
+
+        for shard in self.output_test_files:
+            test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
+
+        training_median = statistics.median(training_counts)
+        test_median = statistics.median(test_counts)
+
+        # Make subsequent passes over files to find articles to add without going over limit
+        history_remaining = []
+        n_history_remaining = 4
+
+        while len(consumed_article_set) < len(self.articles):
+            for fidx, file in enumerate(self.output_training_files):
+                nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)
+
+                # Maintain the max sentence count
+                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                    max_sentences -= 1
+
+                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
+                    nominal_next_article_size -= 1
+
+                if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or training_counts[fidx] > training_median:
+                    continue    # skip adding to this file, will come back later if no file can accept unused articles
+
+                current_article_id = sentence_counts[nominal_next_article_size][-1]
+                sentence_counts[nominal_next_article_size].pop(-1)
+
+                self.output_training_files[file].append(current_article_id)
+                consumed_article_set.add(current_article_id)
+                unused_article_set.remove(current_article_id)
+
+            for fidx, file in enumerate(self.output_test_files):
+                nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)
+
+                # Maintain the max sentence count
+                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                    max_sentences -= 1
+
+                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
+                    nominal_next_article_size -= 1
+
+                if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or test_counts[fidx] > test_median:
+                    continue    # skip adding to this file, will come back later if no file can accept unused articles
+
+                current_article_id = sentence_counts[nominal_next_article_size][-1]
+                sentence_counts[nominal_next_article_size].pop(-1)
+
+                self.output_test_files[file].append(current_article_id)
+                consumed_article_set.add(current_article_id)
+                unused_article_set.remove(current_article_id)
+
+            # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
+            if len(history_remaining) == n_history_remaining:
+                history_remaining.pop(0)
+            history_remaining.append(len(unused_article_set))
+
+            history_same = True
+            for i in range(1, len(history_remaining)):
+                history_same = history_same and (history_remaining[i-1] == history_remaining[i])
+
+            if history_same:
+                nominal_sentences_per_training_shard += 1
+                # nominal_sentences_per_test_shard += 1
+
+            training_counts = []
+            test_counts = []
+            for shard in self.output_training_files:
+                training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
+
+            for shard in self.output_test_files:
+                test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
+
+            training_median = statistics.median(training_counts)
+            test_median = statistics.median(test_counts)
+
+            print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')
+
+
+        if len(unused_article_set) != 0:
+            print('Warning: Some articles did not make it into output files.')
+
+
+        for shard in self.output_training_files:
+            print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))
+
+        for shard in self.output_test_files:
+            print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))
+
+        print('End: Distribute Articles Over Shards')
+
+
+    def write_shards_to_disk(self):
+        print('Start: Write Shards to Disk')
+        for shard in self.output_training_files:
+            self.write_single_shard(shard, self.output_training_files[shard])
+
+        for shard in self.output_test_files:
+            self.write_single_shard(shard, self.output_test_files[shard])
+
+        print('End: Write Shards to Disk')
+
+
+    def write_single_shard(self, shard_name, shard):
+        with open(shard_name, mode='w', newline='\n') as f:
+            for article_id in shard:
+                for line in self.sentences[article_id]:
+                    f.write(line + '\n')
+
+                f.write('\n')  # Line break between articles
+
+
+import nltk
+
+nltk.download('punkt')
+
+class NLTKSegmenter:
+    def __init(self):
+        pass
+
+    def segment_string(self, article):
+        return nltk.tokenize.sent_tokenize(article)
+
--- a/data/WikiDownloader.py
+++ b/data/WikiDownloader.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import subprocess
+import sys
+
+class WikiDownloader:
+    def __init__(self, language, save_path):
+        self.save_path = save_path + '/wikicorpus_' + language
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.language = language
+        self.download_urls = {
+            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+        }
+
+        self.output_files = {
+            'en' : 'wikicorpus_en.xml.bz2',
+            'zh' : 'wikicorpus_zh.xml.bz2'
+        }
+
+
+    def download(self):
+        if self.language in self.download_urls:
+            url = self.download_urls[self.language]
+            filename = self.output_files[self.language]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + filename):
+                print('** Download file already exists, skipping download')
+            else:
+                response = urllib.request.urlopen(url)
+                with open(self.save_path + '/' + filename, "wb") as handle:
+                    handle.write(response.read())
+
+            # Always unzipping since this is relatively fast and will overwrite
+            print('Unzipping:', self.output_files[self.language])
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
+
+        else:
+            assert False, 'WikiDownloader not implemented for this language yet.'
\ No newline at end of file
--- a/data/WikicorpusTextFormatting.py
+++ b/data/WikicorpusTextFormatting.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)
\ No newline at end of file
--- a/data/__init__.py
+++ b/data/__init__.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/data/bertPrep.py
+++ b/data/bertPrep.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import BookscorpusTextFormatting
+import Downloader
+import TextSharding
+import WikicorpusTextFormatting
+
+import argparse
+import itertools
+import multiprocessing
+import os
+import pprint
+import subprocess
+
+
+def main(args):
+    working_dir = os.environ['BERT_PREP_WORKING_DIR']
+
+    print('Working Directory:', working_dir)
+    print('Action:', args.action)
+    print('Dataset Name:', args.dataset)
+
+    if args.input_files:
+        args.input_files = args.input_files.split(',')
+
+    hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+                                  + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+                                  + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor)
+
+    directory_structure = {
+        'download' : working_dir + '/download',    # Downloaded and decompressed
+        'extracted' : working_dir +'/extracted',    # Extracted from whatever the initial format is (e.g., wikiextractor)
+        'formatted' : working_dir + '/formatted_one_article_per_line',    # This is the level where all sources should look the same
+        'sharded' : working_dir + '/sharded_' + "training_shards_" + str(args.n_training_shards) + "_test_shards_" + str(args.n_test_shards) + "_fraction_" + str(args.fraction_test_set),
+        'tfrecord' : working_dir + '/tfrecord'+ hdf5_tfrecord_folder_prefix,
+        'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix
+    }
+
+    print('\nDirectory Structure:')
+    pp = pprint.PrettyPrinter(indent=2)
+    pp.pprint(directory_structure)
+    print('')
+
+    if args.action == 'download':
+        if not os.path.exists(directory_structure['download']):
+            os.makedirs(directory_structure['download'])
+
+        downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
+        downloader.download()
+
+    elif args.action == 'text_formatting':
+        assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'Cannot perform text_formatting on pretrained weights'
+
+        if not os.path.exists(directory_structure['extracted']):
+            os.makedirs(directory_structure['extracted'])
+
+        if not os.path.exists(directory_structure['formatted']):
+            os.makedirs(directory_structure['formatted'])
+
+        if args.dataset == 'bookscorpus':
+            books_path = directory_structure['download'] + '/bookscorpus'
+            #books_path = directory_structure['download']
+            output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
+            books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
+            books_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_en':
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+                #wikiextractor_process.communicate()
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_zh':
+            assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+                #wikiextractor_process.communicate()
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+            assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
+
+    elif args.action == 'sharding':
+        # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
+        if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset:
+            if args.input_files is None:
+                if args.dataset == 'bookscorpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
+                elif args.dataset == 'wikicorpus_en':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+                elif args.dataset == 'wikicorpus_zh':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
+                elif args.dataset == 'books_wiki_en_corpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+
+            output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
+
+            if not os.path.exists(directory_structure['sharded']):
+                os.makedirs(directory_structure['sharded'])
+
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
+
+            # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
+            # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
+            # Different languages (e.g., Chinese simplified/traditional) may require translation and
+            # other packages to be called from here -- just add a conditional branch for those extra steps
+            segmenter = TextSharding.NLTKSegmenter()
+            sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
+
+            sharding.load_articles()
+            sharding.segment_articles_into_sentences(segmenter)
+            sharding.distribute_articles_over_shards()
+            sharding.write_shards_to_disk()
+
+        else:
+            assert False, 'Unsupported dataset for sharding'
+
+    elif args.action == 'create_tfrecord_files':
+        assert False, 'TFrecord creation not supported in this PyTorch model example release.' \
+                      ''
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
+
+        def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
+            bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+            return last_process
+
+        output_file_prefix = args.dataset
+
+        for i in range(args.n_training_shards):
+            last_process =create_record_worker(output_file_prefix + '_training', i)
+
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            last_process = create_record_worker(output_file_prefix + '_test', i)
+
+        last_process.wait()
+
+
+    elif args.action == 'create_hdf5_files':
+        last_process = None
+
+        if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
+            os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
+
+        def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
+            bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+            return last_process
+
+        output_file_prefix = args.dataset
+
+        for i in range(args.n_training_shards):
+            last_process = create_record_worker(output_file_prefix + '_training', i)
+
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            last_process = create_record_worker(output_file_prefix + '_test', i)
+
+        last_process.wait()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare every option and hand the parsed
+    # namespace to main().
+    parser = argparse.ArgumentParser(
+        description='Preprocessing Application for Everything BERT-related'
+    )
+
+    # `choices` are lists rather than sets so that the order shown by --help
+    # and by argparse error messages is the same on every run.
+    parser.add_argument(
+        '--action',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
+        choices=[
+            'download',               # Download and verify md5/sha sums
+            'text_formatting',        # Convert into a file that contains one article/book per line
+            'sharding',               # Convert previous formatted text into shards containing one sentence per line
+            'create_tfrecord_files',  # Turn each shard into a TFrecord with masking and next sentence prediction info
+            'create_hdf5_files'       # Turn each shard into a HDF5 file with masking and next sentence prediction info
+        ]
+    )
+
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        help='Specify the dataset to perform --action on',
+        choices=[
+            'bookscorpus',
+            'wikicorpus_en',
+            'wikicorpus_zh',
+            'books_wiki_en_corpus',
+            'google_pretrained_weights',
+            'nvidia_pretrained_weights',
+            'mrpc',
+            'sst-2',
+            'squad',
+            'all'
+        ]
+    )
+
+    parser.add_argument(
+        '--input_files',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+
+    parser.add_argument(
+        '--n_training_shards',
+        type=int,
+        help='Specify the number of training shards to generate',
+        default=256
+    )
+
+    parser.add_argument(
+        '--n_test_shards',
+        type=int,
+        help='Specify the number of test shards to generate',
+        default=256
+    )
+
+    parser.add_argument(
+        '--fraction_test_set',
+        type=float,
+        help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
+        default=0.1
+    )
+
+    parser.add_argument(
+        '--segmentation_method',
+        type=str,
+        help='Specify your choice of sentence segmentation',
+        choices=[
+            'nltk'
+        ],
+        default='nltk'
+    )
+
+    parser.add_argument(
+        '--n_processes',
+        type=int,
+        help='Specify the max number of processes to allow at one time',
+        default=4
+    )
+
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        help='Specify the base seed to use for any random number generation',
+        default=12345
+    )
+
+    parser.add_argument(
+        '--dupe_factor',
+        type=int,
+        help='Specify the duplication factor',
+        default=5
+    )
+
+    parser.add_argument(
+        '--masked_lm_prob',
+        type=float,
+        help='Specify the probability for masked lm',
+        default=0.15
+    )
+
+    parser.add_argument(
+        '--max_seq_length',
+        type=int,
+        help='Specify the maximum sequence length',
+        default=512
+    )
+
+    parser.add_argument(
+        '--max_predictions_per_seq',
+        type=int,
+        help='Specify the maximum number of masked words per sequence',
+        default=20
+    )
+
+    # Integer flag rather than store_true: callers pass "--do_lower_case 1".
+    parser.add_argument(
+        '--do_lower_case',
+        type=int,
+        help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
+        default=1
+    )
+
+    parser.add_argument(
+        '--vocab_file',
+        type=str,
+        help='Specify absolute path to vocab file to use'
+    )
+
+    parser.add_argument(
+        '--skip_wikiextractor',
+        type=int,
+        help='Specify whether to skip wikiextractor step 0=False, 1=True',
+        default=0
+    )
+
+    # NOTE(review): this help text is identical to the one for --action and
+    # looks copy-pasted; confirm what the option is meant to do and reword.
+    parser.add_argument(
+        '--interactive_json_config_generator',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
+    )
+
+    args = parser.parse_args()
+    main(args)
--- a/data/create_datasets_from_start.sh
+++ b/data/create_datasets_from_start.sh
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# First positional argument selects the corpus: "wiki_books" adds BooksCorpus
+# to English Wikipedia; the default "wiki_only" uses Wikipedia alone.
+to_download=${1:-"wiki_only"}
+
+# Paths used repeatedly below. Expansions are quoted so that a working
+# directory containing spaces or glob characters is passed through intact.
+BERT_PREP="/workspace/bert/data/bertPrep.py"
+VOCAB_FILE="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+
+# Download
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 "$BERT_PREP" --action download --dataset bookscorpus
+fi
+
+python3 "$BERT_PREP" --action download --dataset wikicorpus_en
+python3 "$BERT_PREP" --action download --dataset google_pretrained_weights  # Includes vocab
+python3 "$BERT_PREP" --action download --dataset squad
+python3 "$BERT_PREP" --action download --dataset mrpc
+python3 "$BERT_PREP" --action download --dataset sst-2
+
+# Properly format the text files
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 "$BERT_PREP" --action text_formatting --dataset bookscorpus
+fi
+python3 "$BERT_PREP" --action text_formatting --dataset wikicorpus_en
+
+# Dataset name used by the sharding and HDF5 steps
+if [ "$to_download" = "wiki_books" ] ; then
+    DATASET="books_wiki_en_corpus"
+else
+    DATASET="wikicorpus_en"
+fi
+
+# Shard the text files
+python3 "$BERT_PREP" --action sharding --dataset "$DATASET"
+
+# Create HDF5 files Phase 1
+python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 128 \
+--max_predictions_per_seq 20 --vocab_file "$VOCAB_FILE" --do_lower_case 1
+
+# Create HDF5 files Phase 2
+python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 512 \
+--max_predictions_per_seq 80 --vocab_file "$VOCAB_FILE" --do_lower_case 1
--- a/data/squad/squad_download.sh
+++ b/data/squad/squad_download.sh
+#!/usr/bin/env bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Downloading dataset for squad..."
+
+# Download SQuAD
+
+v1="v1.1"
+# -p: do not complain when the directory is left over from an earlier run.
+mkdir -p "$v1"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O "$v1/train-v1.1.json"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O "$v1/dev-v1.1.json"
+wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O "$v1/evaluate-v1.1.py"
+
+# Expected digests are spelled the way md5sum prints them when it reads
+# standard input ("<hash>  -"), so they can be compared as plain strings.
+EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945  -'
+EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552  -'
+EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9  -'
+CALC_TRAIN_v1=$(md5sum < "$v1/train-v1.1.json")
+CALC_DEV_v1=$(md5sum < "$v1/dev-v1.1.json")
+CALC_EVAL_v1=$(md5sum < "$v1/evaluate-v1.1.py")
+
+v2="v2.0"
+mkdir -p "$v2"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O "$v2/train-v2.0.json"
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O "$v2/dev-v2.0.json"
+wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O "$v2/evaluate-v2.0.py"
+
+EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740  -'
+EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8  -'
+EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627  -'
+
+CALC_TRAIN_v2=$(md5sum < "$v2/train-v2.0.json")
+CALC_DEV_v2=$(md5sum < "$v2/dev-v2.0.json")
+CALC_EVAL_v2=$(md5sum < "$v2/evaluate-v2.0.py")
+
+echo "Squad data download done!"
+
+echo "Verifying Dataset...."
+
+# check_md5 FILE_LABEL EXPECTED ACTUAL
+# Prints a warning when the calculated digest differs from the expected one.
+check_md5() {
+    if [ "$2" != "$3" ]; then
+        echo "$1 is corrupted! md5sum doesn't match"
+    fi
+}
+
+check_md5 "train-v1.1.json" "$EXP_TRAIN_v1" "$CALC_TRAIN_v1"
+check_md5 "dev-v1.1.json" "$EXP_DEV_v1" "$CALC_DEV_v1"
+check_md5 "evaluate-v1.1.py" "$EXP_EVAL_v1" "$CALC_EVAL_v1"
+
+check_md5 "train-v2.0.json" "$EXP_TRAIN_v2" "$CALC_TRAIN_v2"
+check_md5 "dev-v2.0.json" "$EXP_DEV_v2" "$CALC_DEV_v2"
+check_md5 "evaluate-v2.0.py" "$EXP_EVAL_v2" "$CALC_EVAL_v2"
+
+echo "Complete!"
--- a/evaluate-v1.1.py
+++ b/evaluate-v1.1.py
+""" Official evaluation script for v1.1 of the SQuAD dataset. """
+from __future__ import print_function
+from collections import Counter
+import string
+import re
+import argparse
+import json
+import sys
+
+
+def normalize_answer(s):
+    """Return `s` lower-cased, without punctuation or articles, single-spaced.
+
+    The steps run in a fixed order: lower-case, drop every character found in
+    `string.punctuation`, replace the whole words "a", "an" and "the" with a
+    space, then collapse all runs of whitespace.
+    """
+    punctuation = set(string.punctuation)
+    lowered = s.lower()
+    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
+    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
+    # split() + join() trims the ends and squeezes inner whitespace.
+    return ' '.join(without_articles.split())
+
+
+def f1_score(prediction, ground_truth):
+    """Return the token-level F1 between a prediction and one reference answer.
+
+    Both strings are normalized and split on whitespace. Tokens are matched
+    as multisets. Returns 0 when no token is shared.
+    """
+    predicted_tokens = normalize_answer(prediction).split()
+    reference_tokens = normalize_answer(ground_truth).split()
+    # Counter intersection keeps, per token, the smaller of the two counts.
+    overlap = sum((Counter(predicted_tokens) & Counter(reference_tokens)).values())
+    if overlap == 0:
+        return 0
+    precision = 1.0 * overlap / len(predicted_tokens)
+    recall = 1.0 * overlap / len(reference_tokens)
+    return (2 * precision * recall) / (precision + recall)
+
+
+def exact_match_score(prediction, ground_truth):
+    """Return True when both answers are equal after normalization."""
+    normalized_prediction = normalize_answer(prediction)
+    normalized_reference = normalize_answer(ground_truth)
+    return normalized_prediction == normalized_reference
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    """Return the best `metric_fn(prediction, truth)` over all reference answers.
+
+    Raises ValueError (from `max`) when `ground_truths` is empty.
+    """
+    return max([metric_fn(prediction, truth) for truth in ground_truths])
+
+
+def evaluate(dataset, predictions):
+    """Score `predictions` against `dataset` and return percentage metrics.
+
+    Args:
+        dataset: iterable of articles; each article has a 'paragraphs' list,
+            each paragraph a 'qas' list, and each qa an 'id' plus an
+            'answers' list of dicts with a 'text' entry.
+        predictions: mapping from question id to the predicted answer string.
+
+    Returns:
+        A dict with 'exact_match' and 'f1', both scaled to 0-100 and averaged
+        over every question in the dataset. Questions without a prediction
+        count as 0 and are reported on stderr. A dataset without any question
+        yields 0.0 for both metrics.
+    """
+    f1 = exact_match = total = 0
+    for article in dataset:
+        for paragraph in article['paragraphs']:
+            for qa in paragraph['qas']:
+                total += 1
+                if qa['id'] not in predictions:
+                    message = 'Unanswered question ' + qa['id'] + \
+                              ' will receive score 0.'
+                    print(message, file=sys.stderr)
+                    continue
+                ground_truths = [answer['text'] for answer in qa['answers']]
+                prediction = predictions[qa['id']]
+                exact_match += metric_max_over_ground_truths(
+                    exact_match_score, prediction, ground_truths)
+                f1 += metric_max_over_ground_truths(
+                    f1_score, prediction, ground_truths)
+
+    if total == 0:
+        # Nothing to score: report zeros instead of dividing by zero.
+        return {'exact_match': 0.0, 'f1': 0.0}
+
+    exact_match = 100.0 * exact_match / total
+    f1 = 100.0 * f1 / total
+
+    return {'exact_match': exact_match, 'f1': f1}
+
+
+if __name__ == '__main__':
+    # Usage: evaluate-v1.1.py DATASET_FILE PREDICTION_FILE
+    # Prints the metrics as a single JSON object on stdout.
+    expected_version = '1.1'
+    parser = argparse.ArgumentParser(
+        description='Evaluation for SQuAD ' + expected_version)
+    parser.add_argument('dataset_file', help='Dataset file')
+    parser.add_argument('prediction_file', help='Prediction File')
+    args = parser.parse_args()
+
+    with open(args.dataset_file) as dataset_file:
+        dataset_json = json.load(dataset_file)
+    # A version mismatch is only reported; evaluation still proceeds.
+    if dataset_json['version'] != expected_version:
+        print('Evaluation expects v-' + expected_version +
+              ', but got dataset with v-' + dataset_json['version'],
+              file=sys.stderr)
+    dataset = dataset_json['data']
+
+    with open(args.prediction_file) as prediction_file:
+        predictions = json.load(prediction_file)
+
+    print(json.dumps(evaluate(dataset, predictions)))
--- a/extract_features.py
+++ b/extract_features.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract pre-computed feature vectors from a PyTorch BERT model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import collections
+import logging
+import json
+import re
+
+import torch
+from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+from tokenization import BertTokenizer
+from modeling import BertModel
+
+# Root logger: timestamped INFO-level messages; `logger` is this module's own.
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+    datefmt='%m/%d/%Y %H:%M:%S',
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+class InputExample(object):
+    """One raw input: an id, a first text and an optional second text."""
+
+    def __init__(self, unique_id, text_a, text_b):
+        self.unique_id, self.text_a, self.text_b = unique_id, text_a, text_b
+
+
+class InputFeatures(object):
+    """Model-ready features for one example.
+
+    Stores the example id, its token strings and the three parallel lists
+    (`input_ids`, `input_mask`, `input_type_ids`) exactly as given.
+    """
+
+    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+        self.unique_id, self.tokens = unique_id, tokens
+        self.input_ids, self.input_mask = input_ids, input_mask
+        self.input_type_ids = input_type_ids
+
+
+def convert_examples_to_features(examples, seq_length, tokenizer):
+    """Turn `InputExample`s into fixed-length `InputFeatures`.
+
+    Each example is tokenized, truncated so that it fits into `seq_length`
+    together with its special tokens, mapped to vocabulary ids and zero-padded
+    up to `seq_length`. The first five examples are logged at INFO level.
+
+    Args:
+        examples: iterable of objects exposing `unique_id`, `text_a`, `text_b`.
+        seq_length: length of every produced id / mask / type-id list.
+        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
+
+    Returns:
+        A list with one `InputFeatures` per example, in input order.
+    """
+    features = []
+    for ex_index, example in enumerate(examples):
+        tokens_a = tokenizer.tokenize(example.text_a)
+        tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None
+
+        if tokens_b:
+            # [CLS] and two [SEP] take three positions; the pair is shortened
+            # in place until it fits into the rest.
+            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
+        else:
+            # [CLS] and one [SEP] take two positions.
+            room_for_a = seq_length - 2
+            if len(tokens_a) > room_for_a:
+                tokens_a = tokens_a[0:room_for_a]
+
+        # Layout used by BERT:
+        #   pair:    [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #   types:     0   0  0    0    0     0      0   0    1  1  1   1  1   1
+        #   single:  [CLS] the dog is hairy . [SEP]
+        #   types:     0   0   0   0  0     0   0
+        # Type id 0 marks the first sequence (including [CLS] and its [SEP]),
+        # type id 1 the second sequence and its closing [SEP].
+        tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
+        input_type_ids = [0] * len(tokens)
+
+        if tokens_b:
+            tokens.extend(tokens_b)
+            tokens.append("[SEP]")
+            input_type_ids.extend([1] * (len(tokens_b) + 1))
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # 1 marks a real token, 0 marks padding.
+        input_mask = [1] * len(input_ids)
+
+        # Zero-pad all three lists by the amount `input_ids` is short.
+        missing = seq_length - len(input_ids)
+        if missing > 0:
+            padding = [0] * missing
+            input_ids.extend(padding)
+            input_mask.extend(padding)
+            input_type_ids.extend(padding)
+
+        assert len(input_ids) == seq_length
+        assert len(input_mask) == seq_length
+        assert len(input_type_ids) == seq_length
+
+        if ex_index < 5:
+            logger.info("*** Example ***")
+            logger.info("unique_id: %s" % (example.unique_id))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
+                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+        features.append(
+            InputFeatures(
+                unique_id=example.unique_id,
+                tokens=tokens,
+                input_ids=input_ids,
+                input_mask=input_mask,
+                input_type_ids=input_type_ids))
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Shorten two token lists in place until their combined length fits.
+
+    One token at a time is popped from the end of the longer list; on a tie
+    the token comes from `tokens_b`. Trimming the longer list keeps more of a
+    short sequence, whose tokens are likely to carry more information each.
+    """
+    while len(tokens_a) + len(tokens_b) > max_length:
+        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        longer.pop()
+
+
+def read_examples(input_file):
+    """Read one `InputExample` per line of a UTF-8 text file.
+
+    A stripped line of the form "A ||| B" becomes a pair (text_a=A, text_b=B);
+    any other line becomes a single text with text_b=None. Ids are the
+    zero-based line numbers.
+    """
+    pair_pattern = re.compile(r"^(.*) \|\|\| (.*)$")
+    examples = []
+    with open(input_file, "r", encoding='utf-8') as reader:
+        for unique_id, raw_line in enumerate(reader):
+            line = raw_line.strip()
+            match = pair_pattern.match(line)
+            if match is None:
+                text_a, text_b = line, None
+            else:
+                text_a, text_b = match.group(1), match.group(2)
+            examples.append(
+                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
+    return examples
+
+
+def main():
+    """Extract hidden-layer activations for every token of every input line.
+
+    Parses the command line, tokenizes the examples in `--input_file`, runs
+    them through the BERT model named by `--bert_model`, and writes one JSON
+    object per example to `--output_file`. Each object holds, for every
+    token, the activations of the layers selected with `--layers`, rounded to
+    six decimals.
+    """
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--input_file", default=None, type=str, required=True)
+    parser.add_argument("--output_file", default=None, type=str, required=True)
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
+
+    ## Other parameters
+    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
+                            "than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help = "local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+
+    args = parser.parse_args()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
+
+    layer_indexes = [int(x) for x in args.layers.split(",")]
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+
+    examples = read_examples(args.input_file)
+
+    features = convert_examples_to_features(
+        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
+
+    model = BertModel.from_pretrained(args.bert_model)
+    model.to(device)
+
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    # Position of each example in `features`, carried through the loader so
+    # that every output row can be matched back to its tokens.
+    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+
+    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
+    if args.local_rank == -1:
+        eval_sampler = SequentialSampler(eval_data)
+    else:
+        eval_sampler = DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+
+    model.eval()
+    # Pure inference: with autograd disabled no graph is kept for any batch.
+    with torch.no_grad(), open(args.output_file, "w", encoding='utf-8') as writer:
+        for input_ids, input_mask, example_indices in eval_dataloader:
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+
+            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
+
+            # Copy each requested layer to host memory once per batch rather
+            # than once per token and layer.
+            layer_outputs = [
+                all_encoder_layers[layer_index].detach().cpu().numpy()
+                for layer_index in layer_indexes
+            ]
+
+            for b, example_index in enumerate(example_indices):
+                feature = features[example_index.item()]
+                unique_id = int(feature.unique_id)
+                output_json = collections.OrderedDict()
+                # Key spelling kept unchanged so the output format stays the same.
+                output_json["linex_index"] = unique_id
+                all_out_features = []
+                for i, token in enumerate(feature.tokens):
+                    all_layers = []
+                    for layer_index, layer_output in zip(layer_indexes, layer_outputs):
+                        layers = collections.OrderedDict()
+                        layers["index"] = layer_index
+                        layers["values"] = [
+                            round(x.item(), 6) for x in layer_output[b][i]
+                        ]
+                        all_layers.append(layers)
+                    out_features = collections.OrderedDict()
+                    out_features["token"] = token
+                    out_features["layers"] = all_layers
+                    all_out_features.append(out_features)
+                output_json["features"] = all_out_features
+                writer.write(json.dumps(output_json) + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/file_utils.py
+++ b/file_utils.py
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
+Copyright by the AllenNLP authors.
+"""
+
+from __future__ import (absolute_import, division, print_function, unicode_literals)
+
+import json
+import logging
+import os
+import shutil
+import tempfile
+from functools import wraps
+from hashlib import sha256
+import sys
+from io import open
+
+import boto3
+import requests
+from botocore.exceptions import ClientError
+from tqdm import tqdm
+
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+# Default cache directory: $PYTORCH_PRETRAINED_BERT_CACHE when set, otherwise
+# ~/.pytorch_pretrained_bert. A pathlib.Path is used when it can be built.
+try:
+    from pathlib import Path
+    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                                   Path.home() / '.pytorch_pretrained_bert'))
+except (AttributeError, ImportError):
+    # ImportError: interpreter without pathlib. AttributeError: a Path class
+    # lacking home(). Either way fall back to a plain string path.
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+def url_to_filename(url, etag=None):
+    """
+    Map `url` to a repeatable cache filename: the SHA-256 hex digest of the
+    UTF-8 encoded url. When `etag` is truthy, a period and the digest of the
+    etag are appended.
+    """
+    digests = [sha256(url.encode('utf-8')).hexdigest()]
+    if etag:
+        digests.append(sha256(etag.encode('utf-8')).hexdigest())
+    return '.'.join(digests)
+
+
+def filename_to_url(filename, cache_dir=None):
+    """
+    Look up the url and etag recorded for the cached file `filename`.
+
+    The metadata is read from `<cache_dir>/<filename>.json`; the etag may be
+    ``None``. Raises ``EnvironmentError`` when the cached file or its
+    metadata file is missing.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    cache_path = os.path.join(cache_dir, filename)
+    meta_path = cache_path + '.json'
+    # The cached file is checked first, then its metadata.
+    for required_path in (cache_path, meta_path):
+        if not os.path.exists(required_path):
+            raise EnvironmentError("file {} not found".format(required_path))
+
+    with open(meta_path, encoding="utf-8") as meta_file:
+        metadata = json.load(meta_file)
+
+    return metadata['url'], metadata['etag']
+
+
+def cached_path(url_or_filename, cache_dir=None):
+    """
+    Resolve `url_or_filename` to a local file path.
+
+    An http, https or s3 URL is fetched through the cache and the path of the
+    cached copy is returned. An existing local path is returned unchanged.
+    Raises ``EnvironmentError`` for a missing local file and ``ValueError``
+    for anything that is neither a supported URL nor a local path.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    running_py3 = sys.version_info[0] == 3
+    if running_py3 and isinstance(url_or_filename, Path):
+        url_or_filename = str(url_or_filename)
+    if running_py3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    scheme = urlparse(url_or_filename).scheme
+
+    # Remote resource: served from the cache, downloading if necessary.
+    if scheme in ('http', 'https', 's3'):
+        return get_from_cache(url_or_filename, cache_dir)
+
+    # Local file that exists.
+    if os.path.exists(url_or_filename):
+        return url_or_filename
+
+    # No scheme at all: a local path that does not exist.
+    if scheme == '':
+        raise EnvironmentError("file {} not found".format(url_or_filename))
+
+    # Some other scheme that is not supported.
+    raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
+
+
+def split_s3_path(url):
+    """
+    Split a full s3 path into ``(bucket_name, s3_path)``.
+
+    Raise ``ValueError`` when the url has no bucket (netloc) or no path.
+    """
+    parts = urlparse(url)
+    bucket_name, s3_path = parts.netloc, parts.path
+    if not (bucket_name and s3_path):
+        raise ValueError("bad s3 path {}".format(url))
+    # Only the single leading '/' is dropped; any further slashes are kept.
+    if s3_path.startswith("/"):
+        s3_path = s3_path[1:]
+    return bucket_name, s3_path
+
+
+def s3_request(func):
+    """
+    Wrapper function for s3 requests in order to create more helpful error
+    messages.
+
+    The wrapped function must take the url as its first positional argument.
+    A ``ClientError`` whose error code is 404 is re-raised as an
+    ``EnvironmentError`` naming the url; every other ``ClientError``
+    propagates unchanged.
+    """
+
+    @wraps(func)
+    def wrapper(url, *args, **kwargs):
+        try:
+            return func(url, *args, **kwargs)
+        except ClientError as exc:
+            # Error codes are not always numeric (e.g. "NoSuchKey"), so compare
+            # them as text: int() on such a code would raise ValueError and
+            # hide the original ClientError.
+            error_code = str(exc.response["Error"]["Code"]).strip()
+            if error_code == "404":
+                raise EnvironmentError("file {} not found".format(url))
+            raise
+
+    return wrapper
+
+
+@s3_request
+def s3_etag(url):
+    """Return the ETag of the S3 object that `url` points to."""
+    s3 = boto3.resource("s3")
+    bucket, key = split_s3_path(url)
+    return s3.Object(bucket, key).e_tag
+
+
+@s3_request
+def s3_get(url, temp_file):
+    """Download the S3 object at `url` into the open file object `temp_file`."""
+    s3 = boto3.resource("s3")
+    bucket, key = split_s3_path(url)
+    s3.Bucket(bucket).download_fileobj(key, temp_file)
+
+
+def http_get(url, temp_file):
+    """
+    Stream the body of `url` into the writable binary file object `temp_file`.
+
+    The body is written in 1024-byte chunks while a tqdm progress bar is
+    advanced; the bar's total comes from the ``Content-Length`` header when
+    the server sends one. An HTTP error status raises the ``requests``
+    exception from ``raise_for_status`` before anything is written.
+    """
+    req = requests.get(url, stream=True)
+    try:
+        # Fail loudly on 4xx/5xx instead of writing an error page into the
+        # file that the caller is about to store as the cached payload.
+        req.raise_for_status()
+        content_length = req.headers.get('Content-Length')
+        total = int(content_length) if content_length is not None else None
+        progress = tqdm(unit="B", total=total)
+        try:
+            for chunk in req.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    progress.update(len(chunk))
+                    temp_file.write(chunk)
+        finally:
+            # Close the bar even when the download is interrupted.
+            progress.close()
+    finally:
+        # With stream=True the connection stays checked out until the body is
+        # fully read or the response is closed.
+        req.close()
+
+
+def get_from_cache(url, cache_dir=None):
+    """
+    Return the local cache path for `url`, downloading the file first when it
+    is not cached yet.
+
+    The cache filename is derived from the url and the resource's current
+    ETag, so the ETag is looked up on every call (via S3 for ``s3://`` urls,
+    via an HTTP HEAD request otherwise). A HEAD response other than 200
+    raises ``IOError``. A JSON file holding the url and etag is written next
+    to every newly cached file.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    if not os.path.exists(cache_dir):
+        os.makedirs(cache_dir)
+
+    is_s3 = url.startswith("s3://")
+
+    # The ETag (if any) becomes part of the cache filename.
+    if is_s3:
+        etag = s3_etag(url)
+    else:
+        head = requests.head(url, allow_redirects=True)
+        if head.status_code != 200:
+            raise IOError("HEAD request failed for url {} with status code {}"
+                          .format(url, head.status_code))
+        etag = head.headers.get("ETag")
+
+    cache_path = os.path.join(cache_dir, url_to_filename(url, etag))
+
+    # Cache hit: nothing to download.
+    if os.path.exists(cache_path):
+        return cache_path
+
+    # Download into a temporary file first and copy it over once complete, so
+    # an interrupted download does not leave a corrupt cache entry behind.
+    with tempfile.NamedTemporaryFile() as temp_file:
+        logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+
+        fetch = s3_get if is_s3 else http_get
+        fetch(url, temp_file)
+
+        # The file is copied while still open: flush pending writes to avoid
+        # truncation, then rewind because copyfileobj() reads from the
+        # current position.
+        temp_file.flush()
+        temp_file.seek(0)
+
+        logger.info("copying %s to cache at %s", temp_file.name, cache_path)
+        with open(cache_path, 'wb') as cache_file:
+            shutil.copyfileobj(temp_file, cache_file)
+
+        logger.info("creating metadata file for %s", cache_path)
+        with open(cache_path + '.json', 'w', encoding="utf-8") as meta_file:
+            json.dump({'url': url, 'etag': etag}, meta_file)
+
+        logger.info("removing temp file %s", temp_file.name)
+
+    return cache_path
+
+
+def read_set_from_file(filename):
+    '''
+    Read a UTF-8 text file holding one item per line and return the distinct
+    lines as a set, each with its trailing whitespace removed.
+    '''
+    with open(filename, 'r', encoding='utf-8') as file_:
+        return {line.rstrip() for line in file_}
+
+
+def get_file_extension(path, dot=True, lower=True):
+    """
+    Return the extension of `path` as reported by ``os.path.splitext``.
+
+    With ``dot=False`` the leading period is stripped; with ``lower=True``
+    the extension is lower-cased. A path without an extension gives ``''``.
+    """
+    _, ext = os.path.splitext(path)
+    if not dot:
+        ext = ext[1:]
+    if lower:
+        return ext.lower()
+    return ext
--- a/images/loss_curves.png
+++ b/images/loss_curves.png
--- a/images/model.png
+++ b/images/model.png
--- a/images/nvlamb.png
+++ b/images/nvlamb.png
--- a/inference.py
+++ b/inference.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+""" BERT inference script. Does not depend on dataset. """
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import collections
+import json
+import logging
+import math
+import os
+import random
+import sys
+from io import open
+
+import numpy as np
+import torch
+from tqdm import tqdm, trange
+from types import SimpleNamespace
+
+from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
+from run_squad import _get_best_indices, _compute_softmax, get_valid_prelim_predictions, get_answer_text
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+# Configure logging at import time: INFO level, with each record prefixed by
+# a timestamp, the level name and the logger name.
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+# Logger for this script.
+logger = logging.getLogger(__name__)
+
+
+import math
+import json
+import numpy as np
+import collections
+
+
+def preprocess_tokenized_text(doc_tokens, query_tokens, tokenizer,
+                              max_seq_length, max_query_length):
+    """
+    Build the model inputs for one (question, context) pair.
+
+    The sequence layout is ``[CLS] query [SEP] document [SEP]``, zero-padded
+    to `max_seq_length`. The query is cut to `max_query_length` tokens and
+    the document to whatever room is left; there is no sliding window.
+
+    Returns a pair of ``SimpleNamespace`` objects:
+      * tensors for inference: ``input_ids``, ``input_mask``, ``segment_ids``
+        (plain lists of length `max_seq_length`);
+      * tokens for post-processing: ``tokens``, ``token_to_orig_map``
+        (sequence position -> index into `doc_tokens`) and
+        ``token_is_max_context`` (True for every document position).
+    """
+    if len(query_tokens) > max_query_length:
+        query_tokens = query_tokens[:max_query_length]
+
+    # Split every document word into sub-tokens, remembering for each
+    # sub-token the index of the word it came from.
+    piece_to_word = []
+    doc_pieces = []
+    for word_index, word in enumerate(doc_tokens):
+        for piece in tokenizer.tokenize(word):
+            piece_to_word.append(word_index)
+            doc_pieces.append(piece)
+
+    # Three positions are reserved for [CLS], [SEP] and [SEP]; document
+    # sub-tokens beyond the remaining budget are dropped.
+    doc_budget = max_seq_length - len(query_tokens) - 3
+    kept = min(len(doc_pieces), doc_budget)
+
+    # Question segment (segment id 0).
+    tokens = ["[CLS]"]
+    tokens.extend(query_tokens)
+    tokens.append("[SEP]")
+    segment_ids = [0] * len(tokens)
+
+    # Document segment (segment id 1).
+    token_to_orig_map = {}
+    token_is_max_context = {}
+    for piece_index in range(kept):
+        position = len(tokens)
+        token_to_orig_map[position] = piece_to_word[piece_index]
+        token_is_max_context[position] = True
+        tokens.append(doc_pieces[piece_index])
+        segment_ids.append(1)
+    tokens.append("[SEP]")
+    segment_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad all three lists by the same amount, up to the sequence length.
+    padding = [0] * (max_seq_length - len(input_ids))
+    input_ids += padding
+    input_mask += padding
+    segment_ids += padding
+
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+
+    tensors_for_inference = SimpleNamespace(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+    )
+    tokens_for_postprocessing = SimpleNamespace(
+        tokens=tokens,
+        token_to_orig_map=token_to_orig_map,
+        token_is_max_context=token_is_max_context,
+    )
+
+    return tensors_for_inference, tokens_for_postprocessing
+
+
+# Pair of start/end logit sequences for one example, in the attribute form
+# that get_answer passes to the run_squad helpers.
+RawResult = collections.namedtuple("RawResult", ["start_logits", "end_logits"])
+
+
+def get_answer(doc_tokens, tokens_for_postprocessing,
+               start_logits, end_logits, args):
+    """
+    Turn the start/end logits of one example into an answer and an n-best list.
+
+    Args:
+        doc_tokens: the context split into words.
+        tokens_for_postprocessing: second value returned by
+            ``preprocess_tokenized_text``.
+        start_logits, end_logits: indexable sequences of per-position scores.
+        args: namespace providing ``n_best_size``, ``version_2_with_negative``
+            and ``null_score_diff_threshold``; it is also forwarded to the
+            ``run_squad`` helpers.
+
+    Returns:
+        ``(answer, nbest_answers)``: the chosen answer text and a list of
+        ordered dicts with ``text``, ``probability``, ``start_logit`` and
+        ``end_logit``, sorted by descending ``start_logit + end_logit``.
+        The answer is ``""`` when the null answer wins or when there is no
+        non-null candidate to choose from.
+    """
+    result = RawResult(start_logits=start_logits, end_logits=end_logits)
+    Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit'])
+
+    if args.version_2_with_negative:
+        null_val = (float("inf"), 0, 0)
+
+    start_indices = _get_best_indices(result.start_logits, args.n_best_size)
+    end_indices = _get_best_indices(result.end_logits, args.n_best_size)
+    prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices,
+                                                      tokens_for_postprocessing, result, args)
+    prelim_predictions = sorted(
+        prelim_predictions,
+        key=lambda x: (x.start_logit + x.end_logit),
+        reverse=True
+    )
+    if args.version_2_with_negative:
+        # The null ("no answer") score is read from position 0 of both logit
+        # sequences.
+        score = result.start_logits[0] + result.end_logits[0]
+        if score < null_val[0]:
+            null_val = (score, result.start_logits[0], result.end_logits[0])
+
+    doc_tokens_obj = SimpleNamespace(doc_tokens=doc_tokens)
+
+    predictions = []
+    seen_predictions = set()  # set: constant-time duplicate check
+    for pred in prelim_predictions:
+        if len(predictions) == args.n_best_size:
+            break
+        if pred.end_index > 0:  # this is a non-null prediction
+            final_text = get_answer_text(doc_tokens_obj, tokens_for_postprocessing, pred, args)
+            if final_text in seen_predictions:
+                continue
+        else:
+            final_text = ""
+
+        seen_predictions.add(final_text)
+        predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit))
+
+    # add empty prediction
+    if args.version_2_with_negative:
+        predictions.append(Prediction('', null_val[1], null_val[2]))
+
+    nbest = sorted(predictions,
+                   key=lambda x: (x.start_logit + x.end_logit),
+                   reverse=True)[:args.n_best_size]
+
+    total_scores = []
+    best_non_null_entry = None
+    for entry in nbest:
+        total_scores.append(entry.start_logit + entry.end_logit)
+        if best_non_null_entry is None and entry.text:
+            best_non_null_entry = entry
+    # Nothing to normalise when there are no candidates at all.
+    probs = _compute_softmax(total_scores) if total_scores else []
+
+    nbest_answers = []
+    for (i, entry) in enumerate(nbest):
+        output = collections.OrderedDict()
+        output["text"] = entry.text
+        output["probability"] = probs[i]
+        output["start_logit"] = entry.start_logit
+        output["end_logit"] = entry.end_logit
+        nbest_answers.append(output)
+
+    if args.version_2_with_negative:
+        if best_non_null_entry is None:
+            # The n-best cut (e.g. n_best_size=1 with the null entry ranked
+            # first) can leave no non-null candidate; answer "no answer"
+            # instead of dereferencing None.
+            answer = ""
+        else:
+            score_diff = null_val[0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit
+            if score_diff > args.null_score_diff_threshold:
+                answer = ""
+            else:
+                answer = best_non_null_entry.text
+    else:
+        # No valid span at all: return an empty answer rather than raising
+        # IndexError.
+        answer = nbest_answers[0]['text'] if nbest_answers else ""
+
+    return answer, nbest_answers
+
+
+def main():
+    """
+    Answer one question about one context passage with a BERT QA checkpoint.
+
+    Parses the command line, seeds the random number generators, builds the
+    tokenizer and model from ``--vocab_file``, ``--config_file`` and
+    ``--init_checkpoint``, runs a single forward pass over the
+    (``--question``, ``--context``) pair and prints the best answer followed
+    by the n-best list as JSON.
+    """
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                             "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--init_checkpoint",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The checkpoint file from pretraining")
+
+    ## Other parameters
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. ")
+    parser.add_argument("--seed", default=1, type=int)
+    parser.add_argument("--question", default="Most antibiotics target bacteria and don't affect what class of organisms? ",
+                                              type=str, help="question")
+    parser.add_argument("--context", default="Within the genitourinary and gastrointestinal tracts, commensal flora serve as biological barriers by competing with pathogenic bacteria for food and space and, in some cases, by changing the conditions in their environment, such as pH or available iron. This reduces the probability that pathogens will reach sufficient numbers to cause illness. However, since most antibiotics non-specifically target bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of fungi and cause conditions such as a vaginal candidiasis (a yeast infection). There is good evidence that re-introduction of probiotic flora, such as pure cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore a healthy balance of microbial populations in intestinal infections in children and encouraging preliminary data in studies on bacterial gastroenteritis, inflammatory bowel diseases, urinary tract infection and post-surgical infections. ",
+                                              type=str, help="context")
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--n_best_size", default=1, type=int,
+                        help="The total number of n-best predictions to generate. ")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+    parser.add_argument('--version_2_with_negative',
+                        action='store_true',
+                        help='If true, then the model can reply with "unknown". ')
+    parser.add_argument('--null_score_diff_threshold',
+                        type=float, default=-11.0,
+                        help="If null_score - best_non_null is greater than the threshold predict 'unknown'. ")
+    parser.add_argument('--vocab_file',
+                        type=str, default=None, required=True,
+                        help="Vocabulary mapping/file BERT was pretrainined on")
+    parser.add_argument("--config_file",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The BERT model config")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="use mixed-precision")
+    # type=int matters: without it a value given on the command line arrives
+    # as a string, so the `== -1` test below is never true and the CUDA device
+    # calls receive a string instead of an ordinal.
+    parser.add_argument("--local_rank", default=-1, type=int, help="ordinal of the GPU to use")
+
+    args = parser.parse_args()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed(args.seed)
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+
+    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
+
+    # Prepare model
+    config = BertConfig.from_json_file(args.config_file)
+
+    # Padding for divisibility by 8
+    if config.vocab_size % 8 != 0:
+        config.vocab_size += 8 - (config.vocab_size % 8)
+
+    # initialize model
+    model = BertForQuestionAnswering(config)
+    # NOTE(review): torch.load unpickles the checkpoint file; only point
+    # --init_checkpoint at files from a trusted source.
+    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"])
+    model.to(device)
+    if args.fp16:
+        model.half()
+    model.eval()
+
+    print("question: ", args.question)
+    print("context: ", args.context)
+    print()
+
+    # preprocessing
+    doc_tokens = args.context.split()
+    query_tokens = tokenizer.tokenize(args.question)
+    feature = preprocess_tokenized_text(doc_tokens,
+                                        query_tokens,
+                                        tokenizer,
+                                        max_seq_length=args.max_seq_length,
+                                        max_query_length=args.max_query_length)
+
+    tensors_for_inference, tokens_for_postprocessing = feature
+
+    # unsqueeze(0) adds the batch dimension for this single example
+    input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long).unsqueeze(0)
+    segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long).unsqueeze(0)
+    input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long).unsqueeze(0)
+
+    # load tensors to device
+    input_ids = input_ids.to(device)
+    input_mask = input_mask.to(device)
+    segment_ids = segment_ids.to(device)
+
+    # run prediction
+    with torch.no_grad():
+        start_logits, end_logits = model(input_ids, segment_ids, input_mask)
+
+    # post-processing
+    start_logits = start_logits[0].detach().cpu().tolist()
+    end_logits = end_logits[0].detach().cpu().tolist()
+    answer, answers = get_answer(doc_tokens, tokens_for_postprocessing,
+                                 start_logits, end_logits, args)
+
+    # print result
+    print()
+    print(answer)
+    print()
+    print(json.dumps(answers, indent=4))
+
+
+# Run the inference CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
--- a/log/results-squad-fp16.json
+++ b/log/results-squad-fp16.json
+DLLL {"timestamp": "1689419433.768081", "datetime": "2023-07-15 19:10:33.768081", "elapsedtime": "0.000237", "type": "LOG", "step": "PARAMETER", "data": {"Config": ["Namespace(amp=True, bert_model='bert-large-uncased', cache_dir=None, config_file='/public/home/hepj//model_source/pytorch_bert/bert_config.json', disable_progress_bar=False, dist_url='tcp://224.66.41.62:23456', do_eval=True, do_lower_case=False, do_predict=True, do_train=True, doc_stride=128, eval_script='./evaluate-v1.1.py', fp16=True, gpus_per_node=1, gradient_accumulation_steps=1, init_checkpoint='/public/home/hepj/model_source/model_pytorch.ckpt.pt', json_summary='./log/results-squad-fp16.json', learning_rate=5e-05, local_rank=-1, log_freq=50, loss_scale=0, max_answer_length=30, max_query_length=64, max_seq_length=384, max_steps=-1.0, n_best_size=20, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=3.0, output_dir='/public/home/hepj/outdir/tourch/SQuAD', predict_batch_size=4, predict_file='/public/home/hepj/data/sq1.1/dev-v1.1.json', seed=42, skip_cache=False, skip_checkpoint=False, train_batch_size=4, train_file='/public/home/hepj/data/sq1.1/train-v1.1.json', use_env=False, verbose_logging=False, version_2_with_negative=False, vocab_file='/public/home/hepj//model_source/pytorch_bert/vocab.txt', warmup_proportion=0.1, world_size=1)"]}}
+DLLL {"timestamp": "1689419433.787672", "datetime": "2023-07-15 19:10:33.787672", "elapsedtime": "0.019828", "type": "LOG", "step": "PARAMETER", "data": {"SEED": 42}}
+DLLL {"timestamp": "1689419453.753193", "datetime": "2023-07-15 19:10:53.753193", "elapsedtime": "19.985349", "type": "LOG", "step": "PARAMETER", "data": {"loading_checkpoint": true}}
+DLLL {"timestamp": "1689419456.642115", "datetime": "2023-07-15 19:10:56.642115", "elapsedtime": "22.874271", "type": "LOG", "step": "PARAMETER", "data": {"loaded_checkpoint": true}}
+DLLL {"timestamp": "1689419457.266302", "datetime": "2023-07-15 19:10:57.266302", "elapsedtime": "23.498458", "type": "LOG", "step": "PARAMETER", "data": {"model_weights_num": 335150082}}
+DLLL {"timestamp": "1689419469.543777", "datetime": "2023-07-15 19:11:09.543777", "elapsedtime": "35.775933", "type": "LOG", "step": "PARAMETER", "data": {"train_start": true}}
+DLLL {"timestamp": "1689419469.543959", "datetime": "2023-07-15 19:11:09.543959", "elapsedtime": "35.776115", "type": "LOG", "step": "PARAMETER", "data": {"training_samples": 87599}}
+DLLL {"timestamp": "1689419469.54403", "datetime": "2023-07-15 19:11:09.544030", "elapsedtime": "35.776186", "type": "LOG", "step": "PARAMETER", "data": {"training_features": 88368}}
+DLLL {"timestamp": "1689419469.544095", "datetime": "2023-07-15 19:11:09.544095", "elapsedtime": "35.776251", "type": "LOG", "step": "PARAMETER", "data": {"train_batch_size": 4}}
+DLLL {"timestamp": "1689419469.544156", "datetime": "2023-07-15 19:11:09.544156", "elapsedtime": "35.776312", "type": "LOG", "step": "PARAMETER", "data": {"steps": 65697.0}}
+DLLL {"timestamp": "1689419476.360987", "datetime": "2023-07-15 19:11:16.360987", "elapsedtime": "42.593143", "type": "LOG", "step": [0, 1], "data": {"step_loss": 6.122858047485352, "learning_rate": 7.610697596541699e-09}}
+DLLL {"timestamp": "1689419492.221115", "datetime": "2023-07-15 19:11:32.221115", "elapsedtime": "58.453271", "type": "LOG", "step": [0, 51], "data": {"step_loss": 5.114989757537842, "learning_rate": 3.8814557742362663e-07}}
+DLLL {"timestamp": "1689419507.932752", "datetime": "2023-07-15 19:11:47.932752", "elapsedtime": "74.164908", "type": "LOG", "step": [0, 101], "data": {"step_loss": 5.053555488586426, "learning_rate": 7.686804572507116e-07}}
--- a/log/results.json
+++ b/log/results.json
+DLLL {"timestamp": "1689585949.809111", "datetime": "2023-07-17 17:25:49.809111", "elapsedtime": "0.000235", "type": "LOG", "step": "PARAMETER", "data": {"Config": ["Namespace(amp=False, bert_model='bert-large-uncased', cache_dir=None, config_file='/public/home/hepj/model_source/pytorch_bert/bert_config.json', disable_progress_bar=False, dist_url='tcp://224.66.41.62:23456', do_eval=False, do_lower_case=False, do_predict=True, do_train=True, doc_stride=128, eval_script='./evaluate-v1.1.py', fp16=False, gpus_per_node=1, gradient_accumulation_steps=1, init_checkpoint='/public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt', json_summary='./log/results.json', learning_rate=5e-05, local_rank=-1, log_freq=50, loss_scale=0, max_answer_length=30, max_query_length=64, max_seq_length=384, max_steps=-1.0, n_best_size=20, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=3.0, output_dir='/public/home/hepj/outdir/torch/SQuAD', predict_batch_size=4, predict_file='/public/home/hepj/data/sq1.1/dev-v1.1.json', seed=42, skip_cache=False, skip_checkpoint=False, train_batch_size=4, train_file='/public/home/hepj/data/sq1.1/train-v1.1.json', use_env=False, verbose_logging=False, version_2_with_negative=False, vocab_file='/public/home/hepj/model_source/pytorch_bert/vocab.txt', warmup_proportion=0.1, world_size=1)"]}}
+DLLL {"timestamp": "1689585950.006137", "datetime": "2023-07-17 17:25:50.006137", "elapsedtime": "0.197261", "type": "LOG", "step": "PARAMETER", "data": {"SEED": 42}}
+DLLL {"timestamp": "1689585970.324955", "datetime": "2023-07-17 17:26:10.324955", "elapsedtime": "20.516079", "type": "LOG", "step": "PARAMETER", "data": {"loading_checkpoint": true}}
+DLLL {"timestamp": "1689585974.448674", "datetime": "2023-07-17 17:26:14.448674", "elapsedtime": "24.639798", "type": "LOG", "step": "PARAMETER", "data": {"loaded_checkpoint": true}}
+DLLL {"timestamp": "1689585976.67685", "datetime": "2023-07-17 17:26:16.676850", "elapsedtime": "26.867974", "type": "LOG", "step": "PARAMETER", "data": {"model_weights_num": 335150082}}
+DLLL {"timestamp": "1689585989.449134", "datetime": "2023-07-17 17:26:29.449134", "elapsedtime": "39.640258", "type": "LOG", "step": "PARAMETER", "data": {"train_start": true}}
+DLLL {"timestamp": "1689585989.467614", "datetime": "2023-07-17 17:26:29.467614", "elapsedtime": "39.658738", "type": "LOG", "step": "PARAMETER", "data": {"training_samples": 87599}}
+DLLL {"timestamp": "1689585989.467693", "datetime": "2023-07-17 17:26:29.467693", "elapsedtime": "39.658817", "type": "LOG", "step": "PARAMETER", "data": {"training_features": 88368}}
+DLLL {"timestamp": "1689585989.467758", "datetime": "2023-07-17 17:26:29.467758", "elapsedtime": "39.658882", "type": "LOG", "step": "PARAMETER", "data": {"train_batch_size": 4}}
+DLLL {"timestamp": "1689585989.46782", "datetime": "2023-07-17 17:26:29.467820", "elapsedtime": "39.658944", "type": "LOG", "step": "PARAMETER", "data": {"steps": 65697.0}}
+DLLL {"timestamp": "1689586004.55256", "datetime": "2023-07-17 17:26:44.552560", "elapsedtime": "54.743684", "type": "LOG", "step": [0, 1], "data": {"step_loss": 6.121078014373779, "learning_rate": 5e-05}}
--- a/modeling.py
+++ b/modeling.py
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import copy
+import json
+import logging
+import math
+import os
+import shutil
+import tarfile
+import tempfile
+import sys
+from io import open
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from torch.utils import checkpoint
+
+sys.path.append('/workspace/bert/')
+from file_utils import cached_path
+
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+import torch.nn.functional as F
+import torch.nn.init as init
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Shortcut model name -> URL of the corresponding .tar.gz model archive.
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
+}
+# File names used for model artifacts (presumably the entries expected inside
+# a model archive/directory; confirm against the loading code).
+CONFIG_NAME = 'bert_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+TF_WEIGHTS_NAME = 'model.ckpt'
+
+def load_tf_weights_in_bert(model, tf_checkpoint_path):
+    """ Load the variables of a TensorFlow checkpoint into a PyTorch model.
+
+    Every variable in the checkpoint at `tf_checkpoint_path` is read, its
+    '/'-separated name is followed attribute by attribute starting from
+    `model`, and the array is assigned to the ``.data`` of the parameter
+    found there. Variables whose name contains ``adam_v`` or ``adam_m`` are
+    skipped. Progress is reported with ``print``.
+
+    Returns `model` (modified in place).
+
+    Raises ``ImportError`` if TensorFlow (or numpy) cannot be imported, and
+    ``AssertionError`` (with both shapes appended to its args) when a
+    checkpoint array does not match the shape of its target parameter.
+    """
+    try:
+        # Imported lazily so the module can be used without TensorFlow installed.
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        # From here on `name` is the list of path segments of the variable.
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m"] for n in name):
+            print("Skipping {}".format("/".join(name)))
+            continue
+        # Walk from the model root down to the target parameter.
+        pointer = model
+        for m_name in name:
+            # A segment of the form "<letters>_<digits>" is split into the
+            # attribute name and a numeric index into that attribute.
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            # Translate TF variable names to the PyTorch attribute names.
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        # `m_name` is now the last path segment of the variable.
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            # Kernels are transposed before being compared and copied.
+            array = np.ascontiguousarray(np.transpose(array))
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            # Attach both shapes to the error to make the mismatch visible.
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+def gelu(x):
+    """Return the erf-based GELU of `x`, using 1.41421 as the divisor inside erf."""
+    half_x = x * 0.5
+    return half_x * (1.0 + torch.erf(x / 1.41421))
+
+# Used only for Triton inference.
+def bias_gelu(bias, y):
+    """Add `bias` to `y`, then apply the erf-based GELU to the sum."""
+    shifted = bias + y
+    half = shifted * 0.5
+    return half * (1.0 + torch.erf(shifted / 1.41421))
+
+# Used only for training: torch.nn.functional.gelu breaks ONNX export.
+def bias_gelu_training(bias, y):
+    """Add `bias` to `y`, then apply `torch.nn.functional.gelu` to the sum."""
+    return torch.nn.functional.gelu(bias + y)  # Breaks ONNX export
+
+def bias_tanh(bias, y):
+    """Add `bias` to `y`, then apply tanh to the sum."""
+    return torch.tanh(bias + y)
+
+def swish(x):
+    """Return `x` multiplied by its own sigmoid."""
+    gate = torch.sigmoid(x)
+    return x * gate
+
+# Activation name -> callable. torch.nn.functional.gelu is deliberately not
+# registered here because it breaks ONNX export.
+ACT2FN = {
+    "gelu": gelu,
+    "bias_gelu": bias_gelu,
+    "bias_tanh": bias_tanh,
+    "relu": torch.nn.functional.relu,
+    "swish": swish,
+}
+
+class LinearActivation(Module):
+    r"""Fused Linear and activation Module.
+
+    Computes `act(input @ weight.T + bias)`. When a bias is present, the bias
+    is handed to a two-argument "biased" activation (`fn(bias, y)`) instead of
+    being added inside `F.linear`.
+
+    Args:
+        in_features: size of the last dimension of the input.
+        out_features: size of the last dimension of the output.
+        act: a key of `ACT2FN` (a leading ``bias_`` is added automatically when
+            `bias` is true and the key does not already contain ``bias``), or a
+            one-argument callable.
+        bias: whether the layer owns a learnable bias.
+
+    Raises:
+        KeyError: if `act` is a string with no matching `ACT2FN` entry.
+    """
+    __constants__ = ['bias']
+
+    def __init__(self, in_features, out_features, act='gelu', bias=True):
+        super(LinearActivation, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        # Both activation attributes always exist, for TorchScript compatibility.
+        self.act_fn = nn.Identity()
+        self.biased_act_fn = None
+        # NOTE: `bias` must not be pre-assigned as a plain attribute here;
+        # register_parameter('bias', None) below rejects a name that already
+        # exists outside `_parameters`, which made bias=False unusable.
+        if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)):
+            if bias:
+                # A biased layer always needs the two-argument variant, even
+                # when the caller already passed its "bias_*" name.
+                if not 'bias' in act:
+                    act = 'bias_' + act
+                self.biased_act_fn = ACT2FN[act]
+            else:
+                self.act_fn = ACT2FN[act]
+        else:
+            self.act_fn = act
+            if bias:
+                # forward() statically dispatches on `self.bias`, so a callable
+                # activation needs a biased counterpart as well.
+                self.biased_act_fn = self._act_after_bias
+        self.weight = Parameter(torch.Tensor(out_features, in_features))
+        if bias:
+            self.bias = Parameter(torch.Tensor(out_features))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def _act_after_bias(self, bias, y):
+        """Biased form of a user-supplied one-argument activation."""
+        return self.act_fn(bias + y)
+
+    def reset_parameters(self):
+        """Re-initialize weight (Kaiming uniform) and bias (uniform in +-1/sqrt(fan_in))."""
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def forward(self, input):
+        """Apply the linear projection followed by the activation."""
+        if not self.bias is None:
+            return self.biased_act_fn(self.bias, F.linear(input, self.weight, None))
+        else:
+            return self.act_fn(F.linear(input, self.weight, self.bias))
+
+    def extra_repr(self):
+        """Return the summary shown inside the module's repr."""
+        return 'in_features={}, out_features={}, bias={}'.format(
+            self.in_features, self.out_features, self.bias is not None
+        )
+
+
+class BertConfig(object):
+    """Configuration class to store the configuration of a `BertModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 output_all_encoded_layers=False):
+        """Constructs BertConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`,
+                or the path of a JSON config file. When a path is given, every key of
+                the JSON object becomes an attribute and the remaining arguments are
+                ignored.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `BertModel`.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            output_all_encoded_layers: Stored as-is on the config; read by the
+                encoder to decide whether to return every layer's output.
+
+        Raises:
+            ValueError: if the first argument is neither an int nor a str.
+        """
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_act = hidden_act
+            self.intermediate_size = intermediate_size
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+            self.output_all_encoded_layers = output_all_encoded_layers
+        else:
+            # The trailing space keeps the two literals from running together.
+            raise ValueError("First argument must be either a vocabulary size (int) "
+                             "or the path to a pretrained model config file (str)")
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `BertConfig` from a Python dictionary of parameters.
+
+        The instance starts from the constructor defaults (with a placeholder
+        vocabulary size of -1) and every key of `json_object` overrides or
+        adds an attribute. Built through `cls`, so subclasses get an instance
+        of their own type.
+        """
+        config = cls(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `BertConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+class BertNonFusedLayerNorm(nn.Module):
+    """Pure-PyTorch layer normalization over the last dimension."""
+    def __init__(self, hidden_size, eps=1e-12):
+        """Construct a layernorm module in the TF style (epsilon inside the square root).
+        """
+        super(BertNonFusedLayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        """Normalize `x` along its last dimension, then scale and shift it."""
+        centered = x - x.mean(-1, keepdim=True)
+        variance = (centered * centered).mean(-1, keepdim=True)
+        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * normalized + self.bias
+
+# Optional dependency: try to import NVIDIA apex and its FusedLayerNormAffineFunction.
+# APEX_IS_AVAILABLE records whether the import succeeded; BertLayerNorm copies it
+# into `apex_enabled` to choose its code path.
+try:
+    import apex
+    #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
+    import apex.normalization
+    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
+    #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
+    #BertLayerNorm = apex.normalization.FusedLayerNorm
+    APEX_IS_AVAILABLE = True
+except ImportError:
+    # apex is missing: keep going without it and tell the user it would be faster.
+    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
+    #BertLayerNorm = BertNonFusedLayerNorm
+    APEX_IS_AVAILABLE = False
+class BertLayerNorm(Module):
+    """Layer normalization over the last dimension.
+
+    Uses apex's FusedLayerNormAffineFunction while `apex_enabled` is true and
+    the module is not being scripted; otherwise computes the normalization
+    with plain tensor operations.
+    """
+    def __init__(self, hidden_size, eps=1e-12):
+        super(BertLayerNorm, self).__init__()
+        self.shape = torch.Size((hidden_size,))
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        # Defaults to whether the apex import succeeded.
+        self.apex_enabled = APEX_IS_AVAILABLE
+
+    @torch.jit.unused
+    def fused_layer_norm(self, x):
+        """Run the apex fused layer norm on `x`."""
+        return FusedLayerNormAffineFunction.apply(
+            x, self.weight, self.bias, self.shape, self.eps)
+
+    def forward(self, x):
+        """Normalize `x` along its last dimension, then scale and shift it."""
+        if self.apex_enabled and not torch.jit.is_scripting():
+            normalized = self.fused_layer_norm(x)
+        else:
+            centered = x - x.mean(-1, keepdim=True)
+            variance = (centered * centered).mean(-1, keepdim=True)
+            normalized = centered / torch.sqrt(variance + self.eps)
+            normalized = self.weight * normalized + self.bias
+        return normalized
+
+class BertEmbeddings(nn.Module):
+    """Sum word, position and token-type embeddings, then normalize and apply dropout.
+    """
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        hidden = config.hidden_size
+        self.word_embeddings = nn.Embedding(config.vocab_size, hidden)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = BertLayerNorm(hidden, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, input_ids, token_type_ids):
+        """Embed `input_ids` with positions 0..seq_len-1 and the given token types."""
+        # One position index per column of input_ids, repeated for every row.
+        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
+        positions = positions.unsqueeze(0).expand_as(input_ids)
+
+        summed = (self.word_embeddings(input_ids)
+                  + self.position_embeddings(positions)
+                  + self.token_type_embeddings(token_type_ids))
+        return self.dropout(self.LayerNorm(summed))
+
+
+class BertSelfAttention(nn.Module):
+    """Multi-head scaled dot-product self-attention.
+
+    Raises:
+        ValueError: if `config.hidden_size` is not a multiple of
+            `config.num_attention_heads`.
+    """
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        hidden, heads = config.hidden_size, config.num_attention_heads
+        if hidden % heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (hidden, heads))
+        self.num_attention_heads = heads
+        self.attention_head_size = int(hidden / heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden, self.all_head_size)
+        self.key = nn.Linear(hidden, self.all_head_size)
+        self.value = nn.Linear(hidden, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def _split_heads(self, x):
+        """Reshape the last dimension of `x` into (num_heads, head_size)."""
+        head_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        return torch.reshape(x, head_shape)
+
+    def transpose_for_scores(self, x):
+        """Split heads and move the head axis to position 1."""
+        return self._split_heads(x).permute(0, 2, 1, 3)
+
+    def transpose_key_for_scores(self, x):
+        """Split heads and return them already transposed for `query @ key`."""
+        return self._split_heads(x).permute(0, 2, 3, 1)
+
+    def forward(self, hidden_states, attention_mask):
+        """Attend over `hidden_states`; `attention_mask` is added to the raw scores."""
+        queries = self.transpose_for_scores(self.query(hidden_states))
+        keys = self.transpose_key_for_scores(self.key(hidden_states))
+        values = self.transpose_for_scores(self.value(hidden_states))
+
+        # Raw scores: dot product of queries and keys, scaled by sqrt(head size).
+        scores = torch.matmul(queries, keys) / math.sqrt(self.attention_head_size)
+        # The mask is additive (precomputed for all layers in BertModel forward() function).
+        scores = scores + attention_mask
+
+        # Normalize the scores to probabilities.
+        probs = F.softmax(scores, dim=-1)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        probs = self.dropout(probs)
+
+        # Weighted sum of the values, then merge the heads back together.
+        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
+        merged_shape = context.size()[:-2] + (self.all_head_size,)
+        return torch.reshape(context, merged_shape)
+
+
+class BertSelfOutput(nn.Module):
+    """Project the attention output, apply dropout, add the residual and normalize."""
+    def __init__(self, config):
+        super(BertSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertAttention(nn.Module):
+    """Self-attention followed by its output projection with residual connection."""
+    def __init__(self, config):
+        super(BertAttention, self).__init__()
+        self.self = BertSelfAttention(config)
+        self.output = BertSelfOutput(config)
+
+    def forward(self, input_tensor, attention_mask):
+        """Attend over `input_tensor` and combine the result with it."""
+        attended = self.self(input_tensor, attention_mask)
+        return self.output(attended, input_tensor)
+
+
+class BertIntermediate(nn.Module):
+    """Feed-forward expansion from hidden_size to intermediate_size with activation."""
+    def __init__(self, config):
+        super(BertIntermediate, self).__init__()
+        self.dense_act = LinearActivation(
+            config.hidden_size, config.intermediate_size, act=config.hidden_act)
+
+    def forward(self, hidden_states):
+        """Apply the fused linear + activation layer."""
+        return self.dense_act(hidden_states)
+
+
+class BertOutput(nn.Module):
+    """Project intermediate_size back to hidden_size, add the residual and normalize."""
+    def __init__(self, config):
+        super(BertOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor)."""
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+class BertLayer(nn.Module):
+    """One Transformer encoder layer: attention, intermediate and output sub-blocks."""
+    def __init__(self, config):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(self, hidden_states, attention_mask):
+        """Run the three sub-blocks; the attention output is the residual of the last one."""
+        attended = self.attention(hidden_states, attention_mask)
+        expanded = self.intermediate(attended)
+        return self.output(expanded, attended)
+
+class BertEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` BertLayer modules.
+
+    `forward` returns a list of hidden states: one per layer when
+    `output_all_encoded_layers` is true and activation checkpointing is off,
+    otherwise a single-element list holding the last layer's output.
+    """
+    def __init__(self, config):
+        super(BertEncoder, self).__init__()
+        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.output_all_encoded_layers = config.output_all_encoded_layers
+        self._checkpoint_activations = False
+
+    @torch.jit.unused
+    def checkpointed_forward(self, hidden_states, attention_mask):
+        """Run all layers through `checkpoint.checkpoint` in chunks of ceil(sqrt(num_layers))."""
+        def run_span(start, end):
+            # Build the callable that runs layers [start, end) for one checkpoint segment.
+            def custom_forward(*inputs):
+                out = inputs[0]
+                for layer in self.layer[start:end]:
+                    out = layer(out, inputs[1])
+                return out
+            return custom_forward
+
+        total = len(self.layer)
+        span = math.ceil(math.sqrt(total))
+        first = 0
+        while first < total:
+            hidden_states = checkpoint.checkpoint(
+                run_span(first, first + span), hidden_states, attention_mask*1)
+            first += span
+
+        return hidden_states
+
+    def forward(self, hidden_states, attention_mask):
+        """Encode `hidden_states` and return the collected layer outputs as a list."""
+        collected = []
+
+        if self._checkpoint_activations:
+            hidden_states = self.checkpointed_forward(hidden_states, attention_mask)
+        else:
+            for layer_module in self.layer:
+                hidden_states = layer_module(hidden_states, attention_mask)
+
+                if self.output_all_encoded_layers:
+                    collected.append(hidden_states)
+
+        # Only the final state is kept when per-layer output is off or checkpointing is on.
+        if self._checkpoint_activations or not self.output_all_encoded_layers:
+            collected.append(hidden_states)
+        return collected
+
+class BertPooler(nn.Module):
+    """Pool a sequence by passing its first token through a tanh-activated linear layer."""
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh")
+
+    def forward(self, hidden_states):
+        """Return the transformed hidden state found at index 0 of dimension 1."""
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        return self.dense_act(hidden_states[:, 0])
+
+
+class BertPredictionHeadTransform(nn.Module):
+    """Linear + activation followed by layer normalization, used ahead of the LM decoder."""
+    def __init__(self, config):
+        super(BertPredictionHeadTransform, self).__init__()
+        self.dense_act = LinearActivation(
+            config.hidden_size, config.hidden_size, act=config.hidden_act)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+
+    def forward(self, hidden_states):
+        """Return LayerNorm(dense_act(hidden_states))."""
+        return self.LayerNorm(self.dense_act(hidden_states))
+
+
+class BertLMPredictionHead(nn.Module):
+    """Language-model head whose decoder weight is the given embedding matrix."""
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertLMPredictionHead, self).__init__()
+        self.transform = BertPredictionHeadTransform(config)
+
+        vocab_size = bert_model_embedding_weights.size(0)
+        embed_dim = bert_model_embedding_weights.size(1)
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(embed_dim, vocab_size, bias=False)
+        self.decoder.weight = bert_model_embedding_weights
+        self.bias = nn.Parameter(torch.zeros(vocab_size))
+
+    def forward(self, hidden_states):
+        """Return decoder(transform(hidden_states)) plus the per-token output bias."""
+        transformed = self.transform(hidden_states)
+        return self.decoder(transformed) + self.bias
+
+
+class BertOnlyMLMHead(nn.Module):
+    """Head that holds only the masked-LM prediction module."""
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertOnlyMLMHead, self).__init__()
+        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+
+    def forward(self, sequence_output):
+        """Return the prediction scores for `sequence_output`."""
+        return self.predictions(sequence_output)
+
+
+class BertOnlyNSPHead(nn.Module):
+    """Head that maps the pooled output to two sequence-relationship scores."""
+    def __init__(self, config):
+        super(BertOnlyNSPHead, self).__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        """Return the two-way relationship scores for `pooled_output`."""
+        return self.seq_relationship(pooled_output)
+
+
+class BertPreTrainingHeads(nn.Module):
+    """Masked-LM prediction head plus the two-way sequence-relationship classifier."""
+    def __init__(self, config, bert_model_embedding_weights):
+        super(BertPreTrainingHeads, self).__init__()
+        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        """Return `(prediction_scores, seq_relationship_score)`."""
+        return self.predictions(sequence_output), self.seq_relationship(pooled_output)
+
+
+class BertPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        """Store `config`; raises ValueError when it is not a `BertConfig`."""
+        super(BertPreTrainedModel, self).__init__()
+        if not isinstance(config, BertConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """ Initialize the weights.
+
+        Linear and Embedding weights are drawn from a normal distribution with
+        std `config.initializer_range`; BertLayerNorm is reset to weight 1 and
+        bias 0; Linear biases are zeroed. Meant to be passed to `self.apply`.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def checkpoint_activations(self, val):
+        """Set `_checkpoint_activations` to `val` on every submodule that defines it."""
+        def _apply_flag(module):
+            if hasattr(module, "_checkpoint_activations"):
+                module._checkpoint_activations=val
+        self.apply(_apply_flag)
+
+    def enable_apex(self, val):
+        """Set `apex_enabled` to `val` on every submodule that defines it."""
+        def _apply_flag(module):
+            if hasattr(module, "apex_enabled"):
+                module.apex_enabled=val
+        self.apply(_apply_flag)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
+                        from_tf=False, *inputs, **kwargs):
+        """
+        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `bert-base-uncased`
+                    . `bert-large-uncased`
+                    . `bert-base-cased`
+                    . `bert-large-cased`
+                    . `bert-base-multilingual-uncased`
+                    . `bert-base-multilingual-cased`
+                    . `bert-base-chinese`
+                - a path or url to a pretrained model archive containing:
+                    . `bert_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+                - a path or url to a pretrained model archive containing:
+                    . `bert_config.json` a configuration file for the model
+                    . `model.ckpt` a TensorFlow checkpoint
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+
+        Returns:
+            The loaded model, or None when the archive could not be resolved
+            (the failure is logged).
+
+        Raises:
+            RuntimeError: if loading the state dict reported errors.
+        """
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            archive_file = pretrained_model_name_or_path
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name_or_path,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading archive file {}".format(archive_file))
+        else:
+            logger.info("loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+        tempdir = None
+        try:
+            if os.path.isdir(resolved_archive_file) or from_tf:
+                serialization_dir = resolved_archive_file
+            else:
+                # Extract archive to temp dir
+                tempdir = tempfile.mkdtemp()
+                logger.info("extracting archive file {} to temp dir {}".format(
+                    resolved_archive_file, tempdir))
+                # NOTE(review): extractall trusts member paths; only use archives from trusted sources.
+                with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                    archive.extractall(tempdir)
+                serialization_dir = tempdir
+            # Load config
+            config_file = os.path.join(serialization_dir, CONFIG_NAME)
+            config = BertConfig.from_json_file(config_file)
+            logger.info("Model config {}".format(config))
+            # Instantiate model.
+            model = cls(config, *inputs, **kwargs)
+            if state_dict is None and not from_tf:
+                weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+                # NOTE(review): torch.load unpickles the file; only load weights from trusted sources.
+                state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
+        finally:
+            # Clean up temp dir even when config parsing, model construction
+            # or weight loading raised, so failed loads do not leak it.
+            if tempdir:
+                shutil.rmtree(tempdir)
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
+            return load_tf_weights_in_bert(model, weights_path)
+        # Load from a PyTorch state_dict: rename TF-style gamma/beta keys first.
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            # Load this module's own entries, then recurse into its children.
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        # A model without a `bert` attribute reads "bert."-prefixed keys directly.
+        start_prefix = ''
+        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
+            start_prefix = 'bert.'
+        load(model, prefix=start_prefix)
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        return model
+
+
+class BertModel(BertPreTrainedModel):
+    """BERT model ("Bidirectional Embedding Representations from a Transformer").
+
+    Params:
+        config: a BertConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.BertModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertModel, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config)
+        self.apply(self.init_bert_weights)
+        self.output_all_encoded_layers = config.output_all_encoded_layers
+
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.embeddings.word_embeddings.weight.dtype) # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        embedding_output = self.embeddings(input_ids, token_type_ids)
+        encoded_layers = self.encoder(embedding_output, extended_attention_mask)
+        sequence_output = encoded_layers[-1]
+        pooled_output = self.pooler(sequence_output)
+        if not self.output_all_encoded_layers:
+            encoded_layers = encoded_layers[-1:]
+        return encoded_layers, pooled_output
+
+
+class BertForPreTraining(BertPreTrainedModel):
+    """BERT model with pre-training heads.
+    This module comprises the BERT model followed by the two pre-training heads:
+        - the masked language modeling head, and
+        - the next sentence classification head.
+
+    Params:
+        config: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size]
+        `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, 1].
+            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+    Outputs:
+        if `masked_lm_labels` and `next_sentence_label` are not `None`:
+            Outputs the total_loss which is the sum of the masked language modeling loss and the next
+            sentence classification loss.
+        if `masked_lm_labels` or `next_sentence_label` is `None`:
+            Outputs a tuple comprising
+            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
+            - the next sentence classification logits of shape [batch_size, 2].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForPreTraining(config)
+    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForPreTraining, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        encoded_layers, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
+        sequence_output = encoded_layers[-1]
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        return prediction_scores, seq_relationship_score
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+    """BERT model with the masked language modeling head.
+    This module comprises the BERT model followed by the masked language modeling head.
+
+    Params:
+        config: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size]
+
+    Outputs:
+        if `masked_lm_labels` is  not `None`:
+            Outputs the masked language modeling loss.
+        if `masked_lm_labels` is `None`:
+            Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForMaskedLM(config)
+    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForMaskedLM, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
+        encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask)
+        sequence_output = encoded_layers[-1]
+        prediction_scores = self.cls(sequence_output)
+
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            return masked_lm_loss
+        else:
+            return prediction_scores
+
+
+class BertForNextSentencePrediction(BertPreTrainedModel):
+    """BERT model with next sentence prediction head.
+    This module comprises the BERT model followed by the next sentence classification head.
+
+    Params:
+        config: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, 1].
+            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+    Outputs:
+        if `next_sentence_label` is not `None`:
+            Outputs the total_loss which is the sum of the masked language modeling loss and the next
+            sentence classification loss.
+        if `next_sentence_label` is `None`:
+            Outputs the next sentence classification logits of shape [batch_size, 2].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForNextSentencePrediction(config)
+    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForNextSentencePrediction, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertOnlyNSPHead(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
+        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
+        seq_relationship_score = self.cls( pooled_output)
+
+        if next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            return next_sentence_loss
+        else:
+            return seq_relationship_score
+
+
+class BertForSequenceClassification(BertPreTrainedModel):
+    """BERT model for classification.
+    This module is composed of the BERT model with a linear layer on top of
+    the pooled output.
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+        `num_labels`: the number of classes for the classifier. Default = 2.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, ..., num_labels].
+
+    Outputs:
+        if `labels` is not `None`:
+            Outputs the CrossEntropy classification loss of the output with the labels.
+        if `labels` is `None`:
+            Outputs the classification logits of shape [batch_size, num_labels].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    num_labels = 2
+
+    model = BertForSequenceClassification(config, num_labels)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_labels):
+        super(BertForSequenceClassification, self).__init__(config)
+        self.num_labels = num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, num_labels)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
+        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
+        pooled_output = self.dropout(pooled_output)
+        return self.classifier(pooled_output)
+
+
+class BertForMultipleChoice(BertPreTrainedModel):
+    """BERT model for multiple choice tasks.
+    This module is composed of the BERT model with a linear layer on top of
+    the pooled output.
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+        `num_choices`: the number of classes for the classifier. Default = 2.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
+            and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, ..., num_choices].
+
+    Outputs:
+        if `labels` is not `None`:
+            Outputs the CrossEntropy classification loss of the output with the labels.
+        if `labels` is `None`:
+            Outputs the classification logits of shape [batch_size, num_labels].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
+    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    num_choices = 2
+
+    model = BertForMultipleChoice(config, num_choices)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_choices):
+        super(BertForMultipleChoice, self).__init__(config)
+        self.num_choices = num_choices
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
+        _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, self.num_choices)
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            return loss
+        else:
+            return reshaped_logits
+
+
+class BertForTokenClassification(BertPreTrainedModel):
+    """BERT model for token-level classification.
+    This module is composed of the BERT model with a linear layer on top of
+    the full hidden state of the last layer.
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+        `num_labels`: the number of classes for the classifier. Default = 2.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [0, ..., num_labels].
+
+    Outputs:
+        if `labels` is not `None`:
+            Outputs the CrossEntropy classification loss of the output with the labels.
+        if `labels` is `None`:
+            Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    num_labels = 2
+
+    model = BertForTokenClassification(config, num_labels)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_labels):
+        super(BertForTokenClassification, self).__init__(config)
+        self.num_labels = num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, num_labels)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
+        encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask)
+        sequence_output = encoded_layers[-1]
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            return loss
+        else:
+            return logits
+
+
+class BertForQuestionAnswering(BertPreTrainedModel):
+    """BERT model for Question Answering (span extraction).
+    This module is composed of the BERT model with a linear layer on top of
+    the sequence output that computes start_logits and end_logits
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+
+    Outputs:
+         Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
+         position tokens of shape [batch_size, sequence_length].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForQuestionAnswering(config)
+    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForQuestionAnswering, self).__init__(config)
+        self.bert = BertModel(config)
+        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
+        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask)
+        sequence_output = encoded_layers[-1]
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+        return start_logits, end_logits
--- a/optimization.py
+++ b/optimization.py
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch optimization for BERT model."""
+
+import math
+import torch
+from torch.optim import Optimizer
+from torch.optim.optimizer import required
+from torch.nn.utils import clip_grad_norm_
+#from fused_adam_local import FusedAdam
+from apex.optimizers import FusedAdam
+from apex.multi_tensor_apply import multi_tensor_applier
+import amp_C
+from utils import is_main_process
+
+# Module-level aliases for callables exposed by the `amp_C` extension module
+# imported above.
+# NOTE(review): none of these aliases is referenced by the schedule functions
+# or by BertAdam in this file section — presumably they serve a fused LAMB
+# code path elsewhere; confirm before removing.
+multi_tensor_l2norm = amp_C.multi_tensor_l2norm
+lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
+lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda
+scale = amp_C.multi_tensor_scale
+
+
+def warmup_cosine(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 0.5 * (1.0 + torch.cos(math.pi * x))
+
+def warmup_constant(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 1.0
+
+def warmup_linear(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return max((x - 1. )/ (warmup - 1.), 0.)
+    
+def warmup_poly(x, warmup=0.002, degree=0.5):
+    if x < warmup:
+        return x/warmup
+    return (1.0 - x)**degree
+
+
+SCHEDULES = {
+    'warmup_cosine':warmup_cosine,
+    'warmup_constant':warmup_constant,
+    'warmup_linear':warmup_linear,
+    'warmup_poly':warmup_poly,
+}
+
+class BertAdam(Optimizer):
+    """Implements BERT version of Adam algorithm with weight decay fix.
+    Params:
+        lr: learning rate
+        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
+        t_total: total number of training steps for the learning
+            rate schedule, -1  means constant learning rate. Default: -1
+        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
+        b1: Adams b1. Default: 0.9
+        b2: Adams b2. Default: 0.999
+        e: Adams epsilon. Default: 1e-6
+        weight_decay: Weight decay. Default: 0.01
+        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
+    """
+    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
+                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
+                 max_grad_norm=1.0):
+        if lr is not required and lr < 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+        if schedule not in SCHEDULES:
+            raise ValueError("Invalid schedule parameter: {}".format(schedule))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
+        if not 0.0 <= b1 < 1.0:
+            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
+        if not 0.0 <= b2 < 1.0:
+            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+        if not e >= 0.0:
+            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
+        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm)
+        super(BertAdam, self).__init__(params, defaults)
+
+    def get_lr(self):
+        lr = []
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+                if len(state) == 0:
+                    return [0]
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+                lr.append(lr_scheduled)
+        return lr
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['next_m'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['next_v'] = torch.zeros_like(p.data)
+
+                next_m, next_v = state['next_m'], state['next_v']
+                beta1, beta2 = group['b1'], group['b2']
+
+                # Add grad clipping
+                if group['max_grad_norm'] > 0:
+                    clip_grad_norm_(p, group['max_grad_norm'])
+
+                # Decay the first and second moment running average coefficient
+                # In-place operations to update the averages at the same time
+                next_m.mul_(beta1).add_(1 - beta1, grad)
+                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                update = next_m / (next_v.sqrt() + group['e'])
+
+                # Just adding the square of the weights to the loss function is *not*
+                # the correct way of using L2 regularization/weight decay with Adam,
+                # since that will interact with the m and v parameters in strange ways.
+                #
+                # Instead we want to decay the weights in a manner that doesn't interact
+                # with the m/v parameters. This is equivalent to adding the square
+                # of the weights to the loss with plain (non-momentum) SGD.
+                if group['weight_decay'] > 0.0:
+                    update += group['weight_decay'] * p.data
+
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+
+                update_with_lr = lr_scheduled * update
+                p.data.add_(-update_with_lr)
+
+                state['step'] += 1
+
+        return loss
--- a/output/dllogger.json
+++ b/output/dllogger.json
+DLLL {"timestamp": "1689557913.603774", "datetime": "2023-07-17 09:38:33.603774", "elapsedtime": "0.006562", "type": "LOG", "step": "PARAMETER", "data": {"Config": ["Namespace(allreduce_post_accumulation=False, allreduce_post_accumulation_fp16=False, amp=True, bert_model='bert-large-uncased', checkpoint_activations=False, config_file='./bert_config.json', disable_progress_bar=False, dist_url='tcp://224.66.41.62:23456', do_train=True, fp16=True, gpus_per_node=1, gradient_accumulation_steps=1, init_checkpoint=None, init_loss_scale=1048576, input_dir='/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training', json_summary='./output/dllogger.json', learning_rate=0.0004, local_rank=-1, log_freq=1.0, loss_scale=0.0, max_predictions_per_seq=20, max_seq_length=128, max_steps=100000.0, n_gpu=1, num_steps_per_checkpoint=20, num_train_epochs=3.0, output_dir='/public/home/hepj/outdir/torch/pre_wiki/phrase1', phase1_end_step=7038, phase2=False, resume_from_checkpoint=False, resume_step=-1, seed=12439, skip_checkpoint=False, steps_this_run=100000.0, train_batch_size=16, use_env=False, warmup_proportion=0.0, world_size=1)"]}}
+DLLL {"timestamp": "1689557920.976409", "datetime": "2023-07-17 09:38:40.976409", "elapsedtime": "7.379197", "type": "LOG", "step": "PARAMETER", "data": {"SEED": 12439}}
+DLLL {"timestamp": "1689557920.97655", "datetime": "2023-07-17 09:38:40.976550", "elapsedtime": "7.379338", "type": "LOG", "step": "PARAMETER", "data": {"train_start": true}}
+DLLL {"timestamp": "1689557920.976623", "datetime": "2023-07-17 09:38:40.976623", "elapsedtime": "7.379411", "type": "LOG", "step": "PARAMETER", "data": {"batch_size_per_gpu": 16}}
+DLLL {"timestamp": "1689557920.976687", "datetime": "2023-07-17 09:38:40.976687", "elapsedtime": "7.379475", "type": "LOG", "step": "PARAMETER", "data": {"learning_rate": 0.0004}}
+DLLL {"timestamp": "1689557925.873029", "datetime": "2023-07-17 09:38:45.873029", "elapsedtime": "12.275817", "type": "LOG", "step": [0, 1], "data": {"average_loss": 11.34375, "step_loss": 11.34375, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557926.215128", "datetime": "2023-07-17 09:38:46.215128", "elapsedtime": "12.617916", "type": "LOG", "step": [0, 2], "data": {"average_loss": 11.2890625, "step_loss": 11.2890625, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557926.557062", "datetime": "2023-07-17 09:38:46.557062", "elapsedtime": "12.95985", "type": "LOG", "step": [0, 3], "data": {"average_loss": 11.3359375, "step_loss": 11.3359375, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557926.898986", "datetime": "2023-07-17 09:38:46.898986", "elapsedtime": "13.301774", "type": "LOG", "step": [0, 4], "data": {"average_loss": 11.390625, "step_loss": 11.390625, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557927.240626", "datetime": "2023-07-17 09:38:47.240626", "elapsedtime": "13.643414", "type": "LOG", "step": [0, 5], "data": {"average_loss": 11.328125, "step_loss": 11.328125, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557927.712912", "datetime": "2023-07-17 09:38:47.712912", "elapsedtime": "14.1157", "type": "LOG", "step": [0, 6], "data": {"average_loss": 11.3359375, "step_loss": 11.3359375, "learning_rate": 0.000399997999995}}
+DLLL {"timestamp": "1689557928.103626", "datetime": "2023-07-17 09:38:48.103626", "elapsedtime": "14.506414", "type": "LOG", "step": [0, 7], "data": {"average_loss": 10.859375, "step_loss": 10.859375, "learning_rate": 0.0003999959999799998}}
+DLLL {"timestamp": "1689557928.489023", "datetime": "2023-07-17 09:38:48.489023", "elapsedtime": "14.891811", "type": "LOG", "step": [0, 8], "data": {"average_loss": 10.625, "step_loss": 10.625, "learning_rate": 0.00039999399995499935}}
+DLLL {"timestamp": "1689557928.873728", "datetime": "2023-07-17 09:38:48.873728", "elapsedtime": "15.276516", "type": "LOG", "step": [0, 9], "data": {"average_loss": 10.4140625, "step_loss": 10.4140625, "learning_rate": 0.0003999919999199984}}
+DLLL {"timestamp": "1689557929.258729", "datetime": "2023-07-17 09:38:49.258729", "elapsedtime": "15.661517", "type": "LOG", "step": [0, 10], "data": {"average_loss": 10.3125, "step_loss": 10.3125, "learning_rate": 0.0003999899998749969}}
+DLLL {"timestamp": "1689557929.645026", "datetime": "2023-07-17 09:38:49.645026", "elapsedtime": "16.047814", "type": "LOG", "step": [0, 11], "data": {"average_loss": 10.3515625, "step_loss": 10.3515625, "learning_rate": 0.0003999879998199946}}
+DLLL {"timestamp": "1689557930.031004", "datetime": "2023-07-17 09:38:50.031004", "elapsedtime": "16.433792", "type": "LOG", "step": [0, 12], "data": {"average_loss": 9.9453125, "step_loss": 9.9453125, "learning_rate": 0.0003999859997549914}}
+DLLL {"timestamp": "1689557930.417146", "datetime": "2023-07-17 09:38:50.417146", "elapsedtime": "16.819934", "type": "LOG", "step": [0, 13], "data": {"average_loss": 10.0, "step_loss": 10.0, "learning_rate": 0.00039998399967998725}}
+DLLL {"timestamp": "1689557930.80166", "datetime": "2023-07-17 09:38:50.801660", "elapsedtime": "17.204448", "type": "LOG", "step": [0, 14], "data": {"average_loss": 9.96875, "step_loss": 9.96875, "learning_rate": 0.0003999819995949818}}
+DLLL {"timestamp": "1689557931.18733", "datetime": "2023-07-17 09:38:51.187330", "elapsedtime": "17.590118", "type": "LOG", "step": [0, 15], "data": {"average_loss": 9.859375, "step_loss": 9.859375, "learning_rate": 0.00039997999949997504}}
+DLLL {"timestamp": "1689557931.573538", "datetime": "2023-07-17 09:38:51.573538", "elapsedtime": "17.976326", "type": "LOG", "step": [0, 16], "data": {"average_loss": 9.8515625, "step_loss": 9.8515625, "learning_rate": 0.0003999779993949667}}