Merge branch 'xlnet'

f31154cb · thomwolf · 78462aad · 1b35d05d · f31154cb · f31154cb
Commit f31154cb authored Jul 16, 2019 by thomwolf
5 changed files
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for XLNet model."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+import os
+from shutil import copyfile
+
+import unicodedata
+import six
+
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+
+logger = logging.getLogger(__name__)
+
+# File name under which the SentencePiece model is written by
+# XLNetTokenizer.save_vocabulary (key 'vocab_file').
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+# For each vocabulary file key, the URL of the SentencePiece model of every
+# pretrained shortcut name listed here.
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    }
+}
+
+# Exposed as XLNetTokenizer.max_model_input_sizes. Both entries are None;
+# NOTE(review): presumably None means "no fixed maximum input length" --
+# confirm against PreTrainedTokenizer.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlnet-base-cased': None,
+    'xlnet-large-cased': None,
+}
+
+# Marker character found in SentencePiece pieces; convert_tokens_to_string
+# turns it back into a space.
+SPIECE_UNDERLINE = u'▁'
+
+# Segments (not really needed)
+# These ids are not referenced by the tokenizer class defined in this module.
+SEG_ID_A   = 0
+SEG_ID_B   = 1
+SEG_ID_CLS = 2
+SEG_ID_SEP = 3
+SEG_ID_PAD = 4
+
+class XLNetTokenizer(PreTrainedTokenizer):
+    """
+        SentencePiece based tokenizer. Peculiarities:
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+
+        Args:
+            vocab_file: path of the SentencePiece model file to load.
+            max_len: accepted for signature compatibility; not used by this constructor.
+            do_lower_case: lower-case the text in `preprocess_text`.
+            remove_space: strip the text and collapse runs of whitespace into single spaces.
+            keep_accents: when False, accents are removed (NFKD normalization, combining
+                characters dropped).
+            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token,
+            additional_special_tokens, **kwargs: forwarded to `PreTrainedTokenizer.__init__`.
+
+        Raises:
+            ImportError: if the `sentencepiece` package is not installed.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, max_len=None,
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        # The default list object is shared by every call: hand the base class a copy
+        # so that nothing downstream can mutate the default (or the caller's list).
+        if isinstance(additional_special_tokens, list):
+            additional_special_tokens = list(additional_special_tokens)
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token, additional_special_tokens=
+                                             additional_special_tokens, **kwargs)
+
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self._load_sentencepiece_model()
+
+    def _load_sentencepiece_model(self):
+        """ Create `self.sp_model` and load `self.vocab_file` into it.
+
+            Shared by `__init__` and `__setstate__`. The import is done lazily so that the
+            module can be imported without SentencePiece; a missing package is logged and
+            the ImportError is re-raised instead of failing later on an unbound name.
+        """
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece "
+                           "pip install sentencepiece")
+            raise
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    @property
+    def vocab_size(self):
+        """ Number of pieces in the loaded SentencePiece model. """
+        return len(self.sp_model)
+
+    def __getstate__(self):
+        """ Return the instance dict with `sp_model` set to None for pickling. """
+        state = self.__dict__.copy()
+        # The processor is dropped here and rebuilt from `vocab_file` in __setstate__.
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        """ Restore the instance dict and reload the SentencePiece model from `vocab_file`. """
+        self.__dict__ = d
+        self._load_sentencepiece_model()
+
+    def preprocess_text(self, inputs):
+        """ Normalize raw text before it is given to SentencePiece.
+
+            Steps, in order: optional whitespace clean-up (`remove_space`), replacement of
+            the quote pairs `` and '' by a double quote, decoding of byte strings under
+            Python 2, optional accent removal (unless `keep_accents`) and optional
+            lower-casing (`do_lower_case`).
+        """
+        if self.remove_space:
+            outputs = ' '.join(inputs.strip().split())
+        else:
+            outputs = inputs
+        outputs = outputs.replace("``", '"').replace("''", '"')
+
+        if six.PY2 and isinstance(outputs, str):
+            outputs = outputs.decode('utf-8')
+
+        if not self.keep_accents:
+            # NFKD splits accented characters into base character + combining marks,
+            # the combining marks are then dropped.
+            outputs = unicodedata.normalize('NFKD', outputs)
+            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+        if self.do_lower_case:
+            outputs = outputs.lower()
+
+        return outputs
+
+    def _tokenize(self, text, return_unicode=True, sample=False):
+        """ Tokenize a string.
+            return_unicode is used only for py2
+
+            When `sample` is True the pieces come from `SampleEncodeAsPieces(text, 64, 0.1)`
+            instead of `EncodeAsPieces(text)`.
+            Returns the list of pieces.
+        """
+        text = self.preprocess_text(text)
+        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
+        if six.PY2 and isinstance(text, unicode):
+            text = text.encode('utf-8')
+
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        new_pieces = []
+        for piece in pieces:
+            # A piece ending with "<digit>," is split again so that the comma
+            # becomes a piece of its own.
+            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(
+                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
+                # Re-encoding may put a leading underline on the first piece; remove it
+                # when the original piece did not start with one.
+                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+                    if len(cur_pieces[0]) == 1:
+                        cur_pieces = cur_pieces[1:]
+                    else:
+                        cur_pieces[0] = cur_pieces[0][1:]
+                cur_pieces.append(piece[-1])
+                new_pieces.extend(cur_pieces)
+            else:
+                new_pieces.append(piece)
+
+        # note(zhiliny): convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in new_pieces:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            new_pieces = ret_pieces
+
+        return new_pieces
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        # Under Python 2 the piece may be a byte string; decode it when unicode is requested.
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        # Pieces are concatenated, every underline marker becomes a space and the
+        # surrounding whitespace is stripped.
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+
+            Returns a 1-tuple with the path of the saved vocabulary file, or None (after
+            logging an error) when `save_directory` is not an existing directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        # Nothing to copy when the model file already is the target file.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ boto3
 # Used for downloading models over HTTP
 requests
 # For OpenAI GPT
-regex
\ No newline at end of file
+regex
+# For XLNet
+sentencepiece
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -37,16 +37,16 @@ from io import open
 from setuptools import find_packages, setup

 setup(
-    name="pytorch_pretrained_bert",
-    version="0.6.2",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    name="pytorch_transformers",
+    version="0.7.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
    author_email="thomas@huggingface.co",
-    description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
+    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
-    keywords='BERT NLP deep learning google',
+    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
    license='Apache',
-    url="https://github.com/huggingface/pytorch-pretrained-BERT",
+    url="https://github.com/huggingface/pytorch-transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
    install_requires=['torch>=0.4.1',
@@ -54,10 +54,11 @@ setup(
                      'boto3',
                      'requests',
                      'tqdm',
-                      'regex'],
+                      'regex',
+                      'sentencepiece'],
    entry_points={
      'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
+        "pytorch_transformers=pytorch_transformers.__main__:main",
      ]
    },
    # python_requires='>=3.5.0',

--- a/tests/modeling_gpt2_test.py
+++ b/tests/modeling_gpt2_test.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
-                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
-from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
-
-class GPT2ModelTest(unittest.TestCase):
-    class GPT2ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_position_ids=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_special=1,
-                     n_positions=33,
-                     n_embd=32,
-                     n_layer=5,
-                     n_head=4,
-                     n_choices=3,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     scope=None):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_position_ids = use_position_ids
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.n_choices = n_choices
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            total_num_tokens = self.vocab_size + self.n_special
-            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
-
-            position_ids = None
-            if self.use_position_ids:
-                position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                total_voc = self.vocab_size
-                token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-            mc_labels = None
-            lm_labels = None
-            mc_token_ids = None
-            if self.use_labels:
-                mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
-                lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
-                n_special=self.n_special,
-                n_positions=self.n_positions,
-                n_embd=self.n_embd,
-                n_layer=self.n_layer,
-                n_head=self.n_head,
-                initializer_range=self.initializer_range)
-
-            return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_ids)
-
-        def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_ids):
-            model = GPT2Model(config)
-            model.eval()
-            hidden_states, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "hidden_states": hidden_states,
-                "presents": presents,
-            }
-            return outputs
-
-        def check_gpt2_model_output(self, result):
-            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
-            self.parent.assertListEqual(
-                list(result["hidden_states"][0].size()),
-                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
-
-
-        def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2LMHeadModel(config)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            lm_logits, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "presents": presents,
-            }
-            return outputs
-
-        def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2LMHeadModel(config, output_attentions=True)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "presents": presents,
-                "attentions": attentions,
-            }
-            return outputs
-
-        def check_gpt2_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertEqual(self.n_layer, len(result["presents"]))
-            self.parent.assertListEqual(
-                list(result["presents"][0].size()),
-                [2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head])
-
-        def check_gpt2_lm_head_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-
-        def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2DoubleHeadsModel(config)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-                "presents": presents,
-            }
-            return outputs
-
-        def create_gpt2_double_heads_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2DoubleHeadsModel(config, output_attentions=True)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-                "presents": presents,
-                "attentions": attentions,
-            }
-            return outputs
-
-        def check_gpt2_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].size()),
-                [self.batch_size, self.n_choices])
-
-        def check_gpt2_double_heads_loss_output(self, result):
-            self.parent.assertListEqual(
-                [list(l.size()) for l in result["loss"]],
-                [[], []])
-
-        def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
-                                                mc_labels, lm_labels, mc_token_ids):
-            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)
-                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
-                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                if isinstance(model, GPT2DoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
-                else:
-                    output = model(input_ids, head_mask=head_mask)
-
-                if isinstance(model, GPT2Model):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output[:-1])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[1].nonzero()),
-                    multihead_outputs[1].numel())
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-        def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
-                                                   mc_labels, lm_labels, mc_token_ids):
-            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                transformer = model if isinstance(model, GPT2Model) else model.transformer
-                heads_to_prune = {0: list(range(1, self.n_head)),
-                                  -1: [0]}
-                transformer.prune_heads(heads_to_prune)
-                if isinstance(model, GPT2DoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids)
-                else:
-                    output = model(input_ids)
-
-                if isinstance(model, GPT2Model):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output[:-1])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = transformer.get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, 1,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head-1,
-                        self.seq_length, self.n_embd // self.n_head])
-
-
-    def test_default(self):
-        self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
-
-    def test_config_to_json_string(self):
-        config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["vocab_size"], 99)
-        self.assertEqual(obj["n_embd"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = GPT2Config.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_gpt2_model(*config_and_inputs)
-        tester.check_gpt2_model_output(output_result)
-
-        output_result = tester.create_gpt2_lm_head(*config_and_inputs)
-        tester.check_gpt2_lm_head_output(output_result)
-        tester.check_gpt2_lm_head_loss_output(output_result)
-
-        output_result = tester.create_gpt2_double_heads(*config_and_inputs)
-        tester.check_gpt2_double_heads_output(output_result)
-        tester.check_gpt2_double_heads_loss_output(output_result)
-
-        tester.create_and_check_gpt2_for_headmasking(*config_and_inputs)
-        tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs)
-
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
-                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
-
-class OpenAIGPTModelTest(unittest.TestCase):
-    class OpenAIGPTModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_position_ids=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_special=1,
-                     n_positions=33,
-                     n_embd=32,
-                     n_layer=5,
-                     n_head=4,
-                     n_choices=3,
-                     afn="gelu",
-                     resid_pdrop=0.1,
-                     attn_pdrop=0.1,
-                     embd_pdrop=0.1,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     scope=None):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_position_ids = use_position_ids
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.n_choices = n_choices
-            self.resid_pdrop = resid_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
-
-            position_ids = None
-            if self.use_position_ids:
-                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                total_voc = self.vocab_size + self.n_special
-                token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-            mc_labels = None
-            lm_labels = None
-            mc_token_ids = None
-            if self.use_labels:
-                mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
-                lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
-                n_positions=self.n_positions,
-                n_special=self.n_special,
-                n_embd=self.n_embd,
-                n_layer=self.n_layer,
-                n_head=self.n_head,
-                afn=self.afn,
-                resid_pdrop=self.resid_pdrop,
-                attn_pdrop=self.attn_pdrop,
-                embd_pdrop=self.embd_pdrop,
-                initializer_range=self.initializer_range)
-
-            return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_ids)
-
-        def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTModel(config)
-            model.eval()
-            hidden_states = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "hidden_states": hidden_states,
-            }
-            return outputs
-
-        def check_openai_model_output(self, result):
-            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
-            self.parent.assertListEqual(
-                list(result["hidden_states"][0].size()),
-                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
-
-
-        def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTLMHeadModel(config)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            lm_logits = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-            }
-            return outputs
-
-        def check_openai_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-
-        def check_openai_lm_head_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-
-        def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTDoubleHeadsModel(config)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-            }
-            return outputs
-
-        def check_openai_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].size()),
-                [self.batch_size, self.n_choices])
-
-        def check_openai_double_heads_loss_output(self, result):
-            self.parent.assertListEqual(
-                [list(l.size()) for l in result["loss"]],
-                [[], []])
-
-        def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
-                                                mc_labels, lm_labels, mc_token_ids):
-            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)
-                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
-                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                if isinstance(model, OpenAIGPTDoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
-                else:
-                    output = model(input_ids, head_mask=head_mask)
-
-                if isinstance(model, OpenAIGPTModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
-                output = output.sum()
-                output.backward()
-                multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[1].nonzero()),
-                    multihead_outputs[1].numel())
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-
-        def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
-                                                     mc_labels, lm_labels, mc_token_ids):
-            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer
-                heads_to_prune = {0: list(range(1, self.n_head)),
-                                  -1: [0]}
-                transformer.prune_heads(heads_to_prune)
-                if isinstance(model, OpenAIGPTDoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids)
-                else:
-                    output = model(input_ids)
-
-                if isinstance(model, OpenAIGPTModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
-                output = output.sum()
-                output.backward()
-                multihead_outputs = transformer.get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, 1,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head-1,
-                        self.seq_length, self.n_embd // self.n_head])
-
-
-    def test_default(self):
-        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
-
-    def test_config_to_json_string(self):
-        config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["vocab_size"], 99)
-        self.assertEqual(obj["n_embd"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = OpenAIGPTConfig.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_openai_model(*config_and_inputs)
-        tester.check_openai_model_output(output_result)
-
-        output_result = tester.create_openai_lm_head(*config_and_inputs)
-        tester.check_openai_lm_head_output(output_result)
-        tester.check_openai_lm_head_loss_output(output_result)
-
-        output_result = tester.create_openai_double_heads(*config_and_inputs)
-        tester.check_openai_double_heads_output(output_result)
-        tester.check_openai_double_heads_loss_output(output_result)
-
-        tester.create_and_check_openai_for_headmasking(*config_and_inputs)
-        tester.create_and_check_openai_for_head_pruning(*config_and_inputs)
-
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
-
-if __name__ == "__main__":
-    unittest.main()