"vscode:/vscode.git/clone" did not exist on "a86ee2261e3a6c5915220c2c74829f9485803d63"
Unverified commit 03cdb2a3, authored by Thomas Wolf, committed by GitHub

Merge pull request #254 from huggingface/python_2

Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
parents 2dfaf2f2 1e71f11d
@@ -14,14 +14,13 @@
 # limitations under the License.
 """Tokenization classes."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

 import collections
-import unicodedata
-import os
 import logging
+import os
+import unicodedata
+from io import open

 from .file_utils import cached_path

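The collapsed `__future__` import plus `from io import open` is the core of the Python 2 compatibility change: `io.open` accepts an `encoding` argument on Python 2, matching the Python 3 built-in `open`. A minimal sketch of the pattern, with an illustrative (hypothetical) file path:

# Sketch of the compatibility idiom used above; the path is made up for the example.
from __future__ import absolute_import, division, print_function, unicode_literals

from io import open  # on Python 2, io.open supports encoding=... like the Python 3 builtin

with open("vocab.txt", "r", encoding="utf-8") as reader:
    lines = reader.readlines()  # unicode strings on both Python 2 and Python 3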
@@ -117,26 +116,26 @@ class BertTokenizer(object):
         return tokens

     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
+            vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):
             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     vocab_file))
             return None
@@ -145,10 +144,10 @@ class BertTokenizer(object):
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
...
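The rename from `pretrained_model_name` to `pretrained_model_name_or_path` makes explicit that the argument can be either a shortcut name registered in `PRETRAINED_VOCAB_ARCHIVE_MAP` or a local directory/vocab file, and the switch from `FileNotFoundError` to `EnvironmentError` keeps the handler valid on Python 2, where `FileNotFoundError` does not exist. A brief usage sketch; the local directory path below is hypothetical:

from pytorch_pretrained_bert import BertTokenizer

# Shortcut name: the vocabulary is downloaded and cached via cached_path().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Local directory containing vocab.txt (path is illustrative); no download is attempted.
local_tokenizer = BertTokenizer.from_pretrained('/path/to/bert_model_dir')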
@@ -33,12 +33,13 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.

 """
+from io import open
 from setuptools import find_packages, setup

 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
+    version="0.5.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,10 +56,10 @@ setup(
                       'tqdm'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
-    python_requires='>=3.5.0',
+    # python_requires='>=3.5.0',
     tests_require=['pytest'],
     classifiers=[
         'Intended Audience :: Science/Research',
...
@@ -114,6 +114,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertModel(config=config)
+            model.eval()
             all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
             outputs = {
                 "sequence_output": all_encoder_layers[-1],
@@ -134,6 +135,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForMaskedLM(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             prediction_scores = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -149,6 +151,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -165,6 +168,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForPreTraining(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -185,6 +189,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForQuestionAnswering(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -205,6 +210,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -221,6 +227,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
...
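Each test hunk adds `model.eval()` before the forward passes, putting the model in evaluation mode so dropout is disabled and the test outputs are deterministic. A minimal sketch of the effect, not taken from the repository, illustrating why eval mode matters for these checks:

import torch

# Dropout is active in training mode, so two forward passes can differ;
# eval() turns it into a no-op, making outputs repeatable in tests.
layer = torch.nn.Dropout(p=0.5)
x = torch.ones(2, 4)

layer.train()
print(torch.equal(layer(x), layer(x)))  # usually False: random dropout masks differ

layer.eval()
print(torch.equal(layer(x), layer(x)))  # True: dropout is disabled in eval mode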
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
import json

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer


class OpenAIGPTTokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name

        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)


if __name__ == '__main__':
    unittest.main()
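Beyond the toy vocab and merges files built in this test, the new tokenizer is intended to follow the same `from_pretrained` convention as `BertTokenizer`. A hedged usage sketch, assuming an 'openai-gpt' shortcut is registered in the tokenizer's pretrained archive map and that the optional text-standardization dependencies (ftfy, spacy) are installed:

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer

# Assumes the 'openai-gpt' shortcut exists in the pretrained archive map;
# vocab and merges files are downloaded and cached, as for the BERT tokenizers.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
tokens = tokenizer.tokenize("lower newer")
ids = tokenizer.convert_tokens_to_ids(tokens)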