Unverified Commit 03cdb2a3 authored by Thomas Wolf, committed by GitHub

Merge pull request #254 from huggingface/python_2

Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
parents 2dfaf2f2 1e71f11d
@@ -14,14 +14,13 @@
 # limitations under the License.
 """Tokenization classes."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import collections
-import unicodedata
-import os
 import logging
+import os
+import unicodedata
+from io import open
 from .file_utils import cached_path
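The hunk above collapses the three separate __future__ imports into one line that also pulls in unicode_literals, and adds from io import open; together these are the backbone of the Python 2 compatibility this PR advertises, since io.open accepts an encoding argument and returns unicode text on both interpreters. A minimal sketch of the pattern, assuming an illustrative file name and token list rather than anything from the repository:

    # -*- coding: utf-8 -*-
    # Runs identically on Python 2.7 and Python 3: io.open takes an encoding
    # argument, and unicode_literals makes the literals unicode strings on
    # Python 2, so no str/unicode mixing occurs.
    from __future__ import absolute_import, division, print_function, unicode_literals
    from io import open

    tokens = ["[UNK]", "voil\u00e0", "##ing"]  # illustrative tokens only

    with open("vocab_demo.txt", "w", encoding="utf-8") as f:  # hypothetical file name
        f.write("".join(token + "\n" for token in tokens))

    with open("vocab_demo.txt", "r", encoding="utf-8") as f:
        print([line.strip() for line in f])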
@@ -117,26 +116,26 @@ class BertTokenizer(object):
         return tokens
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
+            vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):
             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     vocab_file))
             return None
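Swapping FileNotFoundError for EnvironmentError in the except clause above is another Python 2 fix: FileNotFoundError only exists on Python 3, while EnvironmentError is defined on both versions (on Python 3 it is an alias of OSError, which FileNotFoundError subclasses), so missing files and failed downloads are still caught. A small sketch of the idea, with an illustrative path and helper name:

    from __future__ import print_function

    def read_text(path):
        # EnvironmentError exists on Python 2 and 3. On Python 2 a missing
        # file raises IOError, on Python 3 FileNotFoundError; both are
        # caught by this single except clause.
        try:
            with open(path) as f:
                return f.read()
        except EnvironmentError:
            print("Could not read {}".format(path))
            return None

    read_text("/no/such/illustrative/path.txt")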
@@ -145,10 +144,10 @@ class BertTokenizer(object):
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
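With the hunks above, BertTokenizer.from_pretrained accepts either a short-cut model name listed in PRETRAINED_VOCAB_ARCHIVE_MAP or a local path, and only applies the positional-embedding max_len cap when a known model name is used. A hedged usage sketch; the local directory path is illustrative:

    from pytorch_pretrained_bert.tokenization import BertTokenizer

    # Short-cut name: resolved through PRETRAINED_VOCAB_ARCHIVE_MAP, downloaded
    # to the cache, and max_len is capped at the model's positional embedding size.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Local directory (illustrative path): VOCAB_NAME ("vocab.txt") is appended,
    # so the directory only needs to contain the vocabulary file.
    local_tokenizer = BertTokenizer.from_pretrained("/path/to/my-bert-vocab-dir")

    print(tokenizer.tokenize("unwanted running"))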
This diff is collapsed.
@@ -33,12 +33,13 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
 """
+from io import open
 from setuptools import find_packages, setup
 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    version="0.5.0",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,10 +56,10 @@ setup(
                       'tqdm'],
     entry_points={
         'console_scripts': [
-            "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+            "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
         ]
     },
-    python_requires='>=3.5.0',
+    # python_requires='>=3.5.0',
     tests_require=['pytest'],
     classifiers=[
         'Intended Audience :: Science/Research',
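Three packaging details sit in this file: from io import open is added because the long_description line already passes encoding='utf-8', which the built-in open rejects on Python 2; the console_scripts entry simply gains a trailing comma; and python_requires='>=3.5.0' is commented out so the 0.5.0 release can also be installed under Python 2.7. For readers unfamiliar with console_scripts, here is a hypothetical minimal __main__.py of the shape the entry point maps to (not the project's actual file):

    # pytorch_pretrained_bert/__main__.py -- hypothetical minimal version.
    # The entry point "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
    # makes pip install a pytorch_pretrained_bert command that calls main().
    import sys

    def main():
        print("arguments:", sys.argv[1:])

    if __name__ == "__main__":
        main()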
This diff is collapsed.
@@ -114,6 +114,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertModel(config=config)
+            model.eval()
             all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
             outputs = {
                 "sequence_output": all_encoder_layers[-1],
@@ -134,6 +135,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForMaskedLM(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             prediction_scores = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -149,6 +151,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -165,6 +168,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForPreTraining(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -185,6 +189,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForQuestionAnswering(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -205,6 +210,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -221,6 +227,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
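The recurring change in the seven test hunks above is the model.eval() call right after each model is constructed. In PyTorch, eval() switches dropout (and batch-norm) layers to evaluation mode, so repeated forward passes over the same inputs give identical outputs, which keeps these tests deterministic. A small self-contained illustration; the module is an arbitrary example, not one of the BERT models:

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    layer = nn.Sequential(nn.Linear(4, 4), nn.Dropout(p=0.5))  # illustrative module
    x = torch.ones(1, 4)

    # Training mode: dropout is active, so two passes usually differ.
    layer.train()
    print(torch.equal(layer(x), layer(x)))  # usually False

    # Eval mode: dropout is a no-op, so outputs are repeatable.
    layer.eval()
    print(torch.equal(layer(x), layer(x)))  # True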
This diff is collapsed.
@@ -12,15 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
+from io import open
-from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
-                                                  _is_whitespace, _is_control, _is_punctuation)
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  WordpieceTokenizer,
+                                                  _is_control, _is_punctuation,
+                                                  _is_whitespace)
 class TokenizationTest(unittest.TestCase):
@@ -30,7 +32,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
@@ -49,7 +51,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
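Both fixtures above write a throwaway WordPiece vocabulary to /tmp and build a tokenizer from it; the only change is the explicit encoding='utf-8', so the unicode literals produced under unicode_literals are written consistently regardless of the platform's default encoding. A hedged sketch of the same pattern using tempfile instead of a hard-coded path; the vocabulary and input string are illustrative:

    from __future__ import absolute_import, division, print_function, unicode_literals
    import os
    import tempfile
    from io import open

    from pytorch_pretrained_bert.tokenization import BertTokenizer

    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "runn", "##ing"]

    # Write the vocabulary with an explicit encoding so the same code runs on Python 2 and 3.
    fd, vocab_file = tempfile.mkstemp(suffix=".txt")
    os.close(fd)
    with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
        vocab_writer.write("".join(token + "\n" for token in vocab_tokens))

    tokenizer = BertTokenizer(vocab_file)
    print(tokenizer.tokenize("wanted running"))  # expected: ['want', '##ed', 'runn', '##ing']
    os.remove(vocab_file)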
This diff is collapsed.