Commit 68a889ee (unverified) in chenpangpang/transformers
Authored Apr 17, 2019 by Thomas Wolf; committed by GitHub on Apr 17, 2019
Parents: 929579f3, 34ae5bf8

Merge pull request #500 from huggingface/network

Updating network handling
Showing 11 changed files with 131 additions and 17 deletions (+131 / -17):

.circleci/config.yml                     +2  -2
pytorch_pretrained_bert/file_utils.py    +25 -7
tests/conftest.py (new file)             +19 -0
tests/modeling_gpt2_test.py              +11 -1
tests/modeling_openai_test.py            +11 -1
tests/modeling_test.py                   +11 -0
tests/modeling_transfo_xl_test.py        +11 -1
tests/tokenization_gpt2_test.py          +10 -2
tests/tokenization_openai_test.py        +11 -1
tests/tokenization_test.py               +10 -1
tests/tokenization_transfo_xl_test.py    +10 -1
.circleci/config.yml
@@ -9,7 +9,7 @@ jobs:
       - run: sudo pip install --progress-bar off .
       - run: sudo pip install pytest ftfy spacy
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
   build_py2:
     working_directory: ~/pytorch-pretrained-BERT
     docker:
@@ -20,7 +20,7 @@ jobs:
       - run: sudo pip install pytest spacy
       - run: sudo pip install ftfy==4.4.3
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
 workflows:
   version: 2
   build_and_test:
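Both CI jobs now pass --runslow to pytest, so the download tests marked slow elsewhere in this commit still execute in CI, while a plain local "python -m pytest -sv tests/" skips them by default.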
pytorch_pretrained_bert/file_utils.py
@@ -5,11 +5,13 @@ Copyright by the AllenNLP authors.
 """
 from __future__ import (absolute_import, division, print_function, unicode_literals)
 
+import sys
 import json
 import logging
 import os
 import shutil
 import tempfile
+import fnmatch
 from functools import wraps
 from hashlib import sha256
 import sys
@@ -191,17 +193,30 @@ def get_from_cache(url, cache_dir=None):
     if url.startswith("s3://"):
         etag = s3_etag(url)
     else:
-        response = requests.head(url, allow_redirects=True)
-        if response.status_code != 200:
-            raise IOError("HEAD request failed for url {} with status code {}"
-                          .format(url, response.status_code))
-        etag = response.headers.get("ETag")
+        try:
+            response = requests.head(url, allow_redirects=True)
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except EnvironmentError:
+            etag = None
+
+    if sys.version_info[0] == 2 and etag is not None:
+        etag = etag.decode('utf-8')
 
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
 
+    # If we don't have a connection (etag is None) and can't identify the file
+    # try to get the last downloaded one
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
     if not os.path.exists(cache_path):
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
@@ -226,8 +241,11 @@ def get_from_cache(url, cache_dir=None):
             logger.info("creating metadata file for %s", cache_path)
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
-            with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                meta_file.write(json.dumps(meta))
+            with open(meta_path, 'w') as meta_file:
+                output_string = json.dumps(meta)
+                if sys.version_info[0] == 2 and isinstance(output_string, str):
+                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                meta_file.write(output_string)
 
             logger.info("removing temp file %s", temp_file.name)
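The net effect: get_from_cache() no longer raises when the HEAD request fails (no network, or a non-200 response); it records etag = None and, if the exact cache entry is missing, falls back to the most recently downloaded file whose name starts with the URL's hashed filename. A minimal self-contained sketch of that fallback selection rule; the helper name is illustrative and not part of the diff:

    import fnmatch
    import os

    def latest_cached_candidate(cache_dir, filename):
        # Same rule as the diff above: match cached entries by the hashed
        # filename prefix, drop the '.json' metadata files, and take the
        # last match in listing order.
        matching = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
        matching = [m for m in matching if not m.endswith('.json')]
        return os.path.join(cache_dir, matching[-1]) if matching else None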
tests/conftest.py (new file, 0 → 100644)
@@ -0,0 +1,19 @@
+# content of conftest.py
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--runslow", action="store_true", default=False, help="run slow tests"
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runslow"):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+    for item in items:
+        if "slow" in item.keywords:
+            item.add_marker(skip_slow)
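These two hooks are the standard pytest pattern for opt-in slow tests: pytest_addoption registers the --runslow flag, and pytest_collection_modifyitems attaches a skip marker to every collected test carrying the "slow" keyword unless the flag was given. A minimal sketch of a test module using it (hypothetical test names, not from this commit):

    import pytest

    @pytest.mark.slow
    def test_download_checkpoint():
        # Skipped by default; runs with `python -m pytest --runslow`.
        assert True

    def test_fast_path():
        # Always collected and run.
        assert True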
tests/modeling_gpt2_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class GPT2ModelTest(unittest.TestCase):
     class GPT2ModelTester(object):
@@ -185,6 +187,14 @@ class GPT2ModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_gpt2_model(*config_and_inputs)
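The three remaining model test files follow the same pattern: import shutil and pytest, import the module's PRETRAINED_MODEL_ARCHIVE_MAP, and add a @pytest.mark.slow smoke test that loads the first pretrained checkpoint via from_pretrained, clears the cache directory, and asserts the model is not None.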
tests/modeling_openai_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class OpenAIGPTModelTest(unittest.TestCase):
     class OpenAIGPTModelTester(object):
@@ -197,6 +199,14 @@ class OpenAIGPTModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_openai_model(*config_and_inputs)
tests/modeling_test.py
@@ -20,6 +20,8 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
@@ -27,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification)
+from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class BertModelTest(unittest.TestCase):
@@ -260,6 +263,14 @@ class BertModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_bert_model(*config_and_inputs)
tests/modeling_transfo_xl_test.py
@@ -20,11 +20,13 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
@@ -195,6 +197,14 @@ class TransfoXLModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
tests/tokenization_gpt2_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class GPT2TokenizationTest(unittest.TestCase):
@@ -38,7 +40,6 @@ class GPT2TokenizationTest(unittest.TestCase):
             merges_file = fp.name
         tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        print("encoder", tokenizer.byte_encoder)
 
         os.remove(vocab_file)
         os.remove(merges_file)
@@ -64,6 +65,13 @@
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    # @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
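Note that in this file the marker is left commented out (# @pytest.mark.slow), so the GPT-2 tokenizer download test still runs without --runslow; the tokenizer test files below apply the marker for real.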
tests/tokenization_openai_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -64,6 +66,14 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 if __name__ == '__main__':
     unittest.main()
tests/tokenization_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
 
 class TokenizationTest(unittest.TestCase):
@@ -56,6 +58,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
     def test_chinese(self):
         tokenizer = BasicTokenizer()
tests/tokenization_transfo_xl_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class TransfoXLTokenizationTest(unittest.TestCase):
@@ -66,6 +68,13 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()