"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e4679cddced7d746427066a78e8079fb40e51528"
Unverified commit 68a889ee, authored by Thomas Wolf, committed by GitHub

Merge pull request #500 from huggingface/network

Updating network handling
parents 929579f3 34ae5bf8
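
Networking is now handled more defensively: when the HEAD request for a file's ETag fails (e.g. no connection), the cache falls back to the last downloaded copy of that file instead of raising. The PR also adds a --runslow pytest option (tests/conftest.py) and slow-marked tests that download one pretrained model or vocabulary per test class.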
@@ -9,7 +9,7 @@ jobs:
       - run: sudo pip install --progress-bar off .
       - run: sudo pip install pytest ftfy spacy
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
   build_py2:
     working_directory: ~/pytorch-pretrained-BERT
     docker:
@@ -20,7 +20,7 @@ jobs:
       - run: sudo pip install pytest spacy
       - run: sudo pip install ftfy==4.4.3
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
 workflows:
   version: 2
   build_and_test:
...
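Both CI jobs (Python 3 and Python 2) now run pytest with --runslow, so the slow download tests added below are exercised on CircleCI.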
@@ -5,11 +5,13 @@ Copyright by the AllenNLP authors.
 """
 from __future__ import (absolute_import, division, print_function, unicode_literals)
 
+import sys
 import json
 import logging
 import os
 import shutil
 import tempfile
+import fnmatch
 from functools import wraps
 from hashlib import sha256
 import sys
@@ -191,17 +193,30 @@ def get_from_cache(url, cache_dir=None):
     if url.startswith("s3://"):
         etag = s3_etag(url)
     else:
-        response = requests.head(url, allow_redirects=True)
-        if response.status_code != 200:
-            raise IOError("HEAD request failed for url {} with status code {}"
-                          .format(url, response.status_code))
-        etag = response.headers.get("ETag")
+        try:
+            response = requests.head(url, allow_redirects=True)
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except EnvironmentError:
+            etag = None
+
+    if sys.version_info[0] == 2 and etag is not None:
+        etag = etag.decode('utf-8')
 
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
 
+    # If we don't have a connection (etag is None) and can't identify the file
+    # try to get the last downloaded one
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
     if not os.path.exists(cache_path):
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
@@ -226,8 +241,11 @@ def get_from_cache(url, cache_dir=None):
         logger.info("creating metadata file for %s", cache_path)
         meta = {'url': url, 'etag': etag}
         meta_path = cache_path + '.json'
-        with open(meta_path, 'w', encoding="utf-8") as meta_file:
-            meta_file.write(json.dumps(meta))
+        with open(meta_path, 'w') as meta_file:
+            output_string = json.dumps(meta)
+            if sys.version_info[0] == 2 and isinstance(output_string, str):
+                output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+            meta_file.write(output_string)
 
         logger.info("removing temp file %s", temp_file.name)
...
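These two hunks (apparently pytorch_pretrained_bert/file_utils.py, given the AllenNLP copyright header) are the heart of the change: a failed or non-200 HEAD request now yields etag = None instead of raising, and a None etag makes get_from_cache fall back to the most recently downloaded copy of the same URL. A minimal, self-contained sketch of that fallback, assuming the sha256-based filename scheme of url_to_filename; fetch_etag and resolve_cache_path are illustrative names, not the library's API:

    import fnmatch
    import os
    from hashlib import sha256

    import requests


    def url_to_filename(url, etag=None):
        # Cache files are named "<sha256(url)>.<sha256(etag)>" (assumed scheme).
        filename = sha256(url.encode('utf-8')).hexdigest()
        if etag:
            filename += '.' + sha256(etag.encode('utf-8')).hexdigest()
        return filename


    def fetch_etag(url):
        # Network trouble is signalled with None instead of an exception.
        try:
            response = requests.head(url, allow_redirects=True)
        except EnvironmentError:  # requests' errors subclass IOError
            return None
        if response.status_code != 200:
            return None
        return response.headers.get("ETag")


    def resolve_cache_path(url, cache_dir):
        etag = fetch_etag(url)
        cache_path = os.path.join(cache_dir, url_to_filename(url, etag))
        if etag is None and not os.path.exists(cache_path):
            # Offline: reuse the newest earlier download of this URL, i.e. any
            # "<url-hash>.<etag-hash>" file, skipping ".json" metadata files.
            matching = fnmatch.filter(os.listdir(cache_dir), url_to_filename(url) + '.*')
            matching = [f for f in matching if not f.endswith('.json')]
            if matching:
                cache_path = os.path.join(cache_dir, matching[-1])
        return cache_path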
# content of conftest.py
import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--runslow", action="store_true", default=False, help="run slow tests"
    )


def pytest_collection_modifyitems(config, items):
    if config.getoption("--runslow"):
        # --runslow given in cli: do not skip slow tests
        return
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)
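
This is the standard pytest recipe for opt-in slow tests. A hypothetical test module showing the marker and the two invocations (the file and test names here are made up for illustration):

    # tests/test_example.py (hypothetical)
    import pytest


    @pytest.mark.slow
    def test_download_pretrained_weights():
        # Would hit the network; skipped unless pytest is given --runslow.
        assert True

    # python -m pytest -sv tests/             -> reported as skipped
    # python -m pytest -sv tests/ --runslow   -> actually runs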
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class GPT2ModelTest(unittest.TestCase):
     class GPT2ModelTester(object):
@@ -185,6 +187,14 @@ class GPT2ModelTest(unittest.TestCase):
             os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_gpt2_model(*config_and_inputs)
...
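The same @pytest.mark.slow smoke test recurs below for the OpenAI GPT, BERT and Transformer-XL model suites, and an analogous test_tokenizer_from_pretrained is added to each tokenizer suite: download the first entry of the archive map, remove the cache directory, and assert the object was constructed.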
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class OpenAIGPTModelTest(unittest.TestCase):
     class OpenAIGPTModelTester(object):
@@ -197,6 +199,14 @@ class OpenAIGPTModelTest(unittest.TestCase):
             os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_openai_model(*config_and_inputs)
...
@@ -20,6 +20,8 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
@@ -27,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification)
+from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class BertModelTest(unittest.TestCase):
@@ -260,6 +263,14 @@ class BertModelTest(unittest.TestCase):
             os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_bert_model(*config_and_inputs)
...
@@ -20,11 +20,13 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
@@ -195,6 +197,14 @@ class TransfoXLModelTest(unittest.TestCase):
             os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
...
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class GPT2TokenizationTest(unittest.TestCase):
@@ -38,7 +40,6 @@ class GPT2TokenizationTest(unittest.TestCase):
             merges_file = fp.name
 
         tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        print("encoder", tokenizer.byte_encoder)
         os.remove(vocab_file)
         os.remove(merges_file)
@@ -64,6 +65,13 @@ class GPT2TokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    # @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 if __name__ == '__main__':
     unittest.main()
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -64,6 +66,14 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 if __name__ == '__main__':
     unittest.main()
@@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
 
 class TokenizationTest(unittest.TestCase):
@@ -56,6 +58,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
     def test_chinese(self):
         tokenizer = BasicTokenizer()
...
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class TransfoXLTokenizationTest(unittest.TestCase):
@@ -66,6 +68,13 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 if __name__ == '__main__':
     unittest.main()
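
To exercise these download tests locally, mirror the CI invocation: python -m pytest -sv tests/ --runslow. Without the flag they are skipped; note that the GPT-2 tokenizer test has its slow marker commented out, so it always runs.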