Commit 68a889ee (unverified) in chenpangpang/transformers
Authored Apr 17, 2019 by Thomas Wolf; committed by GitHub on Apr 17, 2019
Parents: 929579f3, 34ae5bf8

Merge pull request #500 from huggingface/network

Updating network handling
Showing 11 changed files with 131 additions and 17 deletions (+131 / -17):

.circleci/config.yml                     +2  -2
pytorch_pretrained_bert/file_utils.py    +25 -7
tests/conftest.py (new file)             +19 -0
tests/modeling_gpt2_test.py              +11 -1
tests/modeling_openai_test.py            +11 -1
tests/modeling_test.py                   +11 -0
tests/modeling_transfo_xl_test.py        +11 -1
tests/tokenization_gpt2_test.py          +10 -2
tests/tokenization_openai_test.py        +11 -1
tests/tokenization_test.py               +10 -1
tests/tokenization_transfo_xl_test.py    +10 -1
.circleci/config.yml
@@ -9,7 +9,7 @@ jobs:
       - run: sudo pip install --progress-bar off .
       - run: sudo pip install pytest ftfy spacy
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
   build_py2:
     working_directory: ~/pytorch-pretrained-BERT
     docker:
@@ -20,7 +20,7 @@ jobs:
       - run: sudo pip install pytest spacy
       - run: sudo pip install ftfy==4.4.3
       - run: sudo python -m spacy download en
-      - run: python -m pytest -sv tests/
+      - run: python -m pytest -sv tests/ --runslow
 workflows:
   version: 2
   build_and_test:
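Both CI jobs now pass --runslow to pytest, so the download tests marked slow elsewhere in this commit still execute in CI, while a plain local "python -m pytest -sv tests/" skips them by default.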
pytorch_pretrained_bert/file_utils.py
@@ -5,11 +5,13 @@ Copyright by the AllenNLP authors.
 """
 from __future__ import (absolute_import, division, print_function, unicode_literals)
 
+import sys
 import json
 import logging
 import os
 import shutil
 import tempfile
+import fnmatch
 from functools import wraps
 from hashlib import sha256
 import sys
@@ -191,17 +193,30 @@ def get_from_cache(url, cache_dir=None):
     if url.startswith("s3://"):
         etag = s3_etag(url)
     else:
-        response = requests.head(url, allow_redirects=True)
-        if response.status_code != 200:
-            raise IOError("HEAD request failed for url {} with status code {}"
-                          .format(url, response.status_code))
-        etag = response.headers.get("ETag")
+        try:
+            response = requests.head(url, allow_redirects=True)
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except EnvironmentError:
+            etag = None
+
+    if sys.version_info[0] == 2 and etag is not None:
+        etag = etag.decode('utf-8')
 
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
 
+    # If we don't have a connection (etag is None) and can't identify the file
+    # try to get the last downloaded one
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
     if not os.path.exists(cache_path):
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
@@ -226,8 +241,11 @@ def get_from_cache(url, cache_dir=None):
             logger.info("creating metadata file for %s", cache_path)
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
-            with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                meta_file.write(json.dumps(meta))
+            with open(meta_path, 'w') as meta_file:
+                output_string = json.dumps(meta)
+                if sys.version_info[0] == 2 and isinstance(output_string, str):
+                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                meta_file.write(output_string)
 
             logger.info("removing temp file %s", temp_file.name)
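The net effect: get_from_cache() no longer raises when the HEAD request fails (no network, or a non-200 response); it records etag = None and, if the exact cache entry is missing, falls back to the most recently downloaded file whose name starts with the URL's hashed filename. A minimal self-contained sketch of that fallback selection rule; the helper name is illustrative and not part of the diff:

    import fnmatch
    import os

    def latest_cached_candidate(cache_dir, filename):
        # Same rule as the diff above: match cached entries by the hashed
        # filename prefix, drop the '.json' metadata files, and take the
        # last match in listing order.
        matching = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
        matching = [m for m in matching if not m.endswith('.json')]
        return os.path.join(cache_dir, matching[-1]) if matching else None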
tests/conftest.py (new file, 0 → 100644)
@@ -0,0 +1,19 @@
+# content of conftest.py
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--runslow", action="store_true", default=False, help="run slow tests"
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runslow"):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+    for item in items:
+        if "slow" in item.keywords:
+            item.add_marker(skip_slow)
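These two hooks are the standard pytest pattern for opt-in slow tests: pytest_addoption registers the --runslow flag, and pytest_collection_modifyitems attaches a skip marker to every collected test carrying the "slow" keyword unless the flag was given. A minimal sketch of a test module using it (hypothetical test names, not from this commit):

    import pytest

    @pytest.mark.slow
    def test_download_checkpoint():
        # Skipped by default; runs with `python -m pytest --runslow`.
        assert True

    def test_fast_path():
        # Always collected and run.
        assert True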
tests/modeling_gpt2_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class GPT2ModelTest(unittest.TestCase):
     class GPT2ModelTester(object):
@@ -185,6 +187,14 @@ class GPT2ModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_gpt2_model(*config_and_inputs)
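The three remaining model test files follow the same pattern: import shutil and pytest, import the module's PRETRAINED_MODEL_ARCHIVE_MAP, and add a @pytest.mark.slow smoke test that loads the first pretrained checkpoint via from_pretrained, clears the cache directory, and asserts the model is not None.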
tests/modeling_openai_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class OpenAIGPTModelTest(unittest.TestCase):
     class OpenAIGPTModelTester(object):
@@ -197,6 +199,14 @@ class OpenAIGPTModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_openai_model(*config_and_inputs)
tests/modeling_test.py
@@ -20,6 +20,8 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
@@ -27,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification)
+from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class BertModelTest(unittest.TestCase):
@@ -260,6 +263,14 @@ class BertModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_bert_model(*config_and_inputs)
tests/modeling_transfo_xl_test.py
@@ -20,11 +20,13 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
@@ -195,6 +197,14 @@ class TransfoXLModelTest(unittest.TestCase):
             os.remove(json_file_path)
             self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
tests/tokenization_gpt2_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class GPT2TokenizationTest(unittest.TestCase):
@@ -38,7 +40,6 @@ class GPT2TokenizationTest(unittest.TestCase):
             merges_file = fp.name
         tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        print("encoder", tokenizer.byte_encoder)
 
         os.remove(vocab_file)
         os.remove(merges_file)
@@ -64,6 +65,13 @@
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    # @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
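Note that in this file the marker is left commented out (# @pytest.mark.slow), so the GPT-2 tokenizer download test still runs without --runslow; the tokenizer test files below apply the marker for real.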
tests/tokenization_openai_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -64,6 +66,14 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 if __name__ == '__main__':
     unittest.main()
tests/tokenization_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
 
 class TokenizationTest(unittest.TestCase):
@@ -56,6 +58,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
     def test_chinese(self):
         tokenizer = BasicTokenizer()
tests/tokenization_transfo_xl_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 class TransfoXLTokenizationTest(unittest.TestCase):
@@ -66,6 +68,13 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()