transformers (chenpangpang fork) · Commit a75c64d8

Black 20 release

Authored Aug 26, 2020 by Lysandre
Parent: e78c1103

Changes: 191 files in the full commit. This page shows 11 changed files with 129 additions and 36 deletions (+129 −36).
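Nearly every hunk on this page is the same mechanical change: Black 20.8 introduced the "magic trailing comma", under which a pre-existing trailing comma in a call, signature, or collection makes Black explode it one element per line instead of packing it. A minimal sketch of that behavior, assuming black==20.8b1 is installed; the snippet is illustrative and not part of the commit:

    import black

    # A packed call with a trailing comma, as the old test files had it.
    # At 85 columns this line fits the default 88-column limit, so without
    # the trailing comma Black would leave it alone.
    src = 'nlp = pipeline(task="fill-mask", model=name, tokenizer=name, framework="pt", topk=2,)\n'

    # With Black >= 20.8, the trailing comma alone forces the rewrite
    # onto one argument per line.
    print(black.format_str(src, mode=black.FileMode()))

The printed result is exactly the multi-line form the hunks below introduce.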
Files changed on this page:

tests/test_modeling_transfo_xl.py    +2  −1
tests/test_modeling_xlm.py           +2  −1
tests/test_modeling_xlnet.py         +22 −4
tests/test_pipelines.py              +36 −7
tests/test_tokenization_common.py    +25 −8
tests/test_tokenization_fast.py      +24 −7
tests/test_tokenization_mbart.py     +3  −1
tests/test_tokenization_reformer.py  +4  −2
tests/test_tokenization_t5.py        +5  −1
tests/test_trainer.py                +3  −1
utils/link_tester.py                 +3  −3
tests/test_modeling_transfo_xl.py

@@ -32,7 +32,8 @@ if is_torch_available():
 class TransfoXLModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 14
tests/test_modeling_xlm.py

@@ -41,7 +41,8 @@ if is_torch_available():
 class XLMModelTester:
     def __init__(
-        self, parent,
+        self,
+        parent,
     ):
         self.parent = parent
         self.batch_size = 13
tests/test_modeling_xlnet.py

@@ -104,10 +104,20 @@ class XLNetModelTester:
         input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
         perm_mask = torch.zeros(
-            self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device,
+            self.batch_size,
+            self.seq_length + 1,
+            self.seq_length + 1,
+            dtype=torch.float,
+            device=torch_device,
         )
         perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device,)
+        target_mapping = torch.zeros(
+            self.batch_size,
+            1,
+            self.seq_length + 1,
+            dtype=torch.float,
+            device=torch_device,
+        )
         target_mapping[:, 0, -1] = 1.0  # predict last token
         sequence_labels = None

@@ -217,7 +227,11 @@ class XLNetModelTester:
         # first forward pass
         causal_mask = torch.ones(
-            input_ids_1.shape[0], input_ids_1.shape[1], input_ids_1.shape[1], dtype=torch.float, device=torch_device,
+            input_ids_1.shape[0],
+            input_ids_1.shape[1],
+            input_ids_1.shape[1],
+            dtype=torch.float,
+            device=torch_device,
         )
         causal_mask = torch.triu(causal_mask, diagonal=0)
         outputs_cache = model(input_ids_1, use_cache=True, perm_mask=causal_mask)

@@ -363,7 +377,11 @@ class XLNetModelTester:
         total_loss, mems = result_with_labels.to_tuple()

-        result_with_labels = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
+        result_with_labels = model(
+            input_ids_1,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )

         total_loss, mems = result_with_labels.to_tuple()
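The tensors built in the first hunk above are XLNet's permutation-LM controls: perm_mask[:, :, -1] = 1.0 hides the final position from every token, and target_mapping[:, 0, -1] = 1.0 selects that final position as the one to predict. A short usage sketch along the lines of the XLNet documentation of this era (checkpoint name and prompt are illustrative):

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    input_ids = tokenizer.encode("Hello, my dog is very <mask>", return_tensors="pt")
    seq_len = input_ids.shape[1]

    perm_mask = torch.zeros((1, seq_len, seq_len), dtype=torch.float)
    perm_mask[:, :, -1] = 1.0  # previous tokens don't see the last token
    target_mapping = torch.zeros((1, 1, seq_len), dtype=torch.float)
    target_mapping[:, 0, -1] = 1.0  # predict the last token

    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0]  # shape (1, 1, vocab_size)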
tests/test_pipelines.py

@@ -164,7 +164,8 @@ class MonoColumnInputTestCase(unittest.TestCase):
             for result, expect in zip(multi_result, expected_multi_result):
                 for key in expected_check_keys or []:
                     self.assertEqual(
-                        set([o[key] for o in result]), set([o[key] for o in expect]),
+                        set([o[key] for o in result]),
+                        set([o[key] for o in expect]),
                     )

             if isinstance(multi_result[0], list):

@@ -214,7 +215,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
             "This is"  # No mask_token is not supported
         ]
         for model_name in FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
             )

@@ -231,7 +238,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
             "This is"  # No mask_token is not supported
         ]
         for model_name in FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp, valid_inputs, mandatory_keys, invalid_inputs, expected_check_keys=["sequence"]
             )

@@ -274,7 +287,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
         ]
         valid_targets = [" Patrick", " Clara"]
         for model_name in LARGE_FILL_MASK_FINETUNED_MODELS:
-            nlp = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt", topk=2,)
+            nlp = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                topk=2,
+            )
             self._test_mono_column_pipeline(
                 nlp,
                 valid_inputs,

@@ -343,7 +362,12 @@ class MonoColumnInputTestCase(unittest.TestCase):
         invalid_inputs = [4, "<mask>"]
         mandatory_keys = ["summary_text"]
         for model_name in TF_SUMMARIZATION_FINETUNED_MODELS:
-            nlp = pipeline(task="summarization", model=model_name, tokenizer=model_name, framework="tf",)
+            nlp = pipeline(
+                task="summarization",
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+            )
             self._test_mono_column_pipeline(
                 nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs, **SUMMARIZATION_KWARGS
             )

@@ -355,7 +379,10 @@ class MonoColumnInputTestCase(unittest.TestCase):
         for model_name, task in TRANSLATION_FINETUNED_MODELS:
             nlp = pipeline(task=task, model=model_name, tokenizer=model_name)
             self._test_mono_column_pipeline(
-                nlp, VALID_INPUTS, mandatory_keys, invalid_inputs,
+                nlp,
+                VALID_INPUTS,
+                mandatory_keys,
+                invalid_inputs,
             )

     @require_tf

@@ -655,7 +682,9 @@ class QAPipelineTests(unittest.TestCase):
 class NerPipelineTests(unittest.TestCase):
     def _test_ner_pipeline(
-        self, nlp: Pipeline, output_keys: Iterable[str],
+        self,
+        nlp: Pipeline,
+        output_keys: Iterable[str],
     ):
         ungrouped_ner_inputs = [
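For context on the call pattern these pipeline tests exercise: at this point in the library, pipeline() forwarded topk (later renamed top_k) to the fill-mask pipeline, which returns, per masked token, a list of candidate dicts whose "sequence" key the assertions compare. A quick sketch (checkpoint name illustrative):

    from transformers import pipeline

    # topk=2 asks for the two best candidates per masked token.
    nlp = pipeline(task="fill-mask", model="distilroberta-base", tokenizer="distilroberta-base", framework="pt", topk=2)
    print(nlp("My name is <mask>."))  # two dicts, each with a "sequence" key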
tests/test_tokenization_common.py

@@ -882,8 +882,7 @@ class TokenizerTesterMixin:
         assert encoded_sequence == padded_sequence_left

     def test_padding_to_max_length(self):
-        """ We keep this test for backward compatibility but it should be remove when `pad_to_max_length` will e deprecated
-        """
+        """We keep this test for backward compatibility but it should be remove when `pad_to_max_length` will e deprecated"""
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):

@@ -972,7 +971,11 @@ class TokenizerTesterMixin:
             # Test 'longest' and 'no_padding' don't do anything
             tokenizer.padding_side = "right"
-            not_padded_sequence = tokenizer.encode_plus(sequence, padding=True, return_special_tokens_mask=True,)
+            not_padded_sequence = tokenizer.encode_plus(
+                sequence,
+                padding=True,
+                return_special_tokens_mask=True,
+            )
             not_padded_input_ids = not_padded_sequence["input_ids"]
             not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]

@@ -982,7 +985,11 @@ class TokenizerTesterMixin:
             assert input_ids == not_padded_input_ids
             assert special_tokens_mask == not_padded_special_tokens_mask

-            not_padded_sequence = tokenizer.encode_plus(sequence, padding=False, return_special_tokens_mask=True,)
+            not_padded_sequence = tokenizer.encode_plus(
+                sequence,
+                padding=False,
+                return_special_tokens_mask=True,
+            )
             not_padded_input_ids = not_padded_sequence["input_ids"]
             not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]

@@ -1148,7 +1155,8 @@ class TokenizerTesterMixin:
                 )
                 for key in encoded_sequences_batch_padded_1.keys():
                     self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
                     )

                 # check 'no_padding' is unsensitive to a max length

@@ -1158,7 +1166,8 @@ class TokenizerTesterMixin:
                 )
                 for key in encoded_sequences_batch_padded_1.keys():
                     self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key],
+                        encoded_sequences_batch_padded_1[key],
+                        encoded_sequences_batch_padded_2[key],
                     )

     def test_added_token_serializable(self):

@@ -1361,10 +1370,18 @@ class TokenizerTesterMixin:
                 if tokenizer.pad_token_id is None:
                     self.assertRaises(
-                        ValueError, tokenizer.batch_encode_plus, sequences, padding=True, return_tensors="pt",
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        sequences,
+                        padding=True,
+                        return_tensors="pt",
                     )
                     self.assertRaises(
-                        ValueError, tokenizer.batch_encode_plus, sequences, padding="longest", return_tensors="tf",
+                        ValueError,
+                        tokenizer.batch_encode_plus,
+                        sequences,
+                        padding="longest",
+                        return_tensors="tf",
                     )
                 else:
                     pytorch_tensor = tokenizer.batch_encode_plus(sequences, padding=True, return_tensors="pt")
tests/test_tokenization_fast.py

@@ -228,7 +228,8 @@ class CommonFastTokenizerTest(unittest.TestCase):
     def assert_special_tokens_map_equal(self, tokenizer_r, tokenizer_p):
         # Assert the set of special tokens match.
         self.assertSequenceEqual(
-            tokenizer_p.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(),
+            tokenizer_p.special_tokens_map.items(),
+            tokenizer_r.special_tokens_map.items(),
         )

     def assert_add_tokens(self, tokenizer_r):

@@ -544,18 +545,26 @@ class CommonFastTokenizerTest(unittest.TestCase):
         assert_batch_padded_input_match(input_r, input_p, max_length)

         input_r = tokenizer_r.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="max_length",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="max_length",
         )
         input_p = tokenizer_p.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="max_length",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="max_length",
         )
         assert_batch_padded_input_match(input_r, input_p, max_length)

         input_r = tokenizer_r.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding="longest",
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding="longest",
         )
         input_p = tokenizer_p.batch_encode_plus(
-            ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, padding=True,
+            ["This is a simple input 1", "This is a simple input 2"],
+            max_length=max_length,
+            padding=True,
        )
         assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

@@ -865,7 +874,11 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
         # Simple input
         self.assertRaises(
-            ValueError, tokenizer_r.batch_encode_plus, s2, max_length=max_length, padding="max_length",
+            ValueError,
+            tokenizer_r.batch_encode_plus,
+            s2,
+            max_length=max_length,
+            padding="max_length",
         )

         # Pair input

@@ -876,7 +889,11 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
         # Pair input
         self.assertRaises(
-            ValueError, tokenizer_r.batch_encode_plus, p2, max_length=max_length, padding="max_length",
+            ValueError,
+            tokenizer_r.batch_encode_plus,
+            p2,
+            max_length=max_length,
+            padding="max_length",
         )
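Note the asymmetry in the last pair of calls in the @@ -544 hunk: the fast (rust) tokenizer is called with padding="longest" while the python one gets padding=True. Those select the same strategy, since padding=True is shorthand for "longest" (pad to the longest sequence in the batch), which is why the outputs can be compared directly. A small sketch, assuming any pretrained checkpoint:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    texts = ["short", "a much longer input"]
    a = tok.batch_encode_plus(texts, padding=True)        # shorthand...
    b = tok.batch_encode_plus(texts, padding="longest")   # ...for this
    assert a["input_ids"] == b["input_ids"]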
tests/test_tokenization_mbart.py

@@ -125,7 +125,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
     def test_enro_tokenizer_prepare_seq2seq_batch(self):
         batch = self.tokenizer.prepare_seq2seq_batch(
-            self.src_text, tgt_texts=self.tgt_text, max_length=len(self.expected_src_tokens),
+            self.src_text,
+            tgt_texts=self.tgt_text,
+            max_length=len(self.expected_src_tokens),
         )
         self.assertIsInstance(batch, BatchEncoding)
tests/test_tokenization_reformer.py

@@ -44,7 +44,8 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382],
+            tokenizer.convert_tokens_to_ids(tokens),
+            [285, 46, 10, 170, 382],
         )

         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")

@@ -76,7 +77,8 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
         ids = tokenizer.convert_tokens_to_ids(tokens)
         self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
+            ids,
+            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
         )

         back_tokens = tokenizer.convert_ids_to_tokens(ids)
tests/test_tokenization_t5.py

@@ -126,7 +126,11 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "Another summary.",
         ]
         expected_src_tokens = [71, 307, 8986, 21, 4505, 51, 52, 1707, 5, tokenizer.eos_token_id]
-        batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, return_tensors=FRAMEWORK,)
+        batch = tokenizer.prepare_seq2seq_batch(
+            src_text,
+            tgt_texts=tgt_text,
+            return_tensors=FRAMEWORK,
+        )
         self.assertIsInstance(batch, BatchEncoding)
         result = list(batch.input_ids.numpy()[0])
         self.assertListEqual(expected_src_tokens, result)
tests/test_trainer.py

@@ -275,7 +275,9 @@ class TrainerIntegrationTest(unittest.TestCase):
         MODEL_ID = "distilroberta-base"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         dataset = LineByLineTextDataset(
-            tokenizer=tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=tokenizer.max_len_single_sentence,
+            tokenizer=tokenizer,
+            file_path=PATH_SAMPLE_TEXT,
+            block_size=tokenizer.max_len_single_sentence,
         )
         self.assertEqual(len(dataset), 31)
utils/link_tester.py

@@ -18,7 +18,7 @@ S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
 def list_python_files_in_repository():
-    """ List all python files in the repository.
+    """List all python files in the repository.
     This function assumes that the script is executed in the root folder.
     """

@@ -43,7 +43,7 @@ def find_all_links(file_paths):
 def scan_code_for_links(source):
-    """ Scans the file to find links using a regular expression.
+    """Scans the file to find links using a regular expression.
     Returns a list of links.
     """
     with open(source, "r") as content:

@@ -55,7 +55,7 @@ def scan_code_for_links(source):
 def check_all_links(links):
-    """ Check that the provided links are valid.
+    """Check that the provided links are valid.
     Links are considered valid if a HEAD request to the server
     returns a 200 status code.
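These three hunks (and the docstring hunk in tests/test_tokenization_common.py) come from the other rule new in Black 20.8: docstring normalization, which strips the space after the opening quotes and can pull the closing quotes of a one-line docstring onto the same line. A sketch, again assuming black==20.8b1:

    import black

    src = 'def f():\n    """ List all python files in the repository.\n\n    Details here.\n    """\n'
    # The leading space after the opening triple quote should be stripped,
    # yielding: """List all python files in the repository.
    print(black.format_str(src, mode=black.FileMode()))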