Commit 632675ea

Authored Jan 14, 2020 by Lysandre; committed by Lysandre Debut on Jan 23, 2020

Can test examples spread over multiple blocks

Parent: eaa6b9af
Showing 2 changed files with 77 additions and 14 deletions (+77 / -14):

docs/source/glossary.rst    +15 / -4
tests/test_examples.py      +62 / -10
docs/source/glossary.rst
@@ -24,6 +24,7 @@ The tokenizer takes care of splitting the sequence into tokens available in the

 ::

+    # Continuation of the previous script
     tokenized_sequence = tokenizer.tokenize(sequence)

     assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
@@ -33,6 +34,7 @@ this, the recommended being `encode` or `encode_plus`, which leverage the Rust i

 ::

+    # Continuation of the previous script
     encoded_sequence = tokenizer.encode(sequence)

     assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
@@ -48,6 +50,9 @@ For example, consider these two sequences:

 ::

+    from transformers import BertTokenizer
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
     sequence_a = "This is a short sequence."
     sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
@@ -65,10 +70,11 @@ In the first case, the list of IDs will be extended by the padding indices:

 ::

+    # Continuation of the previous script
     padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)

-    assert padded_sequence_a = [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    assert encoded_sequence_b = [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]

 These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
@@ -79,6 +85,7 @@ The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to

 ::

+    # Continuation of the previous script
     sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)

     assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
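
As an aside, not part of this commit's diff: the attention mask described in the hunk above is returned by the same ``encode_plus`` call. A minimal sketch, assuming the ``bert-base-cased`` tokenizer, the ``sequence_a`` defined earlier in this file, and the ``pad_to_max_length`` flag of this transformers version::

    # Sketch only: reuses tokenizer and sequence_a as defined in the examples above.
    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)

    # 1 marks real tokens, 0 marks padded positions the model should not attend to.
    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]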
@@ -94,6 +101,9 @@ tokens. For example, the BERT model builds its two sequence input as such:

 ::

+    from transformers import BertTokenizer
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
     # [CLS] SEQ_A [SEP] SEQ_B [SEP]
     sequence_a = "HuggingFace is based in NYC"
@@ -110,10 +120,11 @@ We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output

 ::

+    # Continuation of the previous script
     encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)

-    assert sequence_a_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert sequence_a_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
+    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
 question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
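
As an aside, not part of this commit's diff: the token type ids can be used to split the pair back apart. A minimal sketch, assuming the ``encoded_dict`` and ``tokenizer`` from the snippet above::

    # Sketch only: type id 0 covers [CLS] SEQ_A [SEP]; type id 1 covers SEQ_B [SEP].
    tokens = tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'])
    first_segment = [tok for tok, type_id in zip(tokens, encoded_dict['token_type_ids']) if type_id == 0]
    second_segment = [tok for tok, type_id in zip(tokens, encoded_dict['token_type_ids']) if type_id == 1]

    assert len(first_segment) == 10 and len(second_segment) == 9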
tests/test_examples.py
@@ -15,6 +15,8 @@

 import os
 import unittest

+from typing import List, Union
+
 from .utils import require_torch
@@ -26,34 +28,84 @@ def get_examples_from_file(file):

     for i, line in enumerate(file):
         if example_mode:
             current_indentation = len(line) - len(line.strip()) - 1
-            if current_indentation == example_indentation or '"""' in line:
+
+            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
+            empty_line = example_indentation == 0 and len(line) == 1
+
+            # If we're back to the example indentation or if it's the end of the docstring.
+            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
+                # Exit the example mode and add the example to the examples list
                 example_mode = False
                 example_indentation = None
                 examples.append(example)
                 example = []
             else:
+                # If line is not empty, add it to the current example
                 if line is not "\n":
                     example.append(line[example_indentation + 4:-1])

+        # Detect the example from '::' or 'example::'
         if "example::" in line.lower():
             example_mode = True
             example_indentation = line.lower().find("example::")
+        elif "examples::" in line.lower():
+            example_mode = True
+            example_indentation = line.lower().find("examples::")
+        elif "::" in line.lower():
+            example_mode = True
+            example_indentation = line.lower().find("::")

-    return ['\n'.join(example) for example in examples]
+    return ["\n".join(example) for example in examples]


 @require_torch
 class TestCodeExamples(unittest.TestCase):
-    def test_configuration_examples(self):
-        transformers_directory = "../src/transformers"
-        configuration_files = [file for file in os.listdir(transformers_directory) if "configuration" in file]
+    def analyze_directory(
+        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+    ):
+        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
+
+        if identifier is not None:
+            files = [file for file in files if identifier in file]
+
+        if ignore_files is not None:
+            files = [file for file in files if file not in ignore_files]

-        for configuration_file in configuration_files:
-            with open(os.path.join(transformers_directory, configuration_file)) as f:
+        for file in files:
+            # Open all files
+            with open(os.path.join(directory, file)) as f:
+                # Retrieve examples
                 examples = get_examples_from_file(f)
-                print("Testing", configuration_file, str(len(examples)) + "/" + str(len(examples)))
+                joined_examples = []

                 def execute_example(code_example):
                     exec(code_example)

-                with self.subTest(msg=configuration_file):
-                    [execute_example(code_example) for code_example in examples]
+                # Some examples are the continuation of others.
+                if len(examples) > 1:
+                    joined_examples.append(examples[0])
+                    joined_examples_index = 0
+                    for example in examples[1:]:
+                        # If they contain this line, then they're a continuation of the previous script
+                        if "# Continuation of the previous script" in example:
+                            joined_examples[joined_examples_index] += "\n" + example
+                        # If not, create a new example and increment the index
+                        else:
+                            joined_examples.append(example)
+                            joined_examples_index += 1
+
+                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
+
+                # Execute sub tests with every example.
+                with self.subTest(msg=file):
+                    [execute_example(code_example) for code_example in joined_examples]
+
+    def test_configuration_examples(self):
+        transformers_directory = "src/transformers"
+        configuration_files = "configuration"
+        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
+        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
+
+    def test_main_doc_examples(self):
+        doc_directory = "docs/source"
+        self.analyze_directory(doc_directory)
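
As an aside, not part of this commit's diff: the joining loop added above is what lets documentation examples marked with ``# Continuation of the previous script`` execute as one script, which is the point of this change. A standalone toy sketch of that behaviour, using hypothetical example strings::

    # Sketch only: mirrors the joining logic in analyze_directory above.
    examples = [
        "sequence = 'A Titan RTX has 24GB of VRAM'",
        "# Continuation of the previous script\nprint(sequence)",
        "print('an independent example')",
    ]

    joined_examples = [examples[0]]
    joined_examples_index = 0
    for example in examples[1:]:
        if "# Continuation of the previous script" in example:
            # Merge the continuation into the previous script.
            joined_examples[joined_examples_index] += "\n" + example
        else:
            # Otherwise start a new, independent script.
            joined_examples.append(example)
            joined_examples_index += 1

    assert len(joined_examples) == 2
    for code_example in joined_examples:
        exec(code_example)  # each joined script now runs with its shared context intact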