chenpangpang / transformers

Unverified commit 306c9ee9
Authored Apr 19, 2022 by Li-Huai (Allan) Lin; committed by GitHub on Apr 19, 2022
Parent: 7db7aab4

Fix `LayoutLMv2` tokenization docstrings (#16187)

* Fix docstrings
* Fix up
* Fix
Changes: 2 changed files with 67 additions and 13 deletions.

  src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py        +56  -7
  src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py   +11  -6
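The change replaces the generic `ENCODE_KWARGS_DOCSTRING` (shared by all tokenizers) with a LayoutLMv2-specific `LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING` in the `@add_end_docstrings(...)` decorators, so the encode-kwargs section appended to each method's docstring matches this tokenizer's signature. As a rough sketch (not the exact `transformers` source), an `add_end_docstrings`-style decorator simply concatenates the given fragments onto the decorated function's `__doc__`:

# Hedged sketch of what an add_end_docstrings-style decorator does; the real
# implementation lives in transformers.utils and may differ in detail.
def add_end_docstrings(*docstr):
    def docstring_decorator(fn):
        # append the docstring fragments to whatever docstring the function already has
        fn.__doc__ = (fn.__doc__ or "") + "".join(docstr)
        return fn
    return docstring_decorator

Swapping the first argument therefore only changes the documentation attached to `__call__`, `encode`, `encode_plus`, `batch_encode_plus`, and `prepare_for_model`; runtime behavior is untouched.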
src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py

@@ -22,7 +22,6 @@ from typing import Dict, List, Optional, Tuple, Union
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,
@@ -57,6 +56,56 @@ PRETRAINED_INIT_CONFIGURATION = {
 }
 
+LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
 LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
@@ -362,7 +411,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
             index += 1
         return (vocab_file,)
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
@@ -507,7 +556,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
             **kwargs,
         )
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -623,7 +672,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         return BatchEncoding(batch_outputs)
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def _batch_prepare_for_model(
         self,
         batch_text_or_text_pairs,
@@ -694,7 +743,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         return batch_outputs
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING)
     def encode(
         self,
         text: Union[TextInput, PreTokenizedInput],
@@ -741,7 +790,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         return encoded_inputs["input_ids"]
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
@@ -860,7 +909,7 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
             verbose=verbose,
         )
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def prepare_for_model(
         self,
         text: Union[TextInput, PreTokenizedInput],
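The newly added kwargs section documents the standard encode arguments (`padding`, `truncation`, `max_length`, `stride`, `pad_to_multiple_of`, `return_tensors`). A minimal, hedged sketch of how these arguments are typically passed to the LayoutLMv2 tokenizer; the checkpoint name and the toy words/boxes below are illustrative and not part of this commit:

from transformers import LayoutLMv2TokenizerFast

# Illustrative checkpoint and inputs; LayoutLMv2 tokenizers take words plus
# word-level bounding boxes (normalized to a 0-1000 coordinate scale).
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
words = ["Invoice", "number", "12345"]
boxes = [[48, 84, 156, 98], [160, 84, 224, 98], [228, 84, 300, 98]]

encoding = tokenizer(
    words,
    boxes=boxes,
    padding="max_length",   # pad to `max_length`
    truncation=True,        # 'longest_first' truncation
    max_length=32,
    return_tensors="pt",    # return PyTorch tensors
)
print(encoding["input_ids"].shape)  # expected: torch.Size([1, 32])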
src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py

@@ -23,17 +23,22 @@ from typing import Dict, List, Optional, Tuple, Union
 from tokenizers import normalizers
 
 from ...tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
     BatchEncoding,
     EncodedInput,
+    PaddingStrategy,
     PreTokenizedInput,
+    TensorType,
     TextInput,
     TextInputPair,
     TruncationStrategy,
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+from ...utils import add_end_docstrings, logging
-from .tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, LayoutLMv2Tokenizer
+from .tokenization_layoutlmv2 import (
+    LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING,
+    LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
+    LayoutLMv2Tokenizer,
+)
 
 logger = logging.get_logger(__name__)
@@ -167,7 +172,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         self.pad_token_label = pad_token_label
         self.only_label_first_subword = only_label_first_subword
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
@@ -312,7 +317,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
             **kwargs,
         )
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -380,7 +385,7 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         return encodings[0].tokens
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
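Since the fix only touches documentation, a quick way to see its effect is to inspect the composed docstring that `add_end_docstrings` builds. A small hedged check, assuming `transformers` is installed:

from transformers import LayoutLMv2Tokenizer

# The decorator appends the kwargs docstrings to the method's __doc__,
# so the LayoutLMv2-specific kwargs section shows up at the end of the help text.
doc = LayoutLMv2Tokenizer.__call__.__doc__
print("add_special_tokens" in doc, "pad_to_multiple_of" in doc)  # expected: True True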