Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
47d68534
Commit
47d68534
authored
Aug 23, 2019
by
thomwolf
Browse files
adding max_lengths for single sentences and sentences pairs
parent
90dcd8c0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
40 additions
and
0 deletions
+40
-0
pytorch_transformers/tokenization_bert.py
pytorch_transformers/tokenization_bert.py
+8
-0
pytorch_transformers/tokenization_roberta.py
pytorch_transformers/tokenization_roberta.py
+8
-0
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+8
-0
pytorch_transformers/tokenization_xlm.py
pytorch_transformers/tokenization_xlm.py
+8
-0
pytorch_transformers/tokenization_xlnet.py
pytorch_transformers/tokenization_xlnet.py
+8
-0
No files found.
pytorch_transformers/tokenization_bert.py
View file @
47d68534
...
...
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars
=
tokenize_chinese_chars
)
self
.
wordpiece_tokenizer
=
WordpieceTokenizer
(
vocab
=
self
.
vocab
,
unk_token
=
self
.
unk_token
)
@property
def max_len_single_sentence(self):
    """Largest number of tokens a single sentence may occupy.

    Two positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 2
    return self.max_len - reserved_for_special_tokens
@property
def max_len_sentences_pair(self):
    """Largest number of tokens a pair of sentences may occupy together.

    Three positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 3
    return self.max_len - reserved_for_special_tokens
@property
def vocab_size(self):
    """Number of entries held in ``self.vocab``."""
    vocabulary = self.vocab
    return len(vocabulary)
...
...
pytorch_transformers/tokenization_roberta.py
View file @
47d68534
...
...
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
text
=
bytearray
([
self
.
byte_decoder
[
c
]
for
c
in
text
]).
decode
(
'utf-8'
,
errors
=
self
.
errors
)
return
text
@property
def max_len_single_sentence(self):
    """Largest number of tokens a single sentence may occupy.

    Two positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 2
    return self.max_len - reserved_for_special_tokens
@property
def max_len_sentences_pair(self):
    """Largest number of tokens a pair of sentences may occupy together.

    Four positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 4
    return self.max_len - reserved_for_special_tokens
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
"""
Adds special tokens to a sequence for sequence classification tasks.
...
...
pytorch_transformers/tokenization_utils.py
View file @
47d68534
...
...
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
"pad_token"
,
"cls_token"
,
"mask_token"
,
"additional_special_tokens"
]
@property
def max_len_single_sentence(self):
    """Largest number of tokens a single sentence may occupy.

    Defaults to ``max_len``; specific tokenizers can report a smaller value
    to take their special tokens into account.
    """
    default_limit = self.max_len
    return default_limit
@property
def max_len_sentences_pair(self):
    """Largest number of tokens a pair of sentences may occupy together.

    Defaults to ``max_len``; specific tokenizers can report a smaller value
    to take their special tokens into account.
    """
    default_limit = self.max_len
    return default_limit
@
property
def
bos_token
(
self
):
""" Beginning of sentence token (string). Log an error if used while not having been set. """
...
...
pytorch_transformers/tokenization_xlm.py
View file @
47d68534
...
...
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string
=
''
.
join
(
tokens
).
replace
(
'</w>'
,
' '
).
strip
()
return
out_string
@property
def max_len_single_sentence(self):
    """Largest number of tokens a single sentence may occupy.

    Two positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 2
    return self.max_len - reserved_for_special_tokens
@property
def max_len_sentences_pair(self):
    """Largest number of tokens a pair of sentences may occupy together.

    Three positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 3
    return self.max_len - reserved_for_special_tokens
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
"""
Adds special tokens to a sequence for sequence classification tasks.
...
...
pytorch_transformers/tokenization_xlnet.py
View file @
47d68534
...
...
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string
=
''
.
join
(
tokens
).
replace
(
SPIECE_UNDERLINE
,
' '
).
strip
()
return
out_string
@property
def max_len_single_sentence(self):
    """Largest number of tokens a single sentence may occupy.

    Two positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 2
    return self.max_len - reserved_for_special_tokens
@property
def max_len_sentences_pair(self):
    """Largest number of tokens a pair of sentences may occupy together.

    Three positions of ``max_len`` are set aside to take the special tokens
    into account.
    """
    reserved_for_special_tokens = 3
    return self.max_len - reserved_for_special_tokens
def
add_special_tokens_single_sentence
(
self
,
token_ids
):
"""
Adds special tokens to a sequence for sequence classification tasks.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment