chenpangpang / transformers

Commit 5340d1f2 (unverified), authored Nov 27, 2019 by Thomas Wolf, committed via GitHub on Nov 27, 2019

    Merge branch 'master' into resumable_http

Parents: 0e4cc050, 10bd1ddb

Showing 11 changed files with 527 additions and 40 deletions (+527 -40)
Files changed:
  transformers/tokenization_albert.py      +252  -0
  transformers/tokenization_auto.py          +7   -2
  transformers/tokenization_bert.py          +1   -1
  transformers/tokenization_camembert.py   +157   -0
  transformers/tokenization_ctrl.py         +60   -5
  transformers/tokenization_distilbert.py    +2   -0
  transformers/tokenization_gpt2.py          +7   -4
  transformers/tokenization_roberta.py       +7   -1
  transformers/tokenization_utils.py        +25  -18
  transformers/tokenization_xlm.py           +4   -4
  transformers/tokenization_xlnet.py         +5   -5
transformers/tokenization_albert.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from .tokenization_utils import PreTrainedTokenizer
import logging
import unicodedata
import six
import os
from shutil import copyfile

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
        'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
        'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
        'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
        'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
        'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
        'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
        'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'albert-base-v1': 512,
    'albert-large-v1': 512,
    'albert-xlarge-v1': 512,
    'albert-xxlarge-v1': 512,
    'albert-base-v2': 512,
    'albert-large-v2': 512,
    'albert-xlarge-v2': 512,
    'albert-xxlarge-v2': 512,
}

SPIECE_UNDERLINE = u'▁'


class AlbertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False,
                 bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
                 pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
        super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                              unk_token=unk_token, sep_token=sep_token,
                                              pad_token=pad_token, cls_token=cls_token,
                                              mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece\n"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece\n"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = ' '.join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if six.PY2 and isinstance(outputs, str):
            outputs = outputs.decode('utf-8')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.
            return_unicode is used only for py2
        """
        text = self.preprocess_text(text)
        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
        if six.PY2 and isinstance(text, unicode):
            text = text.encode('utf-8')

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ''))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        # note(zhiliny): convert back to unicode for py2
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in new_pieces:
                if isinstance(piece, str):
                    piece = piece.decode('utf-8')
                ret_pieces.append(piece)
            new_pieces = ret_pieces

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode('utf-8')
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        An ALBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formatted with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An ALBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
        | first sequence      | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
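As a usage sketch (not part of the commit): once the sentencepiece package is installed, the class above can be exercised roughly as follows. It assumes AlbertTokenizer is exported from the package at this revision; otherwise import it from transformers.tokenization_albert.

from transformers import AlbertTokenizer

# Downloads albert-base-v2's spiece.model listed in PRETRAINED_VOCAB_FILES_MAP above.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

pieces = tokenizer.tokenize("Hello, world!")        # SentencePiece pieces, lower-cased by default
ids = tokenizer.convert_tokens_to_ids(pieces)       # raw vocabulary ids, no special tokens yet
text = tokenizer.convert_tokens_to_string(pieces)   # '▁' markers are folded back into spaces
print(pieces, ids, text)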
transformers/tokenization_auto.py
@@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_camembert import CamembertTokenizer

 logger = logging.getLogger(__name__)

@@ -41,6 +42,7 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
@@ -64,8 +66,9 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
-            - contains `roberta`: RobertaTokenizer (XLM model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -106,6 +109,8 @@ class AutoTokenizer(object):
         """
         if 'distilbert' in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -124,4 +129,4 @@ class AutoTokenizer(object):
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
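A quick way to see the new branch in action (a sketch, not from the diff; it needs network access to fetch the camembert-base files and the sentencepiece package):

from transformers import AutoTokenizer
from transformers.tokenization_camembert import CamembertTokenizer

# 'camembert-base' also contains the substring 'bert', but the new 'camembert'
# branch is checked before the generic 'bert' branch, so the right class is picked.
tokenizer = AutoTokenizer.from_pretrained('camembert-base')
assert isinstance(tokenizer, CamembertTokenizer)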
transformers/tokenization_bert.py
@@ -220,7 +220,7 @@ class BertTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
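The corrected sentence now matches what the method returns: special tokens are flagged with 1. A small check of that convention (a sketch assuming the bert-base-uncased vocabulary can be downloaded):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

ids = tokenizer.encode("hello world", add_special_tokens=True)   # [CLS] hello world [SEP]
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
assert mask == [1, 0, 0, 1]   # 1 for [CLS]/[SEP], 0 for the two word tokens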
transformers/tokenization_camembert.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging
import os
from shutil import copyfile

import sentencepiece as spm

from transformers.tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'camembert-base': None,
}


class CamembertTokenizer(PreTrainedTokenizer):
    """
        Adapted from RobertaTokenizer and XLNetTokenizer
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'],
                 **kwargs):
        super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token,
                                                 unk_token=unk_token, sep_token=sep_token,
                                                 cls_token=cls_token, pad_token=pad_token,
                                                 mask_token=mask_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>)
        self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formatted with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A RoBERTa sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]

    @property
    def vocab_size(self):
        return self.fairseq_offset + len(self.sp_model)

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        return self.fairseq_offset + self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
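To make the id arithmetic above concrete, here is a sketch (not part of the commit) of how the fairseq offset plays out; it assumes the camembert-base files can be downloaded:

from transformers.tokenization_camembert import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

assert tokenizer._convert_token_to_id('<s>NOTUSED') == 0   # reserved fairseq id
assert tokenizer._convert_token_to_id('<pad>') == 1
piece = tokenizer.tokenize("Bonjour")[0]
# Ordinary SentencePiece pieces are shifted up by the four reserved ids.
assert tokenizer._convert_token_to_id(piece) == tokenizer.fairseq_offset + tokenizer.sp_model.PieceToId(piece)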
transformers/tokenization_ctrl.py
@@ -46,6 +46,64 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'ctrl': 256,
 }

+CONTROL_CODES = {
+    "Pregnancy": 168629,
+    "Christianity": 7675,
+    "Explain": 106423,
+    "Fitness": 63440,
+    "Saving": 63163,
+    "Ask": 27171,
+    "Ass": 95985,
+    "Joke": 163509,
+    "Questions": 45622,
+    "Thoughts": 49605,
+    "Retail": 52342,
+    "Feminism": 164338,
+    "Writing": 11992,
+    "Atheism": 192263,
+    "Netflix": 48616,
+    "Computing": 39639,
+    "Opinion": 43213,
+    "Alone": 44967,
+    "Funny": 58917,
+    "Gaming": 40358,
+    "Human": 4088,
+    "India": 1331,
+    "Joker": 77138,
+    "Diet": 36206,
+    "Legal": 11859,
+    "Norman": 4939,
+    "Tip": 72689,
+    "Weight": 52343,
+    "Movies": 46273,
+    "Running": 23425,
+    "Science": 2090,
+    "Horror": 37793,
+    "Confession": 60572,
+    "Finance": 12250,
+    "Politics": 16360,
+    "Scary": 191985,
+    "Support": 12654,
+    "Technologies": 32516,
+    "Teenage": 66160,
+    "Event": 32769,
+    "Learned": 67460,
+    "Notion": 182770,
+    "Wikipedia": 37583,
+    "Books": 6665,
+    "Extract": 76050,
+    "Confessions": 102701,
+    "Conspiracy": 75932,
+    "Links": 63674,
+    "Narcissus": 150425,
+    "Relationship": 54766,
+    "Relationships": 134796,
+    "Reviews": 41671,
+    "News": 4256,
+    "Translation": 26820,
+    "multilingual": 128406,
+}
+
 def get_pairs(word):
     """Return set of symbol pairs in a word.

@@ -63,15 +121,12 @@ def get_pairs(word):
 class CTRLTokenizer(PreTrainedTokenizer):
     """
     CTRL BPE tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+        - Byte-Pair-Encoding
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    control_codes = CONTROL_CODES

     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
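The new CONTROL_CODES table maps CTRL's conditioning codes to their vocabulary ids and is exposed as control_codes on the tokenizer class. A hedged sketch of how a prompt might be checked against it (assuming the ctrl vocabulary files can be downloaded):

from transformers import CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained('ctrl')

# CTRL expects prompts to start with one of the control codes above, e.g. "Links".
prompt = "Links In the last week "
input_ids = tokenizer.encode(prompt)
# For a well-formed prompt the first id should match the code's id from the table (63674 for "Links").
print(input_ids[0], tokenizer.control_codes["Links"])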
transformers/tokenization_distilbert.py
@@ -33,12 +33,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
         'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
     }
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'distilbert-base-uncased': 512,
     'distilbert-base-uncased-distilled-squad': 512,
+    'distilbert-base-multilingual-cased': 512,
 }
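With those two entries, the multilingual checkpoint resolves like any other shortcut name (a sketch, assuming the vocabulary can be downloaded):

from transformers import DistilBertTokenizer

# Reuses bert-base-multilingual-cased's vocab.txt, as the URL added above indicates.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
print(tokenizer.max_len)   # 512, taken from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES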
transformers/tokenization_gpt2.py
@@ -46,6 +46,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
         'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
     },
     'merges_file':
@@ -53,6 +54,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
         'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
     },
 }
@@ -61,6 +63,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
     'gpt2-medium': 1024,
     'gpt2-large': 1024,
+    'gpt2-xl': 1024,
     'distilgpt2': 1024,
 }
@@ -104,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
         - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
+        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
+          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -181,7 +184,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         """ Tokenize a string.
             Args:
                 - add_prefix_space (boolean, default False):
-                    Begin the sentence with at least one space to to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
         """
         if add_prefix_space:
             text = ' ' + text
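A sketch of the round-trip behaviour the corrected docstring describes (assuming the gpt2 vocabulary and merges files can be downloaded):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Without add_prefix_space, the leading space of " Hello" is not conserved on a
# round trip, which is exactly what the updated docstring documents.
print(repr(tokenizer.decode(tokenizer.encode(" Hello"))))
# The docstring's recommendation: pass add_prefix_space=True to the encoding methods instead.
print(tokenizer.encode("Hello", add_prefix_space=True))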
transformers/tokenization_roberta.py
@@ -47,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
+        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
     },
     'merges_file':
     {
@@ -54,6 +56,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
+        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
     },
 }
@@ -62,6 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'roberta-large': 512,
     'roberta-large-mnli': 512,
     'distilroberta-base': 512,
+    'roberta-base-openai-detector': 512,
+    'roberta-large-openai-detector': 512,
 }
@@ -114,7 +120,7 @@ class RobertaTokenizer(GPT2Tokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
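The two detector checkpoints reuse the existing RoBERTa vocabularies, so they load through the normal shortcut path (a sketch, assuming network access):

from transformers import RobertaTokenizer

# The new shortcut names point at the existing roberta-base / roberta-large vocab.json and merges.txt.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base-openai-detector')
print(tokenizer.max_len)   # 512, from the sizes added above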
transformers/tokenization_utils.py
@@ -21,6 +21,7 @@ import os
 import json
 import six
 import copy
+import itertools
 from io import open

 from .file_utils import cached_path, is_tf_available, is_torch_available
@@ -516,6 +517,8 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
+            if self.init_kwargs.get('do_lower_case', False):
+                token = token.lower()
             if token != self.unk_token and \
                     self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
                     token not in to_add_tokens:
@@ -609,6 +612,9 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
         """
+        if self.init_kwargs.get('do_lower_case', False):
+            text = text.lower()
+
         def split_on_token(tok, text):
             result = []
             split_text = text.split(tok)
@@ -645,9 +651,9 @@ class PreTrainedTokenizer(object):
                     tokenized_text += [sub_text]
                 text_list = tokenized_text

-            return sum((self._tokenize(token, **kwargs) if token not \
-                in self.added_tokens_encoder and token not in self.all_special_tokens \
-                else [token] for token in tokenized_text), [])
+            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
+                in self.added_tokens_encoder and token not in self.all_special_tokens \
+                else [token] for token in tokenized_text)))

         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
@@ -675,10 +681,6 @@ class PreTrainedTokenizer(object):
         ids = []
         for token in tokens:
             ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
         return ids

     def _convert_token_to_id_with_added_voc(self, token):
@@ -693,14 +695,14 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError

     def encode(self,
                text,
                text_pair=None,
-               add_special_tokens=False,
+               add_special_tokens=True,
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
                return_tensors=None,
                **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
@@ -743,7 +745,7 @@ class PreTrainedTokenizer(object):
     def encode_plus(self,
                     text,
                     text_pair=None,
-                    add_special_tokens=False,
+                    add_special_tokens=True,
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
@@ -798,7 +800,7 @@ class PreTrainedTokenizer(object):
                                       truncation_strategy=truncation_strategy,
                                       return_tensors=return_tensors)

-    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first', return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
@@ -881,6 +883,11 @@ class PreTrainedTokenizer(object):
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
             encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+
         return encoded_inputs

     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
@@ -955,7 +962,7 @@ class PreTrainedTokenizer(object):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
@@ -1059,7 +1066,7 @@ class PreTrainedTokenizer(object):
             class attributes (cls_token, unk_token...).
         """
         all_toks = self.all_special_tokens
-        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
+        all_ids = self.convert_tokens_to_ids(all_toks)
         return all_ids

     @staticmethod
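The most user-visible change in these hunks is the flip of add_special_tokens from False to True in encode, encode_plus and prepare_for_model. A sketch of what that means for callers (assuming a bert-base-uncased vocabulary can be downloaded):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

with_special = tokenizer.encode("hello world")                               # now wrapped in [CLS]/[SEP] by default
without_special = tokenizer.encode("hello world", add_special_tokens=False)  # the old default behaviour

assert with_special == tokenizer.build_inputs_with_special_tokens(without_special)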
transformers/tokenization_xlm.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
+"""Tokenization classes for XLM."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
@@ -758,9 +758,9 @@ class XLMTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
+        A XLM sequence has the following format:
             single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+            pair of sequences: <s> A </s> B </s>
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -781,7 +781,7 @@ class XLMTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
transformers/tokenization_xlnet.py
@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An XLNet sequence has the following format:
+            single sequence: X <sep> <cls>
+            pair of sequences: A <sep> B <sep> <cls>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -208,7 +208,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
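The corrected docstrings describe XLNet's trailing <sep> <cls> layout rather than the BERT-style prefix; a sketch of the corresponding calls (assuming the xlnet-base-cased SentencePiece model can be downloaded):

from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

ids_a = tokenizer.encode("How are you", add_special_tokens=False)
ids_b = tokenizer.encode("I am fine", add_special_tokens=False)

pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
assert pair[-1] == tokenizer.cls_token_id and pair[-2] == tokenizer.sep_token_id  # A <sep> B <sep> <cls>

segments = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
assert segments[-1] == 2   # the <cls> position gets segment id 2, as drawn in the docstring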