chenpangpang / transformers · Commits · 5340d1f2

Unverified commit 5340d1f2, authored Nov 27, 2019 by Thomas Wolf, committed via GitHub on Nov 27, 2019.

    Merge branch 'master' into resumable_http

Parents: 0e4cc050, 10bd1ddb

Showing 11 changed files with 527 additions and 40 deletions (+527 −40).
transformers/tokenization_albert.py      +252  −0
transformers/tokenization_auto.py          +7  −2
transformers/tokenization_bert.py          +1  −1
transformers/tokenization_camembert.py   +157  −0
transformers/tokenization_ctrl.py         +60  −5
transformers/tokenization_distilbert.py    +2  −0
transformers/tokenization_gpt2.py          +7  −4
transformers/tokenization_roberta.py       +7  −1
transformers/tokenization_utils.py        +25 −18
transformers/tokenization_xlm.py           +4  −4
transformers/tokenization_xlnet.py         +5  −5
transformers/tokenization_albert.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from .tokenization_utils import PreTrainedTokenizer
import logging
import unicodedata
import six
import os
from shutil import copyfile

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
        'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
        'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
        'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
        'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
        'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
        'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
        'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'albert-base-v1': 512,
    'albert-large-v1': 512,
    'albert-xlarge-v1': 512,
    'albert-xxlarge-v1': 512,
    'albert-base-v2': 512,
    'albert-large-v2': 512,
    'albert-xlarge-v2': 512,
    'albert-xxlarge-v2': 512,
}

SPIECE_UNDERLINE = u'▁'


class AlbertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False,
                 bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
                 pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
        super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                              unk_token=unk_token, sep_token=sep_token,
                                              pad_token=pad_token, cls_token=cls_token,
                                              mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = ' '.join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if six.PY2 and isinstance(outputs, str):
            outputs = outputs.decode('utf-8')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.
            return_unicode is used only for py2
        """
        text = self.preprocess_text(text)
        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
        if six.PY2 and isinstance(text, unicode):
            text = text.encode('utf-8')

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ''))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        # note(zhiliny): convert back to unicode for py2
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in new_pieces:
                if isinstance(piece, str):
                    piece = piece.decode('utf-8')
                ret_pieces.append(piece)
            new_pieces = ret_pieces

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode('utf-8')
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        An ALBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formated with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An ALBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
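For context, a minimal standalone sketch of the normalization that `AlbertTokenizer.preprocess_text` above applies before SentencePiece runs (whitespace collapsing, quote normalization, NFKD accent stripping, optional lower-casing); the `albert_preprocess` helper name and the sample string are illustrative only, not part of the diff:

import unicodedata

def albert_preprocess(text, remove_space=True, keep_accents=False, do_lower_case=True):
    # Collapse runs of whitespace, mirroring the remove_space branch above.
    out = ' '.join(text.strip().split()) if remove_space else text
    # Normalize LaTeX-style backtick quotes to plain double quotes.
    out = out.replace("``", '"').replace("''", '"')
    if not keep_accents:
        # NFKD decomposition followed by dropping combining marks strips accents.
        out = unicodedata.normalize('NFKD', out)
        out = ''.join(c for c in out if not unicodedata.combining(c))
    if do_lower_case:
        out = out.lower()
    return out

print(albert_preprocess("  Déjà   vu, ``quoted''  "))  # -> 'deja vu, "quoted"'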
transformers/tokenization_auto.py

@@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_camembert import CamembertTokenizer

 logger = logging.getLogger(__name__)

@@ -41,6 +42,7 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)

@@ -64,8 +66,9 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
-            - contains `roberta`: RobertaTokenizer (XLM model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)

@@ -106,6 +109,8 @@ class AutoTokenizer(object):
         """
         if 'distilbert' in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:

@@ -124,4 +129,4 @@ class AutoTokenizer(object):
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
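The substring dispatch above is order-sensitive: `camembert` must be tested before `roberta` and `bert`, otherwise a name like `camembert-base` would never reach `CamembertTokenizer`. A small standalone sketch of that ordering (the `pick_tokenizer_class` helper is illustrative, not library API):

def pick_tokenizer_class(name):
    # Mirror the if/elif order used by AutoTokenizer.from_pretrained above:
    # the most specific substrings are checked first.
    if 'distilbert' in name:
        return 'DistilBertTokenizer'
    elif 'camembert' in name:
        return 'CamembertTokenizer'
    elif 'roberta' in name:
        return 'RobertaTokenizer'
    elif 'bert' in name:
        return 'BertTokenizer'
    raise ValueError("Unrecognized model identifier in {}".format(name))

assert pick_tokenizer_class('camembert-base') == 'CamembertTokenizer'
assert pick_tokenizer_class('roberta-large-mnli') == 'RobertaTokenizer'
assert pick_tokenizer_class('bert-base-uncased') == 'BertTokenizer'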
transformers/tokenization_bert.py

@@ -220,7 +220,7 @@ class BertTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
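The corrected docstring matches what `get_special_tokens_mask` actually returns: 1 marks positions that will hold special tokens ([CLS]/[SEP]), 0 marks sequence tokens. A minimal sketch with made-up token ids (the helper name and ids are illustrative, not the library method itself):

def special_tokens_mask(token_ids_0, token_ids_1=None):
    # [CLS] seq0 [SEP]            -> 1, 0...0, 1
    # [CLS] seq0 [SEP] seq1 [SEP] -> 1, 0...0, 1, 0...0, 1
    if token_ids_1 is None:
        return [1] + [0] * len(token_ids_0) + [1]
    return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

print(special_tokens_mask([11, 12, 13]))            # [1, 0, 0, 0, 1]
print(special_tokens_mask([11, 12], [21, 22, 23]))  # [1, 0, 0, 1, 0, 0, 0, 1]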
transformers/tokenization_camembert.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging
import os
from shutil import copyfile

import sentencepiece as spm

from transformers.tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'camembert-base': None,
}


class CamembertTokenizer(PreTrainedTokenizer):
    """
        Adapted from RobertaTokenizer and XLNetTokenizer
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
        super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token,
                                                 unk_token=unk_token, sep_token=sep_token,
                                                 cls_token=cls_token, pad_token=pad_token, mask_token=mask_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>
        self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formated with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A RoBERTa sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]

    @property
    def vocab_size(self):
        return self.fairseq_offset + len(self.sp_model)

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        return self.fairseq_offset + self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
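A short standalone sketch of the id bookkeeping set up in `__init__` above: the four fairseq control tokens keep ids 0-3, every SentencePiece id is shifted up by `fairseq_offset`, and `<mask>` is appended after the shifted vocabulary. The toy `sp_piece_to_id` table stands in for the real SentencePiece model and is illustrative only:

FAIRSEQ_TOKENS_TO_IDS = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
FAIRSEQ_OFFSET = len(FAIRSEQ_TOKENS_TO_IDS)  # 4

# Toy stand-in for sp_model.PieceToId on a tiny five-piece vocabulary.
sp_piece_to_id = {'<unk>': 0, '<s>': 1, '</s>': 2, '▁bonjour': 3, '▁monde': 4}

def convert_token_to_id(token):
    # Fairseq control tokens keep their reserved ids; everything else is offset.
    if token in FAIRSEQ_TOKENS_TO_IDS:
        return FAIRSEQ_TOKENS_TO_IDS[token]
    return FAIRSEQ_OFFSET + sp_piece_to_id[token]

print(convert_token_to_id('<pad>'))     # 1 (reserved fairseq id)
print(convert_token_to_id('▁bonjour'))  # 7 (SentencePiece id 3 + offset 4)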
transformers/tokenization_ctrl.py

@@ -46,6 +46,64 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'ctrl': 256,
 }

+CONTROL_CODES = {
+    "Pregnancy": 168629,
+    "Christianity": 7675,
+    "Explain": 106423,
+    "Fitness": 63440,
+    "Saving": 63163,
+    "Ask": 27171,
+    "Ass": 95985,
+    "Joke": 163509,
+    "Questions": 45622,
+    "Thoughts": 49605,
+    "Retail": 52342,
+    "Feminism": 164338,
+    "Writing": 11992,
+    "Atheism": 192263,
+    "Netflix": 48616,
+    "Computing": 39639,
+    "Opinion": 43213,
+    "Alone": 44967,
+    "Funny": 58917,
+    "Gaming": 40358,
+    "Human": 4088,
+    "India": 1331,
+    "Joker": 77138,
+    "Diet": 36206,
+    "Legal": 11859,
+    "Norman": 4939,
+    "Tip": 72689,
+    "Weight": 52343,
+    "Movies": 46273,
+    "Running": 23425,
+    "Science": 2090,
+    "Horror": 37793,
+    "Confession": 60572,
+    "Finance": 12250,
+    "Politics": 16360,
+    "Scary": 191985,
+    "Support": 12654,
+    "Technologies": 32516,
+    "Teenage": 66160,
+    "Event": 32769,
+    "Learned": 67460,
+    "Notion": 182770,
+    "Wikipedia": 37583,
+    "Books": 6665,
+    "Extract": 76050,
+    "Confessions": 102701,
+    "Conspiracy": 75932,
+    "Links": 63674,
+    "Narcissus": 150425,
+    "Relationship": 54766,
+    "Relationships": 134796,
+    "Reviews": 41671,
+    "News": 4256,
+    "Translation": 26820,
+    "multilingual": 128406,
+}
+
 def get_pairs(word):
     """Return set of symbol pairs in a word.

@@ -63,15 +121,12 @@ def get_pairs(word):
 class CTRLTokenizer(PreTrainedTokenizer):
     """
     CTRL BPE tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+        - Byte-Pair-Encoding
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    control_codes = CONTROL_CODES

     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
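The new `CONTROL_CODES` mapping exposes the vocabulary ids of CTRL's domain control codes on the tokenizer as the `control_codes` class attribute. A hedged sketch of how such a prefix id could be looked up and placed in front of a prompt's ids; the two-entry excerpt and the `prepend_control_code` helper are illustrative, not library API:

CONTROL_CODES = {"Books": 6665, "Wikipedia": 37583}  # excerpt of the mapping above

def prepend_control_code(code, prompt_ids):
    # CTRL conditions generation on a control code placed at the start of the input.
    if code not in CONTROL_CODES:
        raise ValueError("Unknown control code: {}".format(code))
    return [CONTROL_CODES[code]] + prompt_ids

print(prepend_control_code("Books", [101, 102, 103]))  # [6665, 101, 102, 103]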
transformers/tokenization_distilbert.py

@@ -33,12 +33,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
         'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
     }
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'distilbert-base-uncased': 512,
     'distilbert-base-uncased-distilled-squad': 512,
+    'distilbert-base-multilingual-cased': 512,
 }
transformers/tokenization_gpt2.py

@@ -46,6 +46,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
         'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
     },
     'merges_file':

@@ -53,6 +54,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
         'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
     },
 }

@@ -61,6 +63,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
     'gpt2-medium': 1024,
     'gpt2-large': 1024,
+    'gpt2-xl': 1024,
     'distilgpt2': 1024,
 }

@@ -104,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
         - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
+        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
           ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
+          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

@@ -181,7 +184,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         """ Tokenize a string.
             Args:
                 - add_prefix_space (boolean, default False):
-                    Begin the sentence with at least one space to to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
         """
         if add_prefix_space:
             text = ' ' + text
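The updated docstring reflects the `add_prefix_space` handling shown in the last hunk: when the flag is set, a single leading space is prepended before byte-level BPE so that a sentence-initial word is tokenized like a word in the middle of a sentence. A minimal sketch of just that flag logic (the `maybe_add_prefix_space` name is illustrative; the real method then continues with BPE):

def maybe_add_prefix_space(text, add_prefix_space=False):
    # Mirrors the guard in GPT2Tokenizer._tokenize above: the leading space
    # makes "Hello" look like " Hello", which byte-level BPE treats as a
    # different token than a string-initial "Hello".
    if add_prefix_space:
        text = ' ' + text
    return text

print(repr(maybe_add_prefix_space("Hello", add_prefix_space=True)))   # ' Hello'
print(repr(maybe_add_prefix_space("Hello", add_prefix_space=False)))  # 'Hello'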
transformers/tokenization_roberta.py

@@ -47,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
+        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
     },
     'merges_file':
     {

@@ -54,6 +56,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
         'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
+        'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
     },
 }

@@ -62,6 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'roberta-large': 512,
     'roberta-large-mnli': 512,
     'distilroberta-base': 512,
+    'roberta-base-openai-detector': 512,
+    'roberta-large-openai-detector': 512,
 }

@@ -114,7 +120,7 @@ class RobertaTokenizer(GPT2Tokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
transformers/tokenization_utils.py

@@ -21,6 +21,7 @@ import os
 import json
 import six
 import copy
+import itertools
 from io import open

 from .file_utils import cached_path, is_tf_available, is_torch_available

@@ -516,6 +517,8 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
+            if self.init_kwargs.get('do_lower_case', False):
+                token = token.lower()
             if token != self.unk_token and \
                     self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
                     token not in to_add_tokens:

@@ -609,6 +612,9 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
         """
+        if self.init_kwargs.get('do_lower_case', False):
+            text = text.lower()
+
         def split_on_token(tok, text):
             result = []
             split_text = text.split(tok)

@@ -645,9 +651,9 @@ class PreTrainedTokenizer(object):
                     tokenized_text += [sub_text]
             text_list = tokenized_text

-            return sum((self._tokenize(token, **kwargs) if token not \
+            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
                 in self.added_tokens_encoder and token not in self.all_special_tokens \
-                else [token] for token in tokenized_text), [])
+                else [token] for token in tokenized_text)))

         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)

@@ -675,10 +681,6 @@ class PreTrainedTokenizer(object):
         ids = []
         for token in tokens:
             ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
         return ids

     def _convert_token_to_id_with_added_voc(self, token):

@@ -693,14 +695,14 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError

     def encode(self,
                text,
                text_pair=None,
-               add_special_tokens=False,
+               add_special_tokens=True,
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
                return_tensors=None,
                **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.

@@ -743,7 +745,7 @@ class PreTrainedTokenizer(object):
     def encode_plus(self,
                     text,
                     text_pair=None,
-                    add_special_tokens=False,
+                    add_special_tokens=True,
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',

@@ -798,7 +800,7 @@ class PreTrainedTokenizer(object):
                                       truncation_strategy=truncation_strategy,
                                       return_tensors=return_tensors)

-    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first', return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.

@@ -881,6 +883,11 @@ class PreTrainedTokenizer(object):
                 encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
                 encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+
         return encoded_inputs

     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):

@@ -955,7 +962,7 @@ class PreTrainedTokenizer(object):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

@@ -1059,7 +1066,7 @@ class PreTrainedTokenizer(object):
         class attributes (cls_token, unk_token...).
         """
         all_toks = self.all_special_tokens
-        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
+        all_ids = self.convert_tokens_to_ids(all_toks)
         return all_ids

     @staticmethod
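The `_tokenize`-related hunk swaps `sum(generator, [])` for `itertools.chain.from_iterable`; both flatten a sequence of per-segment token lists into one list, but `chain.from_iterable` avoids rebuilding an intermediate list for every `+` and stays linear in the total number of tokens. A small standalone comparison with toy token lists:

import itertools

chunks = [["Hello", ","], ["[SEP]"], ["world", "!"]]  # per-segment token lists

flat_sum = sum(chunks, [])                                # quadratic copying as the result grows
flat_chain = list(itertools.chain.from_iterable(chunks))  # single linear pass

assert flat_sum == flat_chain == ["Hello", ",", "[SEP]", "world", "!"]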
transformers/tokenization_xlm.py

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
+"""Tokenization classes for XLM."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)

@@ -758,9 +758,9 @@ class XLMTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
+        A XLM sequence has the following format:
             single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+            pair of sequences: <s> A </s> B </s>
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

@@ -781,7 +781,7 @@ class XLMTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
transformers/tokenization_xlnet.py

@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An XLNet sequence has the following format:
+            single sequence: X <sep> <cls>
+            pair of sequences: A <sep> B <sep> <cls>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]

@@ -208,7 +208,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
                 special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:

@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
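The corrected docstrings describe XLNet's layout, where the special tokens come at the end of the input and the trailing <cls> gets its own segment id 2. A standalone sketch of a token-type mask in that format (toy ids and an illustrative helper name, not the library method itself):

def xlnet_token_type_ids(token_ids_0, token_ids_1=None):
    # XLNet layout: A <sep> (segment 0), B <sep> (segment 1), <cls> (segment 2).
    sep_len, cls_segment_id = 1, 2  # one <sep> per segment, one trailing <cls>
    if token_ids_1 is None:
        return [0] * (len(token_ids_0) + sep_len) + [cls_segment_id]
    return ([0] * (len(token_ids_0) + sep_len)
            + [1] * (len(token_ids_1) + sep_len)
            + [cls_segment_id])

print(xlnet_token_type_ids([11, 12, 13]))            # [0, 0, 0, 0, 2]
print(xlnet_token_type_ids([11, 12], [21, 22, 23]))  # [0, 0, 0, 1, 1, 1, 1, 2]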