chenpangpang / transformers · Commits

Commit ee20201d
Tokenization tests + fixes + init

Authored Oct 30, 2019 by Lysandre; committed by Lysandre Debut, Nov 26, 2019
Parent: e3ea5d1d

Showing 5 changed files with 102 additions and 19 deletions (+102 / -19)
transformers/__init__.py                        +5   -0
transformers/tests/fixtures/30k-clean.model     +0   -0
transformers/tests/tokenization_albert_test.py  +78  -0
transformers/tokenization_albert.py             +15  -15
transformers/tokenization_xlnet.py              +4   -4
transformers/__init__.py

@@ -42,6 +42,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_albert import AlbertTokenizer
 from .tokenization_camembert import CamembertTokenizer

 # Configurations
@@ -57,6 +58,8 @@ from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig
 from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

 # Modeling
@@ -104,6 +107,8 @@ if is_torch_available():
                                         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_albert import (AlbertModel,
+                                  AlbertForMaskedLM)

     # Optimization
     from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup,
                                get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup,
                                get_linear_schedule_with_warmup)
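With the package root now re-exporting the ALBERT classes, they become importable from transformers directly. A minimal sketch, assuming AlbertConfig constructs with usable defaults at this commit; the vocabulary path is illustrative:

from transformers import AlbertConfig, AlbertTokenizer

config = AlbertConfig()                          # assumed to carry default hyperparameters
tokenizer = AlbertTokenizer('30k-clean.model')   # illustrative path to a local SentencePiece model
# With the test fixture below, this yields [u'▁this', u'▁is', u'▁a', u'▁test']:
print(tokenizer.tokenize(u"this is a test"))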
transformers/tests/fixtures/30k-clean.model (new file, mode 100644)

File added (binary SentencePiece fixture used by the new tokenizer tests).
transformers/tests/tokenization_albert_test.py (new file, mode 100644)

# coding=utf-8
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest

from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)

from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/30k-clean.model')


class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = AlbertTokenizer

    def setUp(self):
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',',
                                      u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',',
                                           '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.'])

    def test_sequence_builders(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id]


if __name__ == '__main__':
    unittest.main()
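The suite hooks into the shared CommonTokenizerTester base, so the common round-trip checks run alongside the ALBERT-specific ones. A minimal sketch for running it programmatically with the standard library runner, assuming the repository root is importable (e.g. an editable install):

import unittest

# Load and run the new test module by dotted name.
suite = unittest.defaultTestLoader.loadTestsFromName('transformers.tests.tokenization_albert_test')
unittest.TextTestRunner(verbosity=2).run(suite)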
transformers/tokenization_albert.py

@@ -8,6 +8,7 @@ from shutil import copyfile
 logger = logging.getLogger(__name__)

 VOCAB_FILES_NAMES = {'vocab_file': '30k-clean.model'}

+SPIECE_UNDERLINE = u'▁'

 class AlbertTokenizer(PreTrainedTokenizer):
@@ -16,12 +17,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
         - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
-    # vocab_files_names = VOCAB_FILES_NAMES
+    vocab_files_names = VOCAB_FILES_NAMES
     # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False,
+    def __init__(self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False,
                  bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
                  pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
         super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
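Note the flipped default: do_lower_case is now True, which matches the lowercased SentencePiece fixture exercised above (tokenizing u'This is a test' yields u'▁this', ...). A minimal sketch of the difference; the vocab path is illustrative:

from transformers.tokenization_albert import AlbertTokenizer

lower_tok = AlbertTokenizer('30k-clean.model')                       # do_lower_case=True by default
cased_tok = AlbertTokenizer('30k-clean.model', do_lower_case=False,
                            keep_accents=True)                       # opt back out, preserve accents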
@@ -142,15 +143,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An ALBERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         if token_ids_1 is None:
-            return token_ids_0 + sep + cls
-        return token_ids_0 + sep + token_ids_1 + sep + cls
+            return cls + token_ids_0 + sep
+        return cls + token_ids_0 + sep + token_ids_1 + sep

     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
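The old body, copied from the XLNet tokenizer, appended the special tokens; the fix moves them to the BERT-style positions the docstring now describes. A worked sketch of the new layout, with hypothetical ids:

# Hypothetical ids, for illustration only.
token_ids_0 = [31, 23, 386]     # sequence A
token_ids_1 = [48, 25]          # sequence B
cls, sep = [101], [102]         # stand-ins for cls_token_id / sep_token_id

single = cls + token_ids_0 + sep                      # [CLS] X [SEP]
pair = cls + token_ids_0 + sep + token_ids_1 + sep    # [CLS] A [SEP] B [SEP]
assert single == [101, 31, 23, 386, 102]
assert pair == [101, 31, 23, 386, 102, 48, 25, 102]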
@@ -175,25 +176,24 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                             token_ids_0))

         if token_ids_1 is not None:
-            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
-        return ([0] * len(token_ids_0)) + [1, 1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]

     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
-        | first sequence    | second sequence     | CLS segment ID
+        An ALBERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
+        | first sequence       | second sequence

         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        cls_segment_id = [2]

         if token_ids_1 is None:
-            return len(token_ids_0 + sep + cls) * [0]
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
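Same story for the two mask helpers: both now assume the leading [CLS]. A worked sketch for the same hypothetical pair:

token_ids_0 = [31, 23, 386]
token_ids_1 = [48, 25]
cls, sep = [101], [102]   # hypothetical ids, as above

# get_special_tokens_mask: 1 flags special-token positions, 0 flags sequence tokens.
mask = [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
assert mask == [1, 0, 0, 0, 1, 0, 0, 1]

# create_token_type_ids_from_sequences: segment 0 covers [CLS] A [SEP],
# segment 1 covers B [SEP]; the XLNet-style CLS segment id 2 is gone.
type_ids = len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
assert type_ids == [0, 0, 0, 0, 0, 1, 1, 1]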
transformers/tokenization_xlnet.py

@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An XLNet sequence has the following format:
+            single sequence: X <sep> <cls>
+            pair of sequences: A <sep> B <sep> <cls>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
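For contrast with the ALBERT fix above: XLNet genuinely puts its special tokens at the end and gives <cls> its own segment id 2, which is what these corrected docstrings now say. A sketch with the same hypothetical ids:

token_ids_0 = [31, 23, 386]
token_ids_1 = [48, 25]
sep, cls = [102], [101]   # hypothetical ids

pair = token_ids_0 + sep + token_ids_1 + sep + cls                             # A <sep> B <sep> <cls>
type_ids = len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + [2]   # trailing 2 for <cls>
assert pair == [31, 23, 386, 102, 48, 25, 102, 101]
assert type_ids == [0, 0, 0, 0, 1, 1, 1, 2]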