chenpangpang / transformers · Commits · 21637d49

Unverified commit 21637d49, authored Nov 27, 2019 by Thomas Wolf, committed via GitHub on Nov 27, 2019.

Merge branch 'master' into do_lower_case

Parents: 7246d3c2, de2696f6
Showing 12 changed files with 819 additions and 33 deletions:

    transformers/tests/modeling_tf_albert_test.py       +231   −0
    transformers/tests/modeling_tf_bert_test.py            +0   −4
    transformers/tests/modeling_tf_common_test.py         +39   −0
    transformers/tests/optimization_test.py               +17  −12
    transformers/tests/tokenization_albert_test.py        +78   −0
    transformers/tests/tokenization_tests_commons.py      +21   −0
    transformers/tokenization_albert.py                  +252   −0
    transformers/tokenization_auto.py                      +7   −2
    transformers/tokenization_camembert.py               +157   −0
    transformers/tokenization_gpt2.py                      +4   −4
    transformers/tokenization_utils.py                     +9   −7
    transformers/tokenization_xlnet.py                     +4   −4
transformers/tests/modeling_tf_albert_test.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester

from transformers import AlbertConfig, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
                                                 TFAlbertForSequenceClassification,
                                                 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFAlbertModel, TFAlbertForMaskedLM,
                         TFAlbertForSequenceClassification) if is_tf_available() else ()

    class TFAlbertModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     embedding_size=16,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.embedding_size = embedding_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = TFAlbertModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # sequence_output, pooled_output = model(**inputs)

            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
            sequence_output, pooled_output = model(inputs)

            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output.numpy(),
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = TFAlbertForMaskedLM(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores, = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = TFAlbertForSequenceClassification(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            logits, = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape),
                [self.batch_size, self.num_labels])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ['albert-base-uncased']:
            model = TFAlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
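The new test exercises the three input formats accepted by the TF 2.0 models: a plain tensor, a list of tensors, or a dict of tensors. A minimal standalone sketch of the same call pattern, assuming TensorFlow 2 and this version of transformers are installed; the config values are illustrative toys, not real ALBERT sizes:

import tensorflow as tf

from transformers import AlbertConfig
from transformers.modeling_tf_albert import TFAlbertModel

# Toy configuration, sized for speed rather than realism.
config = AlbertConfig(vocab_size_or_config_json_file=99, hidden_size=32, num_hidden_layers=2,
                      num_attention_heads=4, intermediate_size=37)
model = TFAlbertModel(config=config)

input_ids = tf.constant([[31, 51, 12, 23, 5]])
attention_mask = tf.constant([[1, 1, 1, 1, 1]])

# All three call styles return the same (sequence_output, pooled_output) pair.
sequence_output, pooled_output = model(input_ids)                            # single tensor
sequence_output, pooled_output = model([input_ids, attention_mask])          # list of tensors
sequence_output, pooled_output = model({'input_ids': input_ids,
                                        'attention_mask': attention_mask})   # dict of tensors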
transformers/tests/modeling_tf_bert_test.py

@@ -131,10 +131,6 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = TFBertModel(config=config)
-            # inputs = {'input_ids': input_ids,
-            #           'attention_mask': input_mask,
-            #           'token_type_ids': token_type_ids}
-            # sequence_output, pooled_output = model(**inputs)
             inputs = {'input_ids': input_ids,
                       'attention_mask': input_mask,
                       'token_type_ids': token_type_ids}
transformers/tests/modeling_tf_common_test.py

@@ -360,6 +360,16 @@ class TFCommonTestCases:
             # self.assertTrue(models_equal)

+        def test_model_common_attributes(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+                x = model.get_output_embeddings()
+                assert x is None or isinstance(x, tf.keras.layers.Layer)
+
         def test_tie_model_weights(self):
             pass
             # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -401,6 +411,35 @@ class TFCommonTestCases:
             first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
             self.assertTrue(tf.math.equal(first, second).numpy().all())

+        def test_inputs_embeds(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+
+                wte = model.get_input_embeddings()
+                try:
+                    x = wte(input_ids, mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids], mode="embedding")
+                    except:
+                        try:
+                            x = wte([input_ids, None, None, None], mode="embedding")
+                        except:
+                            if hasattr(self.model_tester, "embedding_size"):
+                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                            else:
+                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+                # ^^ In our TF models, the input_embeddings can take slightly different forms,
+                # so we try a few of them.
+                # We used to fall back to just synthetically creating a dummy tensor of ones:
+                #
+                inputs_dict["inputs_embeds"] = x
+                outputs = model(inputs_dict)
+
 def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
     """Creates a random int32 tensor of the shape within the vocab size."""
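The added test_inputs_embeds checks that every TF model can be driven from precomputed embeddings instead of input_ids. A minimal sketch of that call path, assuming `model` and `config` are a small transformers TF model and its configuration (for example the toy ALBERT model built earlier); the all-ones tensor is only a stand-in for real embeddings:

import tensorflow as tf

input_ids = tf.constant([[31, 51, 12, 23, 5]])

# Shape (batch, seq_len, hidden_size); a real application would look these up
# from the model's embedding layer rather than use ones.
inputs_embeds = tf.ones(input_ids.shape + [config.hidden_size], dtype=tf.dtypes.float32)

outputs = model({'inputs_embeds': inputs_embeds})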
transformers/tests/optimization_test.py

@@ -25,8 +25,12 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch

-    from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
-                              WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule,
-                              WarmupLinearSchedule)
+    from transformers import (AdamW,
+                              get_constant_schedule, get_constant_schedule_with_warmup,
+                              get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup,
+                              get_linear_schedule_with_warmup)
 else:
     pytestmark = pytest.mark.skip("Require Torch")

@@ -87,59 +91,60 @@ class ScheduleInitTest(unittest.TestCase):
             self.assertAlmostEqual(a, b, delta=tol)

     def test_constant_scheduler(self):
-        scheduler = ConstantLRSchedule(self.optimizer)
+        scheduler = get_constant_schedule(self.optimizer)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [10.] * self.num_steps
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = ConstantLRSchedule(self.optimizer)
+        scheduler = get_constant_schedule(self.optimizer)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_constant_scheduler(self):
-        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_linear_scheduler(self):
-        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

-        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_cosine_scheduler(self):
-        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])

     def test_warmup_cosine_hard_restart_scheduler(self):
-        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
         lrs = unwrap_schedule(scheduler, self.num_steps)
         expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
         self.assertEqual(len(lrs[0]), 1)
         self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)

-        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
         lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
         self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])


 if __name__ == "__main__":
     unittest.main()
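The rewritten test targets the renamed scheduler API: the schedule classes (ConstantLRSchedule, WarmupConstantSchedule, WarmupLinearSchedule, WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule) are replaced by get_* factory functions, and the warmup_steps / t_total arguments become num_warmup_steps / num_training_steps. A minimal sketch of the new call style in a training loop, assuming PyTorch is installed (the linear layer merely stands in for a real model):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in for a real transformer model
optimizer = AdamW(model.parameters(), lr=5e-5)

# Previously: WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=1000)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)

for step in range(1000):
    # ... forward pass and loss.backward() would go here ...
    optimizer.step()
    scheduler.step()        # update the learning rate after each optimizer step
    optimizer.zero_grad()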
transformers/tests/tokenization_albert_test.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest

from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)

from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/spiece.model')


class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = AlbertTokenizer

    def setUp(self):
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"this is a test"
        output_text = u"this is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and',
                                      u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and',
                                           '▁this', '▁is', '▁fal', 's', '<unk>', '.'])

    def test_sequence_builders(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id]


if __name__ == '__main__':
    unittest.main()
transformers/tests/tokenization_tests_commons.py

@@ -190,6 +190,27 @@ class CommonTestCases:
             self.assertEqual(tokens[0], tokenizer.eos_token_id)
             self.assertEqual(tokens[-2], tokenizer.pad_token_id)

+        def test_add_special_tokens(self):
+            tokenizer = self.get_tokenizer()
+            input_text, output_text = self.get_input_output_texts()
+
+            special_token = "[SPECIAL TOKEN]"
+
+            tokenizer.add_special_tokens({"cls_token": special_token})
+            encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
+            assert len(encoded_special_token) == 1
+
+            text = " ".join([input_text, special_token, output_text])
+            encoded = tokenizer.encode(text, add_special_tokens=False)
+
+            input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
+            output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
+            special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
+            assert encoded == input_encoded + special_token_id + output_encoded
+
+            decoded = tokenizer.decode(encoded, skip_special_tokens=True)
+            assert special_token not in decoded
+
         def test_required_methods_tokenizer(self):
             tokenizer = self.get_tokenizer()
             input_text, output_text = self.get_input_output_texts()
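The new common test pins down the contract of add_special_tokens: a token registered this way must encode to a single id and must disappear again under decode(..., skip_special_tokens=True). A short sketch with a concrete tokenizer, assuming the bert-base-uncased vocabulary can be downloaded (purely illustrative):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({"cls_token": "[SPECIAL TOKEN]"})

ids = tokenizer.encode("[SPECIAL TOKEN]", add_special_tokens=False)
assert len(ids) == 1   # the registered token maps to exactly one id

encoded = tokenizer.encode("hello [SPECIAL TOKEN] world", add_special_tokens=False)
print(tokenizer.decode(encoded, skip_special_tokens=True))   # -> "hello world"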
transformers/tokenization_albert.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from .tokenization_utils import PreTrainedTokenizer
import logging
import unicodedata
import six
import os
from shutil import copyfile

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
        'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
        'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
        'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
        'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
        'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
        'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
        'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'albert-base-v1': 512,
    'albert-large-v1': 512,
    'albert-xlarge-v1': 512,
    'albert-xxlarge-v1': 512,
    'albert-base-v2': 512,
    'albert-large-v2': 512,
    'albert-xlarge-v2': 512,
    'albert-xxlarge-v2': 512,
}

SPIECE_UNDERLINE = u'▁'


class AlbertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False,
                 bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
                 pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
        super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                              unk_token=unk_token, sep_token=sep_token,
                                              pad_token=pad_token, cls_token=cls_token,
                                              mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = ' '.join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if six.PY2 and isinstance(outputs, str):
            outputs = outputs.decode('utf-8')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.
            return_unicode is used only for py2
        """
        text = self.preprocess_text(text)
        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
        if six.PY2 and isinstance(text, unicode):
            text = text.encode('utf-8')

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(
                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        # note(zhiliny): convert back to unicode for py2
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in new_pieces:
                if isinstance(piece, str):
                    piece = piece.decode('utf-8')
                ret_pieces.append(piece)
            new_pieces = ret_pieces

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode('utf-8')
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        An ALBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formated with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        An ALBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
        | first sequence      | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
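For reference, a short usage sketch of the new tokenizer, assuming the albert-base-v1 sentencepiece file can be downloaded (illustrative only):

from transformers.tokenization_albert import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")

# Lower-cased and accent-stripped by default; '▁' marks the start of a word.
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
ids = tokenizer.convert_tokens_to_ids(tokens)

# [CLS] ... [SEP] framing, as produced by build_inputs_with_special_tokens.
input_ids = tokenizer.build_inputs_with_special_tokens(ids)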
transformers/tokenization_auto.py

@@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_camembert import CamembertTokenizer

 logger = logging.getLogger(__name__)

@@ -41,6 +42,7 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)

@@ -64,8 +66,9 @@ class AutoTokenizer(object):
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
-            - contains `roberta`: RobertaTokenizer (XLM model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)

@@ -103,6 +106,8 @@ class AutoTokenizer(object):
         """
         if 'distilbert' in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'camembert' in pretrained_model_name_or_path:
+            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:

@@ -121,4 +126,4 @@ class AutoTokenizer(object):
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
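With the new branch, any model name containing camembert is routed to CamembertTokenizer before the roberta and bert checks run; the order of the elif chain matters here, because 'camembert' also contains the substring 'bert'. A one-line sketch, assuming the camembert-base files can be downloaded:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
print(type(tokenizer).__name__)   # -> CamembertTokenizer (not BertTokenizer)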
transformers/tokenization_camembert.py  (new file, mode 100644)

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import logging
import os
from shutil import copyfile

import sentencepiece as spm

from transformers.tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'camembert-base': None,
}


class CamembertTokenizer(PreTrainedTokenizer):
    """
        Adapted from RobertaTokenizer and XLNetTokenizer
        SentencePiece based tokenizer. Peculiarities:

            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'],
                 **kwargs):
        super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token,
                                                 unk_token=unk_token, sep_token=sep_token,
                                                 cls_token=cls_token, pad_token=pad_token,
                                                 mask_token=mask_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>
        self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
        self.fairseq_offset = len(self.fairseq_tokens_to_ids)
        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A RoBERTa sequence has the following format:
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError("You should not supply a second sequence if the provided sequence of "
                                 "ids is already formated with special tokens for the model.")
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A RoBERTa sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence     | second sequence

        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]

    @property
    def vocab_size(self):
        return self.fairseq_offset + len(self.sp_model)

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        return self.fairseq_offset + self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
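The fairseq_offset bookkeeping is the subtle part of this tokenizer: ids 0–3 are reserved for fairseq's control tokens, every sentencepiece piece is shifted up by that offset, and <mask> is appended after the whole sentencepiece vocabulary. A pure-Python sketch of the id arithmetic (no download needed; sp_size and the lambda stand in for the real sentencepiece model):

# Mirrors the mapping set up in CamembertTokenizer.__init__ (illustrative values).
sp_size = 32000                                              # stand-in for len(self.sp_model)
fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
fairseq_offset = len(fairseq_tokens_to_ids)                  # 4
fairseq_tokens_to_ids['<mask>'] = sp_size + fairseq_offset   # appended after the sentencepiece pieces

def token_to_id(token, sp_piece_to_id):
    if token in fairseq_tokens_to_ids:
        return fairseq_tokens_to_ids[token]
    return fairseq_offset + sp_piece_to_id(token)             # shift past the reserved ids

print(token_to_id('<pad>', lambda t: 0))       # -> 1 (reserved fairseq id)
print(token_to_id('▁bonjour', lambda t: 10))   # a piece with sentencepiece id 10 -> model id 14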
transformers/tokenization_gpt2.py

@@ -107,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
         - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
+        - Requires a space to start the input string => the encoding and tokenize methods should be called with the
           ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+          Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
+          the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

@@ -184,7 +184,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         """ Tokenize a string.
             Args:
                 - add_prefix_space (boolean, default False):
-                    Begin the sentence with at least one space to to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
         """
         if add_prefix_space:
             text = ' ' + text
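The corrected docstring now covers tokenize as well as encode/decode. A short sketch of the flag it documents, assuming the gpt2 vocabulary files can be downloaded ('Ġ' is how the byte-level BPE vocabulary marks a leading space):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

print(tokenizer.tokenize("Hello world"))                          # ['Hello', 'Ġworld']
print(tokenizer.tokenize("Hello world", add_prefix_space=True))   # ['ĠHello', 'Ġworld']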
transformers/tokenization_utils.py

@@ -21,6 +21,7 @@ import os
 import json
 import six
 import copy
+import itertools
 from io import open

 from .file_utils import cached_path, is_tf_available, is_torch_available

@@ -646,9 +647,9 @@ class PreTrainedTokenizer(object):
                         tokenized_text += [sub_text]
                 text_list = tokenized_text

-            return sum((self._tokenize(token, **kwargs) if token not \
+            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
                     in self.added_tokens_encoder and token not in self.all_special_tokens \
-                    else [token] for token in tokenized_text), [])
+                    else [token] for token in tokenized_text)))

         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)

@@ -676,10 +677,6 @@ class PreTrainedTokenizer(object):
         ids = []
         for token in tokens:
             ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
         return ids

     def _convert_token_to_id_with_added_voc(self, token):

@@ -882,6 +879,11 @@ class PreTrainedTokenizer(object):
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
             encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+
         return encoded_inputs

     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):

@@ -1060,7 +1062,7 @@ class PreTrainedTokenizer(object):
             class attributes (cls_token, unk_token...).
         """
         all_toks = self.all_special_tokens
-        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
+        all_ids = self.convert_tokens_to_ids(all_toks)
         return all_ids

     @staticmethod
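The switch from sum(..., []) to itertools.chain.from_iterable in split_on_tokens is a flattening change: sum copies the accumulated list once per sub-list (quadratic in the number of tokens), while chain.from_iterable flattens in a single pass. A small self-contained illustration of the equivalence:

import itertools

per_token_pieces = [['▁this'], ['▁is'], ['▁a', '▁test']]

flat_sum = sum(per_token_pieces, [])                                  # O(n^2) copying
flat_chain = list(itertools.chain.from_iterable(per_token_pieces))    # single pass
assert flat_sum == flat_chain == ['▁this', '▁is', '▁a', '▁test']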
transformers/tokenization_xlnet.py

@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        An XLNet sequence has the following format:
+            single sequence: X <sep> <cls>
+            pair of sequences: A <sep> B <sep> <cls>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]

@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
+        An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence | CLS segment ID
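The corrected docstrings describe XLNet's layout, in which the special tokens come after the sequences and the <cls> position carries segment id 2. A pure-Python sketch of the documented layout with illustrative ids (4 and 3 stand in for the real sep/cls token ids):

sep, cls = [4], [3]                      # illustrative stand-ins for sep_token_id / cls_token_id
ids_a, ids_b = [10, 11, 12], [20, 21]

single = ids_a + sep + cls               # X <sep> <cls>
pair = ids_a + sep + ids_b + sep + cls   # A <sep> B <sep> <cls>

# Segment ids: 0 for the first sequence, 1 for the second, 2 for the <cls> position.
segment_ids = len(ids_a + sep) * [0] + len(ids_b + sep) * [1] + [2]

print(pair)          # [10, 11, 12, 4, 20, 21, 4, 3]
print(segment_ids)   # [0, 0, 0, 0, 1, 1, 1, 2]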