chenpangpang / transformers

Commit 770043ee, authored Aug 07, 2019 by LysandreJik
Sentence-pair tasks handling. Using common tests on RoBERTa. Forced push to fix indentation.
Parent: cb9db101
Showing 5 changed files with 287 additions and 95 deletions:

  pytorch_transformers/__init__.py                          +3    -0
  pytorch_transformers/modeling_roberta.py                  +22   -6
  pytorch_transformers/tests/modeling_roberta_test.py       +165  -51
  pytorch_transformers/tests/tokenization_roberta_test.py   +29   -16
  pytorch_transformers/tokenization_roberta.py              +68   -22
pytorch_transformers/__init__.py

@@ -5,6 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)

 from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
@@ -33,6 +34,8 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel,
+                               ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                              PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
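With these exports in place, the RoBERTa classes resolve from the package root rather than from the individual pytorch_transformers.modeling_roberta / .tokenization_roberta modules. A minimal sketch of what the added lines enable (the exact checkpoint-name keys of the archive maps are an assumption, flagged in the comment):

from pytorch_transformers import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaTokenizer
from pytorch_transformers import (ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)

# The archive maps are dicts keyed by checkpoint name; presumably the same
# 'roberta-base' / 'roberta-large' / 'roberta-large-mnli' keys as the vocab map
# defined in tokenization_roberta.py further down.
print(sorted(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys()))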
pytorch_transformers/modeling_roberta.py

@@ -23,6 +23,7 @@ import logging
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss

 from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
                                                 BertLayerNorm, BertModel,
@@ -78,7 +79,7 @@ class RobertaModel(BertModel):
         super(RobertaModel, self).__init__(config)

         self.embeddings = RobertaEmbeddings(config)
         self.apply(self.init_weights)


 class RobertaForMaskedLM(BertPreTrainedModel):
@@ -94,16 +95,31 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                position_ids=None, head_mask=None):
         outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                                attention_mask=attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)

         outputs = (prediction_scores,) + outputs[2:]
-        return outputs
+
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            outputs = (masked_lm_loss,) + outputs
+
+        return outputs


 class RobertaLMHead(nn.Module):
@@ -114,7 +130,7 @@ class RobertaLMHead(nn.Module):
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.weight = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))

     def forward(self, features, **kwargs):
@@ -123,6 +139,6 @@ class RobertaLMHead(nn.Module):
         x = self.layer_norm(x)

         # project back to size of vocabulary with bias
-        x = F.linear(x, self.weight) + self.bias
+        x = self.decoder(x) + self.bias

         return x
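For orientation, here is a minimal sketch of the new masked-LM path added above: when masked_lm_labels is passed, the forward call now prepends a CrossEntropyLoss (ignore_index=-1) to the outputs tuple. The checkpoint name and token ids are taken from the tests in this commit; treat the snippet as illustrative rather than as documented API.

import torch
from pytorch_transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained('roberta-base')
model.eval()

# "Hello world!" as RoBERTa ids, taken from the tokenization test in this commit.
input_ids = torch.tensor([[0, 31414, 232, 328, 2]])

# Without labels the first output is still the prediction scores, as before.
prediction_scores = model(input_ids)[0]

# With masked_lm_labels the outputs become (masked_lm_loss, prediction_scores, ...).
loss, prediction_scores = model(input_ids, masked_lm_labels=input_ids)[:2]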
pytorch_transformers/tests/modeling_roberta_test.py

@@ -12,58 +12,172 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function

+import os
 import unittest
+import shutil
 import pytest

-import torch
-
-from pytorch_transformers.modeling_roberta import (RobertaForMaskedLM,
-                                                    RobertaModel)
+from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM)
+from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)


-class RobertaModelTest(unittest.TestCase):
+class RobertaModelTest(CommonTestCases.CommonModelTester):

-    # @pytest.mark.slow
-    def test_inference_masked_lm(self):
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(
-            output.shape,
-            expected_shape
-        )
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779],
-              [4.6533, -2.8099, 13.6252],
-              [1.8222, -3.6898, 8.8600]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-        )
-
-    # @pytest.mark.slow
-    def test_inference_no_head(self):
-        model = RobertaModel.from_pretrained('roberta-base')
-
-        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[-0.0231, 0.0782, 0.0074],
-              [-0.1854, 0.0539, -0.0174],
-              [0.0548, 0.0799, 0.1687]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-        )
+    all_model_classes = (RobertaForMaskedLM, RobertaModel)
+
+    class RobertaModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                     ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = RobertaConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = RobertaModel(config=config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
+            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["pooled_output"].size()),
+                [self.batch_size, self.hidden_size])
+
+        def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = RobertaForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = RobertaModelTest.RobertaModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_roberta_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_roberta_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)


-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
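A quick way to exercise just the rewritten test case is sketched below; it assumes CommonTestCases.CommonModelTester is a unittest.TestCase subclass, as in the other *_test.py files that use the common test machinery. Note that the pytest.mark.slow marker on test_model_from_pretrained only takes effect under pytest, so running under plain unittest will also trigger the checkpoint download.

import unittest
from pytorch_transformers.tests.modeling_roberta_test import RobertaModelTest

# Collect and run only the RoBERTa model tests added in this commit.
suite = unittest.TestLoader().loadTestsFromTestCase(RobertaModelTest)
unittest.TextTestRunner(verbosity=2).run(suite)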
pytorch_transformers/tests/tokenization_roberta_test.py

@@ -12,32 +12,45 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
-import pytest
+import six

-from pytorch_transformers.tokenization_roberta import RobertaTokenizer
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory


 class RobertaTokenizationTest(unittest.TestCase):

-    # @pytest.mark.slow
     def test_full_tokenizer(self):
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-
-        self.assertListEqual(
-            tokenizer.encode('Hello world!'),
-            [0, 31414, 232, 328, 2]
-        )
-
-        self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip'),
-            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
-        )
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
+                 "lo", "low", "er",
+                 "low", "lowest", "newer", "wider", "<unk>"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        if six.PY3:
+            special_tokens_map = {"unk_token": "<unk>"}
+            with TemporaryDirectory() as tmpdirname:
+                vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+                with open(vocab_file, "w") as fp:
+                    [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)]
+
+                input_text = u"lower newer"
+                output_text = u"lower<unk>newer"
+
+                create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map)
+
+                tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map)
+                text = "lower"
+                bpe_tokens = ["low", "er"]
+                tokens = tokenizer.tokenize(text)
+                self.assertListEqual(tokens, bpe_tokens)
+
+                input_tokens = tokens + [tokenizer.unk_token]
+                input_bpe_tokens = [13, 12, 17]
+                self.assertListEqual(
+                    tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)


 if __name__ == '__main__':
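The fixture the new test builds is just a whitespace-separated "symbol number" file written under the name the tokenizer expects (VOCAB_FILES_NAMES['vocab_file'], i.e. dict.txt), which Dictionary.add_from_file then parses. A condensed, Python 3-only sketch of the same setup outside the test harness (temporary path and unk_token choice are illustrative):

import os
import tempfile

from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES

vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
         "lo", "low", "er",
         "low", "lowest", "newer", "wider", "<unk>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))

with tempfile.TemporaryDirectory() as tmpdirname:
    # One "symbol number" pair per line, as Dictionary.add_from_file expects.
    vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])   # 'dict.txt'
    with open(vocab_file, "w") as fp:
        for index, symbol in enumerate(vocab_tokens):
            fp.write(f"{symbol} {index}\n")

    # Note: constructing the tokenizer also pulls the GPT-2 BPE files via
    # GPT2Tokenizer.from_pretrained('gpt2'), so this needs network access.
    tokenizer = RobertaTokenizer(vocab_file, unk_token="<unk>")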
pytorch_transformers/tokenization_roberta.py

@@ -22,22 +22,22 @@ import re
 from io import open

 import six

-from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_gpt2 import GPT2Tokenizer

 logger = logging.getLogger(__name__)

 VOCAB_FILES_NAMES = {
-    'dict_file': 'dict.txt',
+    'vocab_file': 'dict.txt',
 }

 PRETRAINED_VOCAB_FILES_MAP = {
-    'dict_file':
+    'vocab_file':
     {
         'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
     },
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -46,7 +46,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'roberta-large-mnli': 512,
 }

 SPACE_NORMALIZER = re.compile(r"\s+")

-
 def tokenize_line(line):
@@ -142,7 +141,7 @@ class Dictionary(object):
                                 "rebuild the dataset".format(f))
             return

-        lines = f.readlines()
+        lines = f.read().splitlines()
         for line in lines:
             idx = line.rfind(' ')
             if idx == -1:
@@ -152,7 +151,7 @@ class Dictionary(object):
             self.indices[word] = len(self.symbols)
             self.symbols.append(word)
             self.count.append(count)

     def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
                     consumer=None, append_eos=True, reverse_order=False):
         words = line_tokenizer(line)
@@ -174,8 +173,6 @@ class Dictionary(object):
         return ids

-
-
 class RobertaTokenizer(PreTrainedTokenizer):
     """
     RoBERTa tokenizer. Peculiarities:
@@ -185,25 +182,53 @@ class RobertaTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, dict_file,
+    def __init__(self, vocab_file,
                  bos_token="<s>", eos_token="</s>", **kwargs):
-        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+        super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token, **kwargs)

         self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        self.dictionary = Dictionary.load(dict_file)
+        self.dictionary = Dictionary.load(vocab_file)

     def _tokenize(self, text):
         """ Use GPT-2 Tokenizer """
         return self.gpt2_tokenizer._tokenize(text)

-    def encode(self, text):
+    def encode(self, text, *args):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
         """
-        gpt2_tokens_joined = " ".join(
-            str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text))
-        )
-        bpe_sentence = '<s> ' + gpt2_tokens_joined + ' </s>'
-        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
+        bpe_sentence = [self.cls_token] + \
+                       self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text)) + \
+                       [self.sep_token]
+
+        if len(args):
+            for additional_sentence in args:
+                bpe_sentence += [self.sep_token] + \
+                                self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(additional_sentence)) + \
+                                [self.sep_token]
+
+        return self.dictionary.encode_line(' '.join([str(token) for token in bpe_sentence]), append_eos=False)
+
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+            with options to remove special tokens and clean up tokenization spaces.
+            Handles sentence pairs.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        if any(isinstance(element, list) for element in filtered_tokens):
+            texts = []
+            for element in filtered_tokens:
+                text = self.convert_tokens_to_string(element)
+                if clean_up_tokenization_spaces:
+                    text = clean_up_tokenization(text)
+                texts.append(text)
+            return texts
+        else:
+            text = self.convert_tokens_to_string(filtered_tokens)
+            if clean_up_tokenization_spaces:
+                text = clean_up_tokenization(text)
+            return text

     def _convert_token_to_id(self, token):
         return self.dictionary.index(token)
@@ -218,3 +243,24 @@ class RobertaTokenizer(PreTrainedTokenizer):
     def convert_tokens_to_string(self, tokens):
         return self.gpt2_tokenizer.convert_tokens_to_string(tokens)
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        # Remove the first and last tokens which are cls and sep tokens
+        ids = ids[1:-1]
+
+        # If multi sentence, then split (multi sentence found by looking for two sequential sep tokens)
+        ids = [list(map(int, example.split(' '))) for example in ' '.join([str(id) for id in ids]).split(' 2 2 ')]
+
+        if len(ids) == 1:
+            tokens = self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), ids[0])))
+        else:
+            tokens = []
+            for example in ids:
+                tokens += [self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), example)))]
+
+        return tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        tokens = " ".join(str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(tokens))
+        bpe_sentence = '<s> ' + tokens + ' </s>'
+        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
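This is where the commit message's "sentence-pair tasks handling" lands: encode() now accepts extra sentences through *args and joins them as <cls> A <sep> <sep> B <sep>, while decode()/convert_ids_to_tokens() detect a pair by the two consecutive separator ids (the ' 2 2 ' split above) and return one string per sentence. A minimal usage sketch built on those new signatures (checkpoint name as used in the tests; the int() conversion is an assumption about the tensor type Dictionary.encode_line returns):

from pytorch_transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Single sentence: <cls> tokens <sep>
single_ids = tokenizer.encode("Hello world!")

# Sentence pair: <cls> tokens_a <sep> <sep> tokens_b <sep>
pair_ids = tokenizer.encode("Hello world!", "How are you?")

# decode() round-trips both; for a pair it returns a list with one string per sentence.
print(tokenizer.decode([int(i) for i in single_ids]))
print(tokenizer.decode([int(i) for i in pair_ids]))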