chenpangpang / transformers · Commits

Commit 50792dbd (unverified)
Authored Aug 28, 2019 by Thomas Wolf, committed via GitHub on Aug 28, 2019
Parents: d06c5a2a, e7706f51

Merge pull request #1127 from huggingface/dilbert

DilBERT
Showing 5 changed files with 335 additions and 3 deletions (+335, -3):
pytorch_transformers/tests/modeling_common_test.py        +7    -0
pytorch_transformers/tests/modeling_dilbert_test.py       +217  -0
pytorch_transformers/tests/tokenization_bert_test.py      +3    -3
pytorch_transformers/tests/tokenization_dilbert_test.py   +46   -0
pytorch_transformers/tokenization_distilbert.py           +62   -0
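For context on what the new tests exercise, here is a minimal sketch of building a small DistilBERT model the way the tester below does. The class names and config argument names are exactly the ones imported and used in modeling_dilbert_test.py; the snippet itself is illustrative and not part of this commit.

from pytorch_transformers import (DistilBertConfig, DistilBertModel,
                                  DistilBertForMaskedLM,
                                  DistilBertForQuestionAnswering,
                                  DistilBertForSequenceClassification)

# Tiny randomly initialized model, mirroring the sizes DistilBertModelTester uses.
config = DistilBertConfig(vocab_size_or_config_json_file=99, dim=32,
                          n_layers=5, n_heads=4, hidden_dim=37)
model = DistilBertModel(config=config)
model.eval()  # the tests put the model in eval mode before each forward pass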
pytorch_transformers/tests/modeling_common_test.py
@@ -49,6 +49,7 @@ class CommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        test_head_masking = True

         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -159,6 +160,9 @@ class CommonTestCases:
         def test_headmasking(self):
+            if not self.test_head_masking:
+                return
+
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             config.output_attentions = True

@@ -282,6 +286,9 @@ class CommonTestCases:
             self.assertTrue(models_equal)

         def test_tie_model_weights(self):
+            if not self.test_torchscript:
+                return
+
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

             def check_same_values(layer_1, layer_2):
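The new test_head_masking switch works like the existing test_pruning / test_torchscript flags: a model-specific test class that does not support head masking can now opt out, and test_headmasking() returns early. A hypothetical opt-out looks like this (the class name is invented for illustration):

class SomeModelTest(CommonTestCases.CommonModelTester):
    test_head_masking = False   # test_headmasking() returns early for this model
    test_torchscript = False    # likewise, test_tie_model_weights() now returns early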
pytorch_transformers/tests/modeling_dilbert_test.py
new file (mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest

from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                  DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP

from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)


class DistilBertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
                         DistilBertForSequenceClassification)
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
    test_head_masking = True

    class DistilBertModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=False,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = DistilBertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
                hidden_dim=self.intermediate_size,
                hidden_act=self.hidden_act,
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range)

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertModel(config=config)
            model.eval()
            (sequence_output,) = model(input_ids, input_mask)
            (sequence_output,) = model(input_ids)

            result = {
                "sequence_output": sequence_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertForQuestionAnswering(config=config)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = DistilBertForSequenceClassification(config)
            model.eval()
            loss, logits = model(input_ids, input_mask, sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_distilbert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)

    # @pytest.mark.slow
    # def test_model_from_pretrained(self):
    #     cache_dir = "/tmp/pytorch_transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
    #         shutil.rmtree(cache_dir)
    #         self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
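The from_pretrained smoke test above is left commented out in this commit. A sketch of what it would exercise once enabled, using the same throwaway cache directory as the commented block; treating 'distilbert-base-uncased' as the first key of DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP is an assumption here.

import shutil
from pytorch_transformers import DistilBertModel

cache_dir = "/tmp/pytorch_transformers_test/"
# Download (or load) the checkpoint into a throwaway cache, then verify a model object was built.
model = DistilBertModel.from_pretrained("distilbert-base-uncased", cache_dir=cache_dir)
shutil.rmtree(cache_dir)
assert model is not None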
pytorch_transformers/tests/tokenization_bert_test.py
@@ -42,7 +42,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_tokenizer(self):
-        return BertTokenizer.from_pretrained(self.tmpdirname)
+        return self.tokenizer_class.from_pretrained(self.tmpdirname)

     def get_input_output_texts(self):
         input_text = u"UNwant\u00E9d,running"

@@ -50,7 +50,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return input_text, output_text

     def test_full_tokenizer(self):
-        tokenizer = BertTokenizer(self.vocab_file)
+        tokenizer = self.tokenizer_class(self.vocab_file)

         tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u" "))

     def test_sequence_builders(self):
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
pytorch_transformers/tests/tokenization_dilbert_test.py
new file (mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer)

from .tokenization_tests_commons import CommonTestCases
from .tokenization_bert_test import BertTokenizationTest


class DistilBertTokenizationTest(BertTokenizationTest):

    tokenizer_class = DistilBertTokenizer

    def get_tokenizer(self):
        return DistilBertTokenizer.from_pretrained(self.tmpdirname)

    def test_sequence_builders(self):
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]


if __name__ == '__main__':
    unittest.main()
pytorch_transformers/tokenization_distilbert.py
new file (mode 100644)
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'distilbert-base-uncased': 512,
    'distilbert-base-uncased-distilled-squad': 512,
}


class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.
    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end
    tokenization: punctuation splitting + wordpiece.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
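Because DistilBertTokenizer subclasses BertTokenizer without overriding any methods, it behaves exactly like the BERT tokenizer and only swaps in the vocabulary and size maps above. A minimal usage sketch follows; the model name and the add_special_tokens_single_sentence helper are the ones exercised in tokenization_dilbert_test.py, the rest is standard BertTokenizer usage.

from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")  # basic + wordpiece tokenization
ids = tokenizer.encode("sequence builders")            # token ids without special tokens

# Wrap a single sentence with [CLS] ... [SEP], as the tokenizer test checks (ids 101 / 102).
single = tokenizer.add_special_tokens_single_sentence(ids)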