chenpangpang / transformers · Commits

Commit d5712f7c (Unverified)
Authored Dec 12, 2019 by Thomas Wolf; committed via GitHub on Dec 12, 2019.

    Merge branch 'master' into check-link-validity

Parents: f230d91b, 9c58b236

Changes: 98. Showing 20 changed files with 445 additions and 116 deletions (+445, -116).
transformers/modeling_ctrl.py                         +2   -2
transformers/modeling_gpt2.py                         +3   -3
transformers/modeling_openai.py                       +4   -2
transformers/modeling_tf_bert.py                      +10  -6
transformers/modeling_tf_ctrl.py                      +2   -2
transformers/modeling_tf_distilbert.py                +47  -0
transformers/modeling_tf_gpt2.py                      +3   -3
transformers/modeling_tf_utils.py                     +8   -3
transformers/modeling_utils.py                        +12  -5
transformers/optimization_tf.py                       +254 -0
transformers/tests/conftest.py                        +0   -31
transformers/tests/modeling_albert_test.py            +7   -4
transformers/tests/modeling_auto_test.py              +12  -7
transformers/tests/modeling_bert_test.py              +20  -18
transformers/tests/modeling_common_test.py            +33  -10
transformers/tests/modeling_ctrl_test.py              +5   -4
transformers/tests/modeling_distilbert_test.py        +8   -4
transformers/tests/modeling_encoder_decoder_test.py   +3   -4
transformers/tests/modeling_gpt2_test.py              +6   -4
transformers/tests/modeling_openai_test.py            +6   -4
transformers/modeling_ctrl.py

...
@@ -252,7 +252,7 @@ class CTRLModel(CTRLPreTrainedModel):
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
             should not be passed as input ids as they have already been computed.
...
@@ -438,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
             should not be passed as input ids as they have already been computed.
...
transformers/modeling_gpt2.py

...
@@ -329,7 +329,7 @@ class GPT2Model(GPT2PreTrainedModel):
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
             should not be passed as input ids as they have already been computed.
...
@@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
             should not be passed as input ids as they have already been computed.
...
@@ -596,7 +596,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
             Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
         **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
             should not be passed as input ids as they have already been computed.
...
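These hunks make the documented shape of `past` match what the models actually return: each layer caches its keys and values (hence the leading 2), sized per attention head rather than per query position. A minimal sketch of the incremental decoding the docstring alludes to, assuming the 'gpt2' checkpoint is reachable::

    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    input_ids = torch.tensor(tokenizer.encode("Hello, my dog")).unsqueeze(0)  # batch size 1
    with torch.no_grad():
        # The first pass over the whole prompt also returns the key/value cache.
        logits, past = model(input_ids)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        # The second pass feeds only the new token; `past` stands in for the prompt.
        logits, past = model(next_token, past=past)

    # One cache entry per layer: (2, batch_size, num_heads, sequence_length, embed_size_per_head)
    print(past[0].shape)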
transformers/modeling_openai.py

...
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
-    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
-    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
+    with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
+        names = json.load(names_handle)
+    with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
+        shapes = json.load(shapes_handle)
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
     init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
...
transformers/modeling_tf_bert.py

...
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }
...
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                 linear tensor, float32 with shape [batch_size, length, vocab_size].
             Raises:
                 ValueError: if mode is not valid.

             Shared weights logic adapted from
                 https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
...
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             input_shape = shape_list(input_ids)
         else:
             input_shape = shape_list(inputs_embeds)[:-1]

         seq_length = input_shape[1]
         if position_ids is None:
             position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
...
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)

         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
         context_layer = tf.reshape(context_layer,
                                    (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)

         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
...
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
         `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

     Parameters:
         config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
...
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
         (a) For sequence pairs:

             ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``

             ``token_type_ids:   0   0    0    0     0       0   0   0    1   1  1  1   1   1``

         (b) For single sequences:

             ``tokens:         [CLS] the dog is hairy . [SEP]``

             ``token_type_ids:   0   0   0   0  0     0   0``

     Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
...
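The `token_type_ids` layout spelled out above is exactly what the tokenizer produces for a sequence pair. A small sketch, assuming the `encode_plus` API of this generation of the library::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoded = tokenizer.encode_plus("is this jacksonville?", "no it is not.")

    # 0 covers [CLS], the first segment and its [SEP]; 1 covers the second segment.
    print(encoded["input_ids"])
    print(encoded["token_type_ids"])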
transformers/modeling_tf_ctrl.py

...
@@ -400,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
         **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...
@@ -462,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
         **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...
transformers/modeling_tf_distilbert.py

...
@@ -704,6 +704,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
         return outputs  # logits, (hidden_states), (attentions)

+@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
+
+        tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
+        model = TFDistilBertForTokenClassification.from_pretrained('bert-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
+        self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.distilbert(inputs, **kwargs)
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)

 @add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
     DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
...
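The new head returns per-token logits, so a prediction per token is one argmax away. A sketch along the lines of the docstring example (the checkpoint name is illustrative, and an untuned classification head starts from random weights)::

    import tensorflow as tf
    from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')

    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1
    scores = model(input_ids)[0]             # (1, sequence_length, config.num_labels)
    label_ids = tf.argmax(scores, axis=-1)   # one predicted label id per token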
transformers/modeling_tf_gpt2.py

...
@@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
         **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...
@@ -476,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...
@@ -535,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         **mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
             Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
         **past**:
-            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
             that contains pre-computed hidden-states (key and values in the attention blocks).
             Can be used (see `past` input) to speed up sequential decoding.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
...
transformers/modeling_tf_utils.py

...
@@ -24,7 +24,8 @@ import os

 import tensorflow as tf

 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
...
@@ -257,10 +258,14 @@ class TFPreTrainedModel(tf.keras.Model):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
-                raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
+                if from_pt:
+                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")

             # redirect to the cache, if necessary
             try:
...
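With this change, `from_pretrained` accepts three kinds of sources: a local file, a remote URL, or a bare model identifier that is resolved against the S3 bucket. `is_remote_url` and `hf_bucket_url` live in `.file_utils`, which this diff does not show; the following is only a plausible sketch of their behavior, not the actual implementation::

    from urllib.parse import urlparse

    S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"  # assumed base URL

    def is_remote_url(url_or_filename):
        # http(s)/s3 schemes count as remote; bare identifiers and paths do not.
        return urlparse(url_or_filename).scheme in ("http", "https", "s3")

    def hf_bucket_url(identifier, postfix=None):
        # Map an identifier such as 'bert-base-uncased' to its bucket URL.
        if postfix is None:
            return "/".join((S3_BUCKET_PREFIX, identifier))
        return "/".join((S3_BUCKET_PREFIX, identifier, postfix))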
transformers/modeling_utils.py

...
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F

 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)

 logger = logging.getLogger(__name__)
...
@@ -318,7 +319,8 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

         """
-        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
+        if pretrained_model_name_or_path is not None and (
+                "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
             logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
                            "https://github.com/google-research/google-research/issues/119 for more information.")
...
@@ -362,11 +364,16 @@ class PreTrainedModel(nn.Module):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
-            else:
-                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")

             # redirect to the cache, if necessary
             try:
...
transformers/optimization_tf.py (new file, 0 → 100644)

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re

import tensorflow as tf


class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Applies a warmup schedule on a given learning rate decay schedule."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
        super(WarmUp, self).__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or 'WarmUp') as name:
            # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
            # learning rate will be `global_step/num_warmup_steps * init_lr`.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = (self.initial_learning_rate *
                                    tf.math.pow(warmup_percent_done, self.power))
            return tf.cond(global_step_float < warmup_steps_float,
                           lambda: warmup_learning_rate,
                           lambda: self.decay_schedule_fn(step),
                           name=name)

    def get_config(self):
        return {
            'initial_learning_rate': self.initial_learning_rate,
            'decay_schedule_fn': self.decay_schedule_fn,
            'warmup_steps': self.warmup_steps,
            'power': self.power,
            'name': self.name
        }
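With power=1.0 the warmup is linear from zero up to the initial rate, after which the wrapped decay schedule takes over. A quick way to probe the composed schedule in eager mode (values are illustrative)::

    import tensorflow as tf

    decay = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1e-4, decay_steps=1000, end_learning_rate=0.0)
    schedule = WarmUp(initial_learning_rate=1e-4, decay_schedule_fn=decay, warmup_steps=100)

    for step in (0, 50, 100, 500, 1000):
        # Ramps linearly below step 100, then follows PolynomialDecay.
        print(step, float(schedule(step)))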
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    optimizer = AdamWeightDecay(
        learning_rate=learning_rate_fn,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])
    return optimizer
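A sketch of wiring this into a fine-tuning run (the step counts and learning rate are illustrative, with the usual ~10% of steps spent on warmup)::

    num_train_steps = 10 * 1000  # e.g. 10 epochs of 1000 batches
    optimizer = create_optimizer(init_lr=3e-5,
                                 num_train_steps=num_train_steps,
                                 num_warmup_steps=num_train_steps // 10)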
class AdamWeightDecay(tf.keras.optimizers.Adam):
    """Adam enables L2 weight decay and clip_by_global_norm on gradients.

    Just adding the square of the weights to the loss function is *not* the
    correct way of using L2 regularization/weight decay with Adam, since that will
    interact with the m and v parameters in strange ways.

    Instead we want to decay the weights in a manner that doesn't interact with
    the m/v parameters. This is equivalent to adding the square of the weights to
    the loss with plain (non-momentum) SGD.
    """

    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7,
                 amsgrad=False, weight_decay_rate=0.0, include_in_weight_decay=None,
                 exclude_from_weight_decay=None, name='AdamWeightDecay', **kwargs):
        super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        apply_state['weight_decay_rate'] = tf.constant(self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(learning_rate * var * apply_state['weight_decay_rate'],
                                  use_locking=self._use_locking)
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, clip_norm, name=None):
        grads, tvars = list(zip(*grads_and_vars))
        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True
        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
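Note that this subclass changes the `apply_gradients` signature: `clip_norm` is a required argument, and the gradients are globally clipped before the Adam update and the decoupled decay are applied. A sketch of one update step, assuming `model`, `loss`, and a `tf.GradientTape` named `tape` are already in scope::

    grads = tape.gradient(loss, model.trainable_variables)
    # clip_norm is required here, unlike with stock tf.keras.optimizers.Adam.
    optimizer.apply_gradients(zip(grads, model.trainable_variables), clip_norm=1.0)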
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
    """Distribution strategies-aware gradient accumulation utility."""

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = tf.Variable(
            initial_value=0,
            dtype=tf.int64,
            trainable=False,
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

    @property
    def step(self):
        """Number of accumulated steps."""
        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients."""
        return list(gradient.value() if gradient is not None else gradient
                    for gradient in self._get_replica_gradients())

    def __call__(self, gradients):
        """Accumulates :obj:`gradients`."""
        if not self._gradients:
            self._gradients.extend([
                tf.Variable(tf.zeros_like(gradient), trainable=False)
                if gradient is not None else gradient
                for gradient in gradients])

        if len(gradients) != len(self._gradients):
            raise ValueError("Expected %s gradients, but got %d" % (
                len(self._gradients), len(gradients)))

        for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
            if accum_gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients."""
        if self._gradients:
            self._accum_steps.assign(0)
            for gradient in self._get_replica_gradients():
                if gradient is not None:
                    gradient.assign(tf.zeros_like(gradient))

    def _get_replica_gradients(self):
        if tf.distribute.has_strategy():
            # In a replica context, we want to accumulate gradients on each replica
            # without synchronization, so we directly assign the value of the
            # current replica.
            replica_context = tf.distribute.get_replica_context()
            if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
                return self._gradients
            return (
                gradient.device_map.select_for_current_replica(gradient.values, replica_context)
                for gradient in self._gradients)
        else:
            return self._gradients
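A sketch of how the accumulator composes with the optimizer above to emulate a larger batch; `dataset`, `model`, and `compute_loss` are stand-ins::

    accumulator = GradientAccumulator()
    accum_steps = 4  # emulate a 4x larger batch

    for features, labels in dataset:
        with tf.GradientTape() as tape:
            loss = compute_loss(model(features), labels)
        accumulator(tape.gradient(loss, model.trainable_variables))
        if accumulator.step % accum_steps == 0:
            optimizer.apply_gradients(zip(accumulator.gradients, model.trainable_variables),
                                      clip_norm=1.0)
            accumulator.reset()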
transformers/tests/conftest.py (deleted, 100644 → 0)

# content of conftest.py

import pytest


def pytest_addoption(parser):
    parser.addoption("--runslow", action="store_true",
                     default=False, help="run slow tests")
    parser.addoption("--use_cuda", action="store_true",
                     default=False, help="run tests on gpu")


def pytest_configure(config):
    config.addinivalue_line("markers", "slow: mark test as slow to run")


def pytest_collection_modifyitems(config, items):
    if config.getoption("--runslow"):
        # --runslow given in cli: do not skip slow tests
        return
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)


@pytest.fixture
def use_cuda(request):
    """ Run test on gpu """
    return request.config.getoption("--use_cuda")
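The command-line switches this file provided are superseded by decorators the tests now import from a `.utils` module that is not part of this diff. A minimal sketch of what those decorators might look like, assuming an environment-variable gate such as RUN_SLOW::

    import os
    import unittest

    from transformers import is_torch_available

    def slow(test_case):
        # Assumed gate: run slow tests only when RUN_SLOW=1 is set.
        if not bool(int(os.environ.get("RUN_SLOW", 0))):
            return unittest.skip("test is slow")(test_case)
        return test_case

    def require_torch(test_case):
        # Skip the decorated test or test class when PyTorch is absent.
        if not is_torch_available():
            return unittest.skip("test requires PyTorch")(test_case)
        return test_case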
transformers/tests/modeling_albert_test.py

...
@@ -18,22 +18,21 @@ from __future__ import print_function

 import unittest
 import shutil
-import pytest

 from transformers import is_torch_available

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device

 if is_torch_available():
     from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
                               AlbertForSequenceClassification, AlbertForQuestionAnswering)
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

+@require_torch
 class AlbertModelTest(CommonTestCases.CommonModelTester):

-    all_model_classes = (AlbertModel, AlbertForMaskedLM)
+    all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
...
@@ -133,6 +132,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
...
@@ -150,6 +150,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
...
@@ -163,6 +164,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
...
@@ -183,6 +185,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = AlbertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
...
@@ -225,7 +228,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
             config_and_inputs = self.model_tester.prepare_config_and_inputs()
             self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

-        @pytest.mark.slow
+        @slow
         def test_model_from_pretrained(self):
             cache_dir = "/tmp/transformers_test/"
             for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
transformers/tests/modeling_auto_test.py

...
@@ -18,11 +18,12 @@ from __future__ import print_function

 import unittest
 import shutil
-import pytest
 import logging

 from transformers import is_torch_available

+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER

 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
                               AutoModel, BertModel,
...
@@ -33,12 +34,11 @@ if is_torch_available():

     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

+@require_torch
 class AutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
             for value in loading_info.values():
                 self.assertEqual(len(value), 0)

-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForMaskedLM)

-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForSequenceClassification)

-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForQuestionAnswering)

+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)

 if __name__ == "__main__":
     unittest.main()
transformers/tests/modeling_bert_test.py

...
@@ -18,12 +18,12 @@ from __future__ import print_function

 import unittest
 import shutil
-import pytest

 from transformers import is_torch_available

 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device

 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
...
@@ -31,11 +31,9 @@ if is_torch_available():
                               BertForQuestionAnswering,
                               BertForSequenceClassification, BertForTokenClassification,
                               BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

-@pytest.mark.usefixtures("use_cuda")
+@require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):

     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
...
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     device='cpu',
                      ):
             self.parent = parent
             self.batch_size = batch_size
...
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.device = device

         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

             input_mask = None
             if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

             token_type_ids = None
             if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

             sequence_labels = None
             token_labels = None
             choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)

             config = BertConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
...
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
-            model.to(input_ids.device)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
...
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertModel(config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    encoder_hidden_states=encoder_hidden_states)
...
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
...
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels,
                                             encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels,
                                             encoder_hidden_states=encoder_hidden_states)
...
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.to(torch_device)
             model.eval()
             loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
...
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                                     masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
...
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
...
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
...
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
...
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
+            model.to(torch_device)
             model.eval()
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
...
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def test_config(self):
             self.config_tester.run_common_tests()

-        def test_bert_model(self, use_cuda=False):
-            # ^^ This could be a real fixture
-            if use_cuda:
-                self.model_tester.device = "cuda"
+        def test_bert_model(self):
             config_and_inputs = self.model_tester.prepare_config_and_inputs()
             self.model_tester.create_and_check_bert_model(*config_and_inputs)
...
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config_and_inputs = self.model_tester.prepare_config_and_inputs()
             self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)

-        @pytest.mark.slow
+        @slow
         def test_model_from_pretrained(self):
             cache_dir = "/tmp/transformers_test/"
             for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
transformers/tests/modeling_common_test.py

...
@@ -27,10 +27,11 @@ import uuid

 import unittest
 import logging
-import pytest

 from transformers import is_torch_available
+from .utils import require_torch, slow, torch_device

 if is_torch_available():
     import torch
     import numpy as np
...
@@ -38,8 +39,6 @@ if is_torch_available():
     from transformers import (AdaptiveEmbedding,
                               PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

 if sys.version_info[0] == 2:
     import cPickle as pickle
...
@@ -65,6 +64,7 @@ def _config_zero_init(config):

 class CommonTestCases:

+    @require_torch
     class CommonModelTester(unittest.TestCase):

         model_tester = None
...
@@ -79,6 +79,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
...
@@ -86,12 +87,13 @@ class CommonTestCases:
             with TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
+                model.to(torch_device)
                 with torch.no_grad():
                     after_outputs = model(**inputs_dict)

                 # Make sure we don't have nans
-                out_1 = after_outputs[0].numpy()
-                out_2 = outputs[0].numpy()
+                out_1 = after_outputs[0].cpu().numpy()
+                out_2 = outputs[0].cpu().numpy()
                 out_1 = out_1[~np.isnan(out_1)]
                 out_2 = out_2[~np.isnan(out_2)]
                 max_diff = np.amax(np.abs(out_1 - out_2))
...
@@ -113,6 +115,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
...
@@ -125,6 +128,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
...
@@ -142,6 +146,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 self.assertEqual(out_len + 1, len(outputs))
...
@@ -181,6 +186,7 @@ class CommonTestCases:
             configs_no_init.torchscript = True
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
...
@@ -201,7 +207,10 @@ class CommonTestCases:
                 except ValueError:
                     self.fail("Couldn't load module.")

+                model.to(torch_device)
                 model.eval()
+
+                loaded_model.to(torch_device)
                 loaded_model.eval()

                 model_params = model.parameters()
...
@@ -228,11 +237,12 @@ class CommonTestCases:
             configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()

                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
...
@@ -282,6 +292,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
...
@@ -310,6 +321,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]}
...
@@ -319,6 +331,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)

                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
...
@@ -346,6 +359,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()

                 outputs = model(**inputs_dict)
...
@@ -372,6 +386,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()

                 outputs = model(**inputs_dict)
...
@@ -388,6 +403,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 shutil.rmtree(directory)

                 outputs = model(**inputs_dict)
...
@@ -419,6 +435,7 @@ class CommonTestCases:
                 config.output_hidden_states = True
                 config.output_attentions = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
...
@@ -538,6 +555,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()

                 wte = model.get_input_embeddings()
...
@@ -628,6 +646,7 @@ class CommonTestCases:
         def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids):
             model = self.base_model_class(config)
+            model.to(torch_device)
             model.eval()

             outputs = model(input_ids, position_ids, token_type_ids)
...
@@ -643,6 +662,7 @@ class CommonTestCases:
         def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids):
             model = self.lm_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
...
@@ -659,6 +679,7 @@ class CommonTestCases:
                                       mc_labels, lm_labels, mc_token_ids):
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(input_ids)
                 presents = outputs[-1]
...
@@ -671,6 +692,7 @@ class CommonTestCases:
         def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids):
             model = self.double_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
...
@@ -716,7 +738,7 @@ class CommonTestCases:
             config_and_inputs = self.prepare_config_and_inputs()
             self.create_and_check_presents(*config_and_inputs)

-        @pytest.mark.slow
+        @slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
...
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))

-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()


 def floats_tensor(shape, scale=1.0, rng=None, name=None):
...
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)

-    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()


+@require_torch
 class ModelUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
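`torch_device` also comes from the new `.utils` module; creating tensors directly on it (see `ids_tensor` and `floats_tensor` above) replaces the per-tester `device` attribute that the BERT test dropped. A plausible definition, assuming it simply prefers CUDA when available::

    import torch

    # Assumed: pick the GPU when one is visible, otherwise fall back to CPU.
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"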
transformers/tests/modeling_ctrl_test.py
View file @
d5712f7c
...
...
@@ -16,7 +16,6 @@ from __future__ import division
from
__future__
import
print_function
import
unittest
import
pytest
import
shutil
import
pdb
...
...
@@ -25,13 +24,13 @@ from transformers import is_torch_available
if
is_torch_available
():
from
transformers
import
(
CTRLConfig
,
CTRLModel
,
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
,
CTRLLMHeadModel
)
else
:
pytestmark
=
pytest
.
mark
.
skip
(
"Require Torch"
)
from
.modeling_common_test
import
(
CommonTestCases
,
ids_tensor
)
from
.configuration_common_test
import
ConfigTester
from
.utils
import
require_torch
,
slow
,
torch_device
@
require_torch
class
CTRLModelTest
(
CommonTestCases
.
CommonModelTester
):
all_model_classes
=
(
CTRLModel
,
CTRLLMHeadModel
)
if
is_torch_available
()
else
()
...
...
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
def
create_and_check_ctrl_model
(
self
,
config
,
input_ids
,
input_mask
,
head_mask
,
token_type_ids
,
*
args
):
model
=
CTRLModel
(
config
=
config
)
model
.
to
(
torch_device
)
model
.
eval
()
model
(
input_ids
,
token_type_ids
=
token_type_ids
,
head_mask
=
head_mask
)
...
...
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask,
                                       token_type_ids, *args):
        model = CTRLLMHeadModel(config)
+        model.to(torch_device)
        model.eval()
        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
...
...
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
...
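Standalone, the updated CTRL LM-head check boils down to the following pattern (a sketch with tiny illustrative sizes; the config argument names follow this version of the library and are an assumption):

import torch
from transformers import CTRLConfig, CTRLLMHeadModel

config = CTRLConfig(vocab_size_or_config_json_file=99, n_positions=32, n_ctx=32,
                    n_embd=32, dff=64, n_layer=2, n_head=4)
model = CTRLLMHeadModel(config)
model.eval()

input_ids = torch.randint(0, 99, (2, 7), dtype=torch.long)
# labels=input_ids makes the model compute a shifted next-token loss
loss, lm_logits, past = model(input_ids, labels=input_ids)
assert lm_logits.shape == (2, 7, 99)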
transformers/tests/modeling_distilbert_test.py
View file @ d5712f7c
...
...
@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function

import unittest
-import pytest

from transformers import is_torch_available
...
...
@@ -25,13 +24,13 @@ if is_torch_available():
    from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                              DistilBertForTokenClassification, DistilBertForQuestionAnswering,
                              DistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device


+@require_torch
class DistilBertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
...
...
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_model(self, config, input_ids, input_mask,
                                          sequence_labels, token_labels, choice_labels):
        model = DistilBertModel(config=config)
+        model.to(torch_device)
        model.eval()
        (sequence_output,) = model(input_ids, input_mask)
        (sequence_output,) = model(input_ids)
...
...
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask,
                                                  sequence_labels, token_labels, choice_labels):
        model = DistilBertForMaskedLM(config=config)
+        model.to(torch_device)
        model.eval()
        loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
        result = {
...
...
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask,
                                                           sequence_labels, token_labels, choice_labels):
        model = DistilBertForQuestionAnswering(config=config)
+        model.to(torch_device)
        model.eval()
        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask,
                                               start_positions=sequence_labels, end_positions=sequence_labels)
        result = {
...
...
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask,
                                                                sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = DistilBertForSequenceClassification(config)
+        model.to(torch_device)
        model.eval()
        loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
        result = {
...
...
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask,
                                                             sequence_labels, token_labels, choice_labels):
        config.num_labels = self.num_labels
        model = DistilBertForTokenClassification(config=config)
+        model.to(torch_device)
        model.eval()
        loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
...
...
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)

-    # @pytest.mark.slow
+    # @slow
    # def test_model_from_pretrained(self):
    #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
...
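The question-answering check above, extracted into a standalone sketch (sizes illustrative; the config argument names are an assumption): start_positions and end_positions supervise the two span logits.

import torch
from transformers import DistilBertConfig, DistilBertForQuestionAnswering

config = DistilBertConfig(vocab_size_or_config_json_file=99, dim=32, hidden_dim=37,
                          n_layers=2, n_heads=4)
model = DistilBertForQuestionAnswering(config)
model.eval()

input_ids = torch.randint(0, 99, (2, 7), dtype=torch.long)
span = torch.zeros(2, dtype=torch.long)  # pretend each answer starts and ends at token 0
loss, start_logits, end_logits = model(input_ids, start_positions=span, end_positions=span)
assert start_logits.shape == (2, 7) and end_logits.shape == (2, 7)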
transformers/tests/modeling_encoder_decoder_test.py
View file @ d5712f7c
...
...
@@ -15,19 +15,18 @@
import logging
import unittest
-import pytest

from transformers import is_torch_available

+from .utils import require_torch, slow

if is_torch_available():
    from transformers import BertModel, BertForMaskedLM, Model2Model
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")


+@require_torch
class EncoderDecoderModelTest(unittest.TestCase):

-    @pytest.mark.slow
+    @slow
    def test_model2model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
...
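The single gated test in this file presumably boils down to checking that a pretrained encoder-decoder pair loads (a sketch; it needs network access to fetch the weights, which is exactly why it is marked slow):

from transformers import Model2Model

model = Model2Model.from_pretrained("bert-base-uncased")
# the encoder-decoder wrapper exposes the two halves as attributes
print(type(model.encoder).__name__, type(model.decoder).__name__)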
transformers/tests/modeling_gpt2_test.py
View file @ d5712f7c
...
...
@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function

import unittest
-import pytest
import shutil

from transformers import is_torch_available
...
...
@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
    from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                              GPT2LMHeadModel, GPT2DoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device


+@require_torch
class GPT2ModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
...
...
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask,
                                    token_type_ids, *args):
        model = GPT2Model(config=config)
+        model.to(torch_device)
        model.eval()
        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
...
...
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask,
                                       token_type_ids, *args):
        model = GPT2LMHeadModel(config)
+        model.to(torch_device)
        model.eval()
        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
...
...
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask,
                                              token_type_ids, mc_token_ids, *args):
        model = GPT2DoubleHeadsModel(config)
+        model.to(torch_device)
        model.eval()
...
...
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
...
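For the double-heads check, the extra input is mc_token_ids: for each multiple-choice candidate it names the position whose hidden state feeds the classification head. A standalone sketch (tiny illustrative sizes; config argument names and output ordering follow this version of the library and are an assumption):

import torch
from transformers import GPT2Config, GPT2DoubleHeadsModel

config = GPT2Config(vocab_size_or_config_json_file=99, n_positions=32, n_ctx=32,
                    n_embd=32, n_layer=2, n_head=4)
model = GPT2DoubleHeadsModel(config)
model.eval()

input_ids = torch.randint(0, 99, (2, 3, 7), dtype=torch.long)   # (batch, choices, seq)
mc_token_ids = torch.full((2, 3), 6, dtype=torch.long)          # classify on each choice's last token
mc_labels = torch.zeros(2, dtype=torch.long)                    # say choice 0 is correct
lm_loss, mc_loss, lm_logits, mc_logits, past = model(
    input_ids, mc_token_ids=mc_token_ids, lm_labels=input_ids, mc_labels=mc_labels)
assert mc_logits.shape == (2, 3)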
transformers/tests/modeling_openai_test.py
View file @ d5712f7c
...
...
@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function

import unittest
-import pytest
import shutil

from transformers import is_torch_available
...
...
@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
    from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")

from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device


+@require_torch
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
...
...
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTModel(config=config)
+        model.to(torch_device)
        model.eval()
        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
...
...
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTLMHeadModel(config)
+        model.to(torch_device)
        model.eval()
        loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
...
...
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTDoubleHeadsModel(config)
+        model.to(torch_device)
        model.eval()
        loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
...
...
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
...
...
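All of these slow from_pretrained tests share one shape, sketched below: download into a throwaway cache, assert a model came back, then delete the cache (the checkpoint shortcut is the usual public one):

import shutil
from transformers import OpenAIGPTModel

cache_dir = "/tmp/transformers_test/"
model = OpenAIGPTModel.from_pretrained("openai-gpt", cache_dir=cache_dir)
shutil.rmtree(cache_dir)  # keep CI machines clean
assert model is not None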