chenpangpang / transformers / Commits / 611d3a09

Unverified commit 611d3a09, authored Mar 15, 2022 by Minh Chien Vu, committed by GitHub on Mar 15, 2022
Change unpacking of TF inputs: layoutlm, mpnet, rag, and roformer (#16112)
Co-authored-by: ChienVM <chien_vm@detomo.co.jp>

parent 0d7322c1
Showing 4 changed files with 223 additions and 626 deletions (+223 -626)
src/transformers/models/layoutlm/modeling_tf_layoutlm.py   +45 -129
src/transformers/models/mpnet/modeling_tf_mpnet.py          +64 -171
src/transformers/models/rag/modeling_tf_rag.py              +42 -129
src/transformers/models/roformer/modeling_tf_roformer.py    +72 -197
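All four files apply the same refactor: the explicit `input_processing(...)` call at the top of each `call` method, together with the `inputs["..."]` dictionary lookups that followed it, is replaced by the `@unpack_inputs` decorator imported from `modeling_tf_utils`, which normalizes the arguments before the method body runs so the body can refer to them by name. A schematic before/after sketch of the pattern (not runnable on its own: the surrounding Keras layer class is omitted and the argument list is trimmed to two keywords for brevity; `input_processing`, `unpack_inputs`, and `shape_list` are the library helpers referenced in the import hunks below):

# Before: arguments are funnelled through input_processing and read back
# from the returned dict everywhere in the body.
def call(self, input_ids=None, attention_mask=None, training=False, **kwargs):
    inputs = input_processing(
        func=self.call,
        config=self.config,
        input_ids=input_ids,
        attention_mask=attention_mask,
        training=training,
        kwargs_call=kwargs,
    )
    if inputs["attention_mask"] is None:
        inputs["attention_mask"] = tf.fill(dims=shape_list(inputs["input_ids"]), value=1)
    return self.encoder(inputs["input_ids"], inputs["attention_mask"], training=inputs["training"])

# After: the decorator performs the same normalization before the body runs,
# so the body uses the arguments directly.
@unpack_inputs
def call(self, input_ids=None, attention_mask=None, training=False, **kwargs):
    if attention_mask is None:
        attention_mask = tf.fill(dims=shape_list(input_ids), value=1)
    return self.encoder(input_ids, attention_mask, training=training)

The public call signatures are unchanged; only the internal bookkeeping moves into the decorator.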
src/transformers/models/layoutlm/modeling_tf_layoutlm.py
...
@@ -37,8 +37,8 @@ from ...modeling_tf_utils import (
     TFSequenceClassificationLoss,
     TFTokenClassificationLoss,
     get_initializer,
-    input_processing,
     keras_serializable,
+    unpack_inputs,
 )
 from ...tf_utils import shape_list
 from ...utils import logging
...
@@ -691,6 +691,7 @@ class TFLayoutLMMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError

+    @unpack_inputs
     def call(
         self,
         input_ids: Optional[TFModelInputType] = None,
...
@@ -708,47 +709,31 @@ class TFLayoutLMMainLayer(tf.keras.layers.Layer):
         training: bool = False,
         **kwargs,
     ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            bbox=bbox,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=training,
-            kwargs_call=kwargs,
-        )
-        if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
+        if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif inputs["input_ids"] is not None:
-            input_shape = shape_list(inputs["input_ids"])
-        elif inputs["inputs_embeds"] is not None:
-            input_shape = shape_list(inputs["inputs_embeds"])[:-1]
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        if inputs["attention_mask"] is None:
-            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)

-        if inputs["token_type_ids"] is None:
-            inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0)
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)

-        if inputs["bbox"] is None:
-            inputs["bbox"] = tf.fill(dims=input_shape + [4], value=0)
+        if bbox is None:
+            bbox = tf.fill(dims=input_shape + [4], value=0)

         embedding_output = self.embeddings(
-            input_ids=inputs["input_ids"],
-            bbox=inputs["bbox"],
-            position_ids=inputs["position_ids"],
-            token_type_ids=inputs["token_type_ids"],
-            inputs_embeds=inputs["inputs_embeds"],
-            training=inputs["training"],
+            input_ids=input_ids,
+            bbox=bbox,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            training=training,
         )

         # We create a 3D attention mask from a 2D tensor mask.
...
@@ -756,7 +741,7 @@ class TFLayoutLMMainLayer(tf.keras.layers.Layer):
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
+        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))

         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
...
@@ -773,30 +758,30 @@ class TFLayoutLMMainLayer(tf.keras.layers.Layer):
         # attention_probs has shape bsz x n_heads x N x N
         # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if inputs["head_mask"] is not None:
+        if head_mask is not None:
             raise NotImplementedError
         else:
-            inputs["head_mask"] = [None] * self.config.num_hidden_layers
+            head_mask = [None] * self.config.num_hidden_layers

         encoder_outputs = self.encoder(
             hidden_states=embedding_output,
             attention_mask=extended_attention_mask,
-            head_mask=inputs["head_mask"],
+            head_mask=head_mask,
             # Need to pass these required positional arguments to `Encoder`
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=None,
             past_key_values=None,
             use_cache=False,
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
         )

         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None

-        if not inputs["return_dict"]:
+        if not return_dict:
             return (
                 sequence_output,
                 pooled_output,
...
@@ -924,6 +909,7 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
         self.layoutlm = TFLayoutLMMainLayer(config, name="layoutlm")

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(
         output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC
...
@@ -979,9 +965,7 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.layoutlm(
             input_ids=input_ids,
             bbox=bbox,
             attention_mask=attention_mask,
...
@@ -989,26 +973,10 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
             position_ids=position_ids,
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.layoutlm(
-            input_ids=inputs["input_ids"],
-            bbox=inputs["bbox"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )

         return outputs
...
@@ -1064,6 +1032,7 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
         warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
         return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
     def call(
...
@@ -1127,9 +1096,7 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
         >>> loss = outputs.loss
         ```"""
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.layoutlm(
             input_ids=input_ids,
             bbox=bbox,
             attention_mask=attention_mask,
...
@@ -1140,32 +1107,13 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.layoutlm(
-            input_ids=inputs["input_ids"],
-            bbox=inputs["bbox"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"])
-        loss = (
-            None
-            if inputs["labels"] is None
-            else self.hf_compute_loss(labels=inputs["labels"], logits=prediction_scores)
-        )
+        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (prediction_scores,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -1208,6 +1156,7 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
             name="classifier",
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def call(
...
@@ -1271,9 +1220,7 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
         >>> loss = outputs.loss
         >>> logits = outputs.logits
         ```"""
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.layoutlm(
             input_ids=input_ids,
             bbox=bbox,
             attention_mask=attention_mask,
...
@@ -1284,29 +1231,14 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.layoutlm(
-            input_ids=inputs["input_ids"],
-            bbox=inputs["bbox"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         pooled_output = outputs[1]
-        pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"])
+        pooled_output = self.dropout(inputs=pooled_output, training=training)
         logits = self.classifier(inputs=pooled_output)
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -1355,6 +1287,7 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
             name="classifier",
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
     def call(
...
@@ -1416,9 +1349,7 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
         >>> loss = outputs.loss
         >>> logits = outputs.logits
         ```"""
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.layoutlm(
             input_ids=input_ids,
             bbox=bbox,
             attention_mask=attention_mask,
...
@@ -1429,29 +1360,14 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.layoutlm(
-            input_ids=inputs["input_ids"],
-            bbox=inputs["bbox"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         sequence_output = outputs[0]
-        sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"])
+        sequence_output = self.dropout(inputs=sequence_output, training=training)
         logits = self.classifier(inputs=sequence_output)
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
src/transformers/models/mpnet/modeling_tf_mpnet.py
...
@@ -45,8 +45,8 @@ from ...modeling_tf_utils import (
     TFSequenceClassificationLoss,
     TFTokenClassificationLoss,
     get_initializer,
-    input_processing,
     keras_serializable,
+    unpack_inputs,
 )
 from ...tf_utils import shape_list
 from ...utils import logging
...
@@ -485,6 +485,7 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError

+    @unpack_inputs
     def call(
         self,
         input_ids=None,
...
@@ -498,38 +499,24 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
         training=False,
         **kwargs,
     ):
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=training,
-            kwargs_call=kwargs,
-        )
-        if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
+        if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif inputs["input_ids"] is not None:
-            input_shape = shape_list(inputs["input_ids"])
-        elif inputs["inputs_embeds"] is not None:
-            input_shape = shape_list(inputs["inputs_embeds"])[:-1]
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        if inputs["attention_mask"] is None:
-            inputs["attention_mask"] = tf.fill(input_shape, 1)
+        if attention_mask is None:
+            attention_mask = tf.fill(input_shape, 1)

         embedding_output = self.embeddings(
-            inputs["input_ids"],
-            inputs["position_ids"],
-            inputs["inputs_embeds"],
-            training=inputs["training"],
+            input_ids,
+            position_ids,
+            inputs_embeds,
+            training=training,
         )

         # We create a 3D attention mask from a 2D tensor mask.
...
@@ -537,7 +524,7 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
+        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))

         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
...
@@ -554,25 +541,25 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
         # attention_probs has shape bsz x n_heads x N x N
         # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if inputs["head_mask"] is not None:
+        if head_mask is not None:
             raise NotImplementedError
         else:
-            inputs["head_mask"] = [None] * self.num_hidden_layers
+            head_mask = [None] * self.num_hidden_layers

         encoder_outputs = self.encoder(
             embedding_output,
             extended_attention_mask,
-            inputs["head_mask"],
-            inputs["output_attentions"],
-            inputs["output_hidden_states"],
-            inputs["return_dict"],
-            training=inputs["training"],
+            head_mask,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            training=training,
         )

         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)

-        if not inputs["return_dict"]:
+        if not return_dict:
             return (
                 sequence_output,
                 pooled_output,
...
@@ -680,6 +667,7 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
         super().__init__(config, *inputs, **kwargs)
         self.mpnet = TFMPNetMainLayer(config, name="mpnet")

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -700,9 +688,7 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
         training=False,
         **kwargs,
     ):
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.mpnet(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
...
@@ -712,18 +698,6 @@ class TFMPNetModel(TFMPNetPreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.mpnet(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )

         return outputs
...
@@ -809,6 +783,7 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
         warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
         return self.name + "/" + self.lm_head.name

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -836,11 +811,8 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
+        outputs = self.mpnet(
+            input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             head_mask=head_mask,
...
@@ -848,27 +820,14 @@ class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.mpnet(
-            inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)

-        loss = None if inputs["labels"] is None else self.hf_compute_loss(inputs["labels"], prediction_scores)
+        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (prediction_scores,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -930,6 +889,7 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
         self.mpnet = TFMPNetMainLayer(config, name="mpnet")
         self.classifier = TFMPNetClassificationHead(config, name="classifier")

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -957,11 +917,8 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
+        outputs = self.mpnet(
+            input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             head_mask=head_mask,
...
@@ -969,28 +926,15 @@ class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassif
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.mpnet(
-            inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )

         sequence_output = outputs[0]
         logits = self.classifier(sequence_output, training=training)

-        loss = None if inputs["labels"] is None else self.hf_compute_loss(inputs["labels"], logits)
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -1036,6 +980,7 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
         """
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1062,59 +1007,39 @@ class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
             Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
             where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            labels=labels,
-            training=training,
-            kwargs_call=kwargs,
-        )
-        if inputs["input_ids"] is not None:
-            num_choices = shape_list(inputs["input_ids"])[1]
-            seq_length = shape_list(inputs["input_ids"])[2]
+        if input_ids is not None:
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
         else:
-            num_choices = shape_list(inputs["inputs_embeds"])[1]
-            seq_length = shape_list(inputs["inputs_embeds"])[2]
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]

-        flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None
-        flat_attention_mask = (
-            tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None
-        )
-        flat_position_ids = (
-            tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None
-        )
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
         flat_inputs_embeds = (
-            tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3]))
-            if inputs["inputs_embeds"] is not None
+            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+            if inputs_embeds is not None
             else None
         )
         outputs = self.mpnet(
             flat_input_ids,
             flat_attention_mask,
             flat_position_ids,
-            inputs["head_mask"],
+            head_mask,
             flat_inputs_embeds,
-            inputs["output_attentions"],
-            inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
         )
         pooled_output = outputs[1]
-        pooled_output = self.dropout(pooled_output, training=inputs["training"])
+        pooled_output = self.dropout(pooled_output, training=training)
         logits = self.classifier(pooled_output)
         reshaped_logits = tf.reshape(logits, (-1, num_choices))
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(inputs["labels"], reshaped_logits)
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (reshaped_logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
@@ -1167,6 +1092,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1192,10 +1118,7 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
         labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.mpnet(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
...
@@ -1204,29 +1127,16 @@ class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificatio
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.mpnet(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         sequence_output = outputs[0]

-        sequence_output = self.dropout(sequence_output, training=inputs["training"])
+        sequence_output = self.dropout(sequence_output, training=training)
         logits = self.classifier(sequence_output)

-        loss = None if inputs["labels"] is None else self.hf_compute_loss(inputs["labels"], logits)
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[1:]
             return ((loss,) + output) if loss is not None else output
...
@@ -1265,6 +1175,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1297,11 +1208,8 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
             Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
             are not taken into account for computing the loss.
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
+        outputs = self.mpnet(
+            input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             head_mask=head_mask,
...
@@ -1309,21 +1217,7 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            start_positions=start_positions,
-            end_positions=end_positions,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.mpnet(
-            inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            position_ids=inputs["position_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
         )
         sequence_output = outputs[0]
...
@@ -1333,12 +1227,11 @@ class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLos
         end_logits = tf.squeeze(end_logits, axis=-1)
         loss = None

-        if inputs["start_positions"] is not None and inputs["end_positions"] is not None:
-            labels = {"start_position": inputs["start_positions"]}
-            labels["end_position"] = inputs["end_positions"]
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions, "end_position": end_positions}
             loss = self.hf_compute_loss(labels, (start_logits, end_logits))

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (start_logits, end_logits) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
...
src/transformers/models/rag/modeling_tf_rag.py
View file @
611d3a09
...
@@ -23,7 +23,7 @@ import tensorflow as tf
...
@@ -23,7 +23,7 @@ import tensorflow as tf
from
...configuration_utils
import
PretrainedConfig
from
...configuration_utils
import
PretrainedConfig
from
...file_utils
import
ModelOutput
,
add_start_docstrings_to_model_forward
,
replace_return_docstrings
from
...file_utils
import
ModelOutput
,
add_start_docstrings_to_model_forward
,
replace_return_docstrings
from
...modeling_tf_utils
import
TFCausalLanguageModelingLoss
,
TFPreTrainedModel
,
input_processing
,
shape_list
from
...modeling_tf_utils
import
TFCausalLanguageModelingLoss
,
TFPreTrainedModel
,
shape_list
,
unpack_inputs
from
...utils
import
logging
from
...utils
import
logging
from
.configuration_rag
import
RagConfig
from
.configuration_rag
import
RagConfig
from
.retrieval_rag
import
RagRetriever
from
.retrieval_rag
import
RagRetriever
...
@@ -532,6 +532,7 @@ class TFRagModel(TFRagPreTrainedModel):
...
@@ -532,6 +532,7 @@ class TFRagModel(TFRagPreTrainedModel):
def
set_retriever
(
self
,
retriever
:
RagRetriever
):
def
set_retriever
(
self
,
retriever
:
RagRetriever
):
self
.
retriever
=
retriever
self
.
retriever
=
retriever
@
unpack_inputs
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMOutput
,
config_class
=
_CONFIG_FOR_DOC
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMOutput
,
config_class
=
_CONFIG_FOR_DOC
)
def
call
(
def
call
(
...
@@ -580,46 +581,8 @@ class TFRagModel(TFRagPreTrainedModel):
...
@@ -580,46 +581,8 @@ class TFRagModel(TFRagPreTrainedModel):
"decoder_cached_states"
not
in
kwargs
"decoder_cached_states"
not
in
kwargs
),
"Please use past_key_values to cache intermediate outputs"
# from modeling_tf_bart.py
),
"Please use past_key_values to cache intermediate outputs"
# from modeling_tf_bart.py
inputs
=
input_processing
(
func
=
self
.
call
,
config
=
self
.
config
,
input_ids
=
input_ids
,
attention_mask
=
attention_mask
,
decoder_input_ids
=
decoder_input_ids
,
decoder_attention_mask
=
decoder_attention_mask
,
encoder_outputs
=
encoder_outputs
,
past_key_values
=
past_key_values
,
doc_scores
=
doc_scores
,
context_input_ids
=
context_input_ids
,
context_attention_mask
=
context_attention_mask
,
use_cache
=
use_cache
,
output_attentions
=
output_attentions
,
output_hidden_states
=
output_hidden_states
,
output_retrieved
=
output_retrieved
,
return_dict
=
return_dict
,
n_docs
=
n_docs
,
training
=
training
,
kwargs_call
=
kwargs
,
)
# aliasing to minimize code changing
# aliasing to minimize code changing
input_ids
=
inputs
[
"input_ids"
]
n_docs
=
n_docs
if
n_docs
is
not
None
else
self
.
config
.
n_docs
attention_mask
=
inputs
[
"attention_mask"
]
decoder_input_ids
=
inputs
[
"decoder_input_ids"
]
decoder_attention_mask
=
inputs
[
"decoder_attention_mask"
]
encoder_outputs
=
inputs
[
"encoder_outputs"
]
past_key_values
=
inputs
[
"past_key_values"
]
doc_scores
=
inputs
[
"doc_scores"
]
context_input_ids
=
inputs
[
"context_input_ids"
]
context_attention_mask
=
inputs
[
"context_attention_mask"
]
use_cache
=
inputs
[
"use_cache"
]
output_attentions
=
inputs
[
"output_attentions"
]
output_hidden_states
=
inputs
[
"output_hidden_states"
]
return_dict
=
inputs
[
"return_dict"
]
n_docs
=
inputs
[
"n_docs"
]
if
inputs
[
"n_docs"
]
is
not
None
else
self
.
config
.
n_docs
output_retrieved
=
inputs
[
"output_retrieved"
]
training
=
inputs
[
"training"
]
# whether retriever has to be used
# whether retriever has to be used
has_to_retrieve
=
(
has_to_retrieve
=
(
...
@@ -855,6 +818,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
...
@@ -855,6 +818,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
log_prob_sum
=
seq_logprobs
+
doc_logprobs
log_prob_sum
=
seq_logprobs
+
doc_logprobs
return
tf
.
reduce_logsumexp
(
log_prob_sum
,
axis
=
1
)
return
tf
.
reduce_logsumexp
(
log_prob_sum
,
axis
=
1
)
@
unpack_inputs
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMMarginOutput
,
config_class
=
_CONFIG_FOR_DOC
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMMarginOutput
,
config_class
=
_CONFIG_FOR_DOC
)
def
call
(
def
call
(
...
@@ -948,72 +912,47 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
...
@@ -948,72 +912,47 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
"decoder_cached_states"
not
in
kwargs
"decoder_cached_states"
not
in
kwargs
),
"Please use past_key_values to cache intermediate outputs"
# from modeling_tf_bart.py
),
"Please use past_key_values to cache intermediate outputs"
# from modeling_tf_bart.py
inputs
=
input_processing
(
do_marginalize
=
do_marginalize
if
do_marginalize
else
self
.
config
.
do_marginalize
func
=
self
.
call
,
reduce_loss
=
reduce_loss
if
reduce_loss
else
self
.
config
.
reduce_loss
config
=
self
.
config
,
input_ids
=
input_ids
,
if
labels
is
not
None
:
if
decoder_input_ids
is
None
:
decoder_input_ids
=
labels
use_cache
=
False
outputs
=
self
.
rag
(
input_ids
,
attention_mask
=
attention_mask
,
attention_mask
=
attention_mask
,
encoder_outputs
=
encoder_outputs
,
decoder_input_ids
=
decoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
decoder_attention_mask
=
decoder_attention_mask
,
decoder_attention_mask
=
decoder_attention_mask
,
encoder_outputs
=
encoder_outputs
,
past_key_values
=
past_key_values
,
doc_scores
=
doc_scores
,
context_input_ids
=
context_input_ids
,
context_input_ids
=
context_input_ids
,
context_attention_mask
=
context_attention_mask
,
context_attention_mask
=
context_attention_mask
,
doc_scores
=
doc_scores
,
past_key_values
=
past_key_values
,
use_cache
=
use_cache
,
use_cache
=
use_cache
,
output_attentions
=
output_attentions
,
output_attentions
=
output_attentions
,
output_hidden_states
=
output_hidden_states
,
output_hidden_states
=
output_hidden_states
,
output_retrieved
=
output_retrieved
,
output_retrieved
=
output_retrieved
,
n_docs
=
n_docs
,
n_docs
=
n_docs
,
do_marginalize
=
do_marginalize
,
labels
=
labels
,
reduce_loss
=
reduce_loss
,
return_dict
=
return_dict
,
training
=
training
,
training
=
training
,
kwargs_call
=
kwargs
,
)
inputs
[
"do_marginalize"
]
=
inputs
[
"do_marginalize"
]
if
inputs
[
"do_marginalize"
]
else
self
.
config
.
do_marginalize
inputs
[
"reduce_loss"
]
=
inputs
[
"reduce_loss"
]
if
inputs
[
"reduce_loss"
]
else
self
.
config
.
reduce_loss
if
inputs
[
"labels"
]
is
not
None
:
if
inputs
[
"decoder_input_ids"
]
is
None
:
inputs
[
"decoder_input_ids"
]
=
inputs
[
"labels"
]
inputs
[
"use_cache"
]
=
False
outputs
=
self
.
rag
(
inputs
[
"input_ids"
],
attention_mask
=
inputs
[
"attention_mask"
],
encoder_outputs
=
inputs
[
"encoder_outputs"
],
decoder_input_ids
=
inputs
[
"decoder_input_ids"
],
decoder_attention_mask
=
inputs
[
"decoder_attention_mask"
],
context_input_ids
=
inputs
[
"context_input_ids"
],
context_attention_mask
=
inputs
[
"context_attention_mask"
],
doc_scores
=
inputs
[
"doc_scores"
],
past_key_values
=
inputs
[
"past_key_values"
],
use_cache
=
inputs
[
"use_cache"
],
output_attentions
=
inputs
[
"output_attentions"
],
output_hidden_states
=
inputs
[
"output_hidden_states"
],
output_retrieved
=
inputs
[
"output_retrieved"
],
n_docs
=
inputs
[
"n_docs"
],
training
=
inputs
[
"training"
],
)
)
loss
=
None
loss
=
None
logits
=
outputs
.
logits
logits
=
outputs
.
logits
if
inputs
[
"
labels
"
]
is
not
None
:
if
labels
is
not
None
:
assert
inputs
[
"
decoder_input_ids
"
]
is
not
None
assert
decoder_input_ids
is
not
None
loss
=
self
.
get_nll
(
loss
=
self
.
get_nll
(
outputs
.
logits
,
outputs
.
logits
,
outputs
.
doc_scores
,
outputs
.
doc_scores
,
inputs
[
"
labels
"
]
,
labels
,
reduce_loss
=
inputs
[
"
reduce_loss
"
]
,
reduce_loss
=
reduce_loss
,
epsilon
=
self
.
config
.
label_smoothing
,
epsilon
=
self
.
config
.
label_smoothing
,
n_docs
=
inputs
[
"
n_docs
"
]
,
n_docs
=
n_docs
,
)
)
if
inputs
[
"
do_marginalize
"
]
:
if
do_marginalize
:
logits
=
self
.
marginalize
(
logits
,
outputs
.
doc_scores
,
inputs
[
"
n_docs
"
]
)
logits
=
self
.
marginalize
(
logits
,
outputs
.
doc_scores
,
n_docs
)
return
TFRetrievAugLMMarginOutput
(
return
TFRetrievAugLMMarginOutput
(
loss
=
loss
,
loss
=
loss
,
...
@@ -1465,6 +1404,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
...
@@ -1465,6 +1404,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
def
question_encoder
(
self
):
def
question_encoder
(
self
):
return
self
.
rag
.
question_encoder
return
self
.
rag
.
question_encoder
@
unpack_inputs
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
add_start_docstrings_to_model_forward
(
RAG_FORWARD_INPUTS_DOCSTRING
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMMarginOutput
,
config_class
=
_CONFIG_FOR_DOC
)
@
replace_return_docstrings
(
output_type
=
TFRetrievAugLMMarginOutput
,
config_class
=
_CONFIG_FOR_DOC
)
def
call
(
def
call
(
...
@@ -1559,68 +1499,41 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
             "decoder_cached_states" not in kwargs
         ), "Please use past_key_values to cache intermediate outputs"  # from modeling_tf_bart.py

-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            encoder_outputs=encoder_outputs,
-            past_key_values=past_key_values,
-            context_input_ids=context_input_ids,
-            context_attention_mask=context_attention_mask,
-            doc_scores=doc_scores,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            output_retrieved=output_retrieved,
-            n_docs=n_docs,
-            exclude_bos_score=exclude_bos_score,
-            labels=labels,
-            reduce_loss=reduce_loss,
-            training=training,
-            return_dict=return_dict,
-            kwargs_call=kwargs,
-        )
-        inputs["exclude_bos_score"] = (
-            inputs["exclude_bos_score"] if inputs["exclude_bos_score"] else self.config.exclude_bos_score
-        )
-        inputs["reduce_loss"] = inputs["reduce_loss"] if inputs["reduce_loss"] else self.config.reduce_loss
-
-        if inputs["labels"] is not None:
-            if inputs["decoder_input_ids"] is None:
-                inputs["decoder_input_ids"] = inputs["labels"]
-            inputs["use_cache"] = False
-
-        outputs = self.rag(
-            inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            encoder_outputs=inputs["encoder_outputs"],
-            decoder_input_ids=inputs["decoder_input_ids"],
-            decoder_attention_mask=inputs["decoder_attention_mask"],
-            context_input_ids=inputs["context_input_ids"],
-            context_attention_mask=inputs["context_attention_mask"],
-            doc_scores=inputs["doc_scores"],
-            past_key_values=inputs["past_key_values"],
-            use_cache=inputs["use_cache"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            output_retrieved=inputs["output_retrieved"],
-            n_docs=inputs["n_docs"],
-            training=inputs["training"],
-        )
+        exclude_bos_score = exclude_bos_score if exclude_bos_score else self.config.exclude_bos_score
+        reduce_loss = reduce_loss if reduce_loss else self.config.reduce_loss
+
+        if labels is not None:
+            if decoder_input_ids is None:
+                decoder_input_ids = labels
+            use_cache = False
+
+        outputs = self.rag(
+            input_ids,
+            attention_mask=attention_mask,
+            encoder_outputs=encoder_outputs,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            context_input_ids=context_input_ids,
+            context_attention_mask=context_attention_mask,
+            doc_scores=doc_scores,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_retrieved=output_retrieved,
+            n_docs=n_docs,
+            training=training,
+        )

         loss = None
-        if inputs["labels"] is not None:
+        if labels is not None:
             loss = self.get_nll(
                 outputs.logits,
                 outputs.doc_scores,
-                inputs["labels"],
-                reduce_loss=inputs["reduce_loss"],
+                labels,
+                reduce_loss=reduce_loss,
                 epsilon=self.config.label_smoothing,
-                n_docs=inputs["n_docs"],
+                n_docs=n_docs,
             )

         return TFRetrievAugLMMarginOutput(
...
src/transformers/models/roformer/modeling_tf_roformer.py
View file @ 611d3a09
...
@@ -49,8 +49,8 @@ from ...modeling_tf_utils import (
     TFSequenceSummary,
     TFTokenClassificationLoss,
     get_initializer,
-    input_processing,
     keras_serializable,
+    unpack_inputs,
 )
 from ...tf_utils import shape_list
 from ...utils import logging
...
@@ -602,6 +602,7 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer):
         """
         raise NotImplementedError

+    @unpack_inputs
     def call(
         self,
         input_ids: Optional[TFModelInputType] = None,
...
@@ -615,51 +616,37 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer):
         training: bool = False,
         **kwargs,
     ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            training=training,
-            kwargs_call=kwargs,
-        )
-
-        if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
+        if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif inputs["input_ids"] is not None:
-            input_shape = shape_list(inputs["input_ids"])
-        elif inputs["inputs_embeds"] is not None:
-            input_shape = shape_list(inputs["inputs_embeds"])[:-1]
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        if inputs["attention_mask"] is None:
-            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=input_shape, value=1)

-        if inputs["token_type_ids"] is None:
-            inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0)
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)

         embedding_output = self.embeddings(
-            input_ids=inputs["input_ids"],
-            token_type_ids=inputs["token_type_ids"],
-            inputs_embeds=inputs["inputs_embeds"],
-            training=inputs["training"],
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            training=training,
         )

         if hasattr(self, "embeddings_project"):
-            embedding_output = self.embeddings_project(embedding_output, training=inputs["training"])
+            embedding_output = self.embeddings_project(embedding_output, training=training)

         # We create a 3D attention mask from a 2D tensor mask.
         # Sizes are [batch_size, 1, 1, to_seq_length]
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
+        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))

         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
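The comment block above is the crux of the mask handling: a 2D padding mask is reshaped to `[batch_size, 1, 1, to_seq_length]` so that it broadcasts against attention scores of shape `[batch_size, num_heads, from_seq_length, to_seq_length]`, and masked positions are pushed to a large negative value before the softmax. A standalone sketch of that arithmetic; the toy shapes and values below are illustrative, not taken from the model.

import tensorflow as tf

batch_size, num_heads, seq_len = 2, 4, 6

# 1 = attend, 0 = padding, shape [batch_size, seq_len]
attention_mask = tf.constant([[1, 1, 1, 1, 0, 0],
                              [1, 1, 1, 0, 0, 0]])

# Reshape to [batch_size, 1, 1, to_seq_length] so it broadcasts over heads
# and query positions, like the reshape in TFRoFormerMainLayer.call.
extended_mask = tf.cast(tf.reshape(attention_mask, (batch_size, 1, 1, seq_len)), tf.float32)

# Turn 1/0 into 0 / -10000 so masked columns get ~0 weight after the softmax.
additive_mask = (1.0 - extended_mask) * -10000.0

scores = tf.random.normal((batch_size, num_heads, seq_len, seq_len))
probs = tf.nn.softmax(scores + additive_mask, axis=-1)
print(probs.shape)  # (2, 4, 6, 6); padding columns are effectively zeroed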
...
@@ -676,24 +663,24 @@ class TFRoFormerMainLayer(tf.keras.layers.Layer):
         # attention_probs has shape bsz x n_heads x N x N
         # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if inputs["head_mask"] is not None:
+        if head_mask is not None:
             raise NotImplementedError
         else:
-            inputs["head_mask"] = [None] * self.config.num_hidden_layers
+            head_mask = [None] * self.config.num_hidden_layers

         encoder_outputs = self.encoder(
             hidden_states=embedding_output,
             attention_mask=extended_attention_mask,
-            head_mask=inputs["head_mask"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
         )

         sequence_output = encoder_outputs[0]

-        if not inputs["return_dict"]:
+        if not return_dict:
             return (sequence_output,) + encoder_outputs[1:]

         return TFBaseModelOutput(
...
@@ -811,6 +798,7 @@ class TFRoFormerModel(TFRoFormerPreTrainedModel):
         self.roformer = TFRoFormerMainLayer(config, name="roformer")

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -831,9 +819,7 @@ class TFRoFormerModel(TFRoFormerPreTrainedModel):
         training: Optional[bool] = False,
         **kwargs,
     ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -843,18 +829,6 @@ class TFRoFormerModel(TFRoFormerPreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
+        )

         return outputs
...
@@ -883,6 +857,7 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
     def get_lm_head(self) -> tf.keras.layers.Layer:
         return self.mlm.predictions

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -910,9 +885,7 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -921,30 +894,13 @@ class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingL
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
+        )
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"])
-        loss = (
-            None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=prediction_scores)
-        )
+        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (prediction_scores,) + outputs[2:]

             return ((loss,) + output) if loss is not None else output
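The docstring a few lines up relies on the `-100` convention: positions whose label is `-100` are dropped before the cross-entropy is computed, so the loss only covers the tokens that were actually masked. Below is a simplified sketch of that filtering, roughly what the masked-LM loss helper does; it is a stand-in for illustration, not the library implementation.

import tensorflow as tf

# Toy scores for one sentence of 4 tokens over a 6-token vocabulary.
prediction_scores = tf.random.normal((1, 4, 6))
# -100 marks positions that were not masked and must not contribute to the loss.
labels = tf.constant([[-100, 2, -100, 5]])

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
flat_labels = tf.reshape(labels, (-1,))
active = tf.not_equal(flat_labels, -100)          # keep only real label positions
active_labels = tf.boolean_mask(flat_labels, active)
active_scores = tf.boolean_mask(tf.reshape(prediction_scores, (-1, 6)), active)
per_token_loss = loss_fn(active_labels, active_scores)
print(per_token_loss.shape)  # (2,) — only the two labelled tokens are scored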
...
@@ -978,6 +934,7 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
     def get_lm_head(self) -> tf.keras.layers.Layer:
         return self.mlm.predictions

+    @unpack_inputs
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
         checkpoint=_CHECKPOINT_FOR_DOC,
...
@@ -1003,9 +960,7 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
             Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
             config.vocab_size - 1]`.
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -1014,32 +969,19 @@ class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingL
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
+        )
         sequence_output = outputs[0]
-        logits = self.mlm(sequence_output=sequence_output, training=inputs["training"])
+        logits = self.mlm(sequence_output=sequence_output, training=training)
         loss = None

-        if inputs["labels"] is not None:
+        if labels is not None:
             # shift labels to the left and cut last logit token
             shifted_logits = logits[:, :-1]
-            labels = inputs["labels"][:, 1:]
+            labels = labels[:, 1:]
             loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[2:]

             return ((loss,) + output) if loss is not None else output
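The causal-LM hunk above shifts the labels one step to the left and drops the last logit, so the prediction at position i is scored against the token at position i + 1. A small sketch of that alignment with toy tensors (the loss call is a simplified stand-in for the model's loss helper, which additionally handles `-100` labels):

import tensorflow as tf

# Toy "logits" for a one-sentence batch of 5 tokens over a 7-token vocabulary.
logits = tf.random.normal((1, 5, 7))
labels = tf.constant([[3, 1, 4, 1, 5]])

# Same alignment as in the diff: drop the last logit and the first label.
shifted_logits = logits[:, :-1]   # shape (1, 4, 7)
shifted_labels = labels[:, 1:]    # shape (1, 4)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
print(float(loss_fn(shifted_labels, shifted_logits)))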
...
@@ -1102,6 +1044,7 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
         self.roformer = TFRoFormerMainLayer(config, name="roformer")
         self.classifier = TFRoFormerClassificationHead(config, name="classifier")

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1129,9 +1072,7 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -1140,25 +1081,12 @@ class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceC
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
-        logits = self.classifier(hidden_states=outputs[0], training=inputs["training"])
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        )
+        logits = self.classifier(hidden_states=outputs[0], training=training)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[1:]

             return ((loss,) + output) if loss is not None else output
...
@@ -1205,6 +1133,7 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
         """
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(
         ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
     )
...
@@ -1233,66 +1162,42 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos
             Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
             where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            labels=labels,
-            training=training,
-            kwargs_call=kwargs,
-        )
-
-        if inputs["input_ids"] is not None:
-            num_choices = shape_list(inputs["input_ids"])[1]
-            seq_length = shape_list(inputs["input_ids"])[2]
+        if input_ids is not None:
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
         else:
-            num_choices = shape_list(inputs["inputs_embeds"])[1]
-            seq_length = shape_list(inputs["inputs_embeds"])[2]
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]

-        flat_input_ids = (
-            tf.reshape(tensor=inputs["input_ids"], shape=(-1, seq_length)) if inputs["input_ids"] is not None else None
-        )
+        flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
         flat_attention_mask = (
-            tf.reshape(tensor=inputs["attention_mask"], shape=(-1, seq_length))
-            if inputs["attention_mask"] is not None
-            else None
+            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
         )
         flat_token_type_ids = (
-            tf.reshape(tensor=inputs["token_type_ids"], shape=(-1, seq_length))
-            if inputs["token_type_ids"] is not None
-            else None
+            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
         )
         flat_inputs_embeds = (
-            tf.reshape(tensor=inputs["inputs_embeds"], shape=(-1, seq_length, shape_list(inputs["inputs_embeds"])[3]))
-            if inputs["inputs_embeds"] is not None
+            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
+            if inputs_embeds is not None
             else None
         )
         outputs = self.roformer(
             input_ids=flat_input_ids,
             attention_mask=flat_attention_mask,
             token_type_ids=flat_token_type_ids,
-            head_mask=inputs["head_mask"],
+            head_mask=head_mask,
             inputs_embeds=flat_inputs_embeds,
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
         )
-        logits = self.sequence_summary(inputs=outputs[0], training=inputs["training"])
+        logits = self.sequence_summary(inputs=outputs[0], training=training)
         logits = self.classifier(inputs=logits)
         reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
-        loss = (
-            None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=reshaped_logits)
-        )
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (reshaped_logits,) + outputs[1:]

             return ((loss,) + output) if loss is not None else output
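The multiple-choice hunk above folds the choice axis into the batch axis, runs the encoder once over every (example, choice) pair, and then reshapes the per-sequence scores back to `(batch_size, num_choices)`. A compact sketch of that reshaping with toy tensors (the per-sequence logit below stands in for the encoder plus classifier head and is illustrative only):

import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 8
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# Fold choices into the batch axis, as in TFRoFormerForMultipleChoice.call.
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (8, 8)

# Stand-in for the single score the classifier produces per flattened sequence.
per_sequence_logit = tf.random.normal((batch_size * num_choices, 1))
reshaped_logits = tf.reshape(per_sequence_logit, (-1, num_choices))  # (2, 4)

labels = tf.constant([1, 3])  # index of the correct choice per example
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
print(float(loss_fn(labels, reshaped_logits)))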
...
@@ -1344,6 +1249,7 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1369,9 +1275,7 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
         labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -1380,27 +1284,14 @@ class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassif
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            labels=labels,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
+        )
         sequence_output = outputs[0]
-        sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"])
+        sequence_output = self.dropout(inputs=sequence_output, training=training)
         logits = self.classifier(inputs=sequence_output)
-        loss = None if inputs["labels"] is None else self.hf_compute_loss(labels=inputs["labels"], logits=logits)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (logits,) + outputs[1:]

             return ((loss,) + output) if loss is not None else output
...
@@ -1436,6 +1327,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )

+    @unpack_inputs
     @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
...
@@ -1468,9 +1360,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
             Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
             are not taken into account for computing the loss.
         """
-        inputs = input_processing(
-            func=self.call,
-            config=self.config,
+        outputs = self.roformer(
             input_ids=input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
...
@@ -1479,21 +1369,7 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            start_positions=start_positions,
-            end_positions=end_positions,
             training=training,
-            kwargs_call=kwargs,
-        )
-        outputs = self.roformer(
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            token_type_ids=inputs["token_type_ids"],
-            head_mask=inputs["head_mask"],
-            inputs_embeds=inputs["inputs_embeds"],
-            output_attentions=inputs["output_attentions"],
-            output_hidden_states=inputs["output_hidden_states"],
-            return_dict=inputs["return_dict"],
-            training=inputs["training"],
-        )
+        )
         sequence_output = outputs[0]

         logits = self.qa_outputs(inputs=sequence_output)
...
@@ -1502,12 +1378,11 @@ class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnswer
         end_logits = tf.squeeze(input=end_logits, axis=-1)
         loss = None

-        if inputs["start_positions"] is not None and inputs["end_positions"] is not None:
-            labels = {"start_position": inputs["start_positions"]}
-            labels["end_position"] = inputs["end_positions"]
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions, "end_position": end_positions}
             loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

-        if not inputs["return_dict"]:
+        if not return_dict:
             output = (start_logits, end_logits) + outputs[2:]

             return ((loss,) + output) if loss is not None else output
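The question-answering hunk above works on two per-token scores: the `qa_outputs` head emits a start score and an end score for every position, which are split and squeezed into `start_logits` and `end_logits`, and one cross-entropy is computed per span boundary. A short sketch of that shape handling with toy tensors; the Dense layer and averaging below are illustrative stand-ins, not the model's exact head or loss helper.

import tensorflow as tf

batch_size, seq_length, hidden = 2, 10, 16
sequence_output = tf.random.normal((batch_size, seq_length, hidden))

# Stand-in for the model's `qa_outputs` head: 2 scores (start, end) per token.
qa_outputs = tf.keras.layers.Dense(2)
logits = qa_outputs(sequence_output)                      # (2, 10, 2)

start_logits, end_logits = tf.split(logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)          # (2, 10)
end_logits = tf.squeeze(end_logits, axis=-1)              # (2, 10)

# One loss per boundary, averaged, as in span-extraction heads.
start_positions = tf.constant([3, 5])
end_positions = tf.constant([4, 7])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = (loss_fn(start_positions, start_logits) + loss_fn(end_positions, end_logits)) / 2.0
print(float(loss))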
...