chenpangpang / transformers / Commits / 83a41d39

Commit 83a41d39, authored Jan 15, 2020 by Julien Chaumond
Commit message: 💄 super
Parent: cd51893d

Changes: 75
Showing 20 changed files with 177 additions and 177 deletions (+177 / -177)
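The whole commit is one mechanical cleanup: every Python 2-style super(ClassName, self) call is replaced by the argument-free Python 3 form, which resolves the class and instance implicitly. A minimal sketch of the before/after pattern follows; the Block name mirrors one of the GPT-2 modules touched below, but the body here is illustrative only, not copied from the diff:

import torch.nn as nn


class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        # Old spelling, kept for Python 2 compatibility:
        #     super(Block, self).__init__()
        # New spelling, equivalent on Python 3: the zero-argument form picks up
        # the enclosing class and the instance automatically.
        super().__init__()
        self.n_ctx = n_ctx
        self.scale = scale

The same substitution is applied whether the parent call is __init__(), __init__(config), build(input_shape) in the TF/Keras layers, or forward(...), so the behavior of every module is unchanged.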
src/transformers/modeling_gpt2.py                      +7   -7
src/transformers/modeling_mmbt.py                      +3   -3
src/transformers/modeling_openai.py                    +6   -6
src/transformers/modeling_roberta.py                   +10  -10
src/transformers/modeling_t5.py                        +10  -10
src/transformers/modeling_tf_albert.py                 +13  -13
src/transformers/modeling_tf_bert.py                   +24  -24
src/transformers/modeling_tf_ctrl.py                   +7   -7
src/transformers/modeling_tf_distilbert.py             +14  -14
src/transformers/modeling_tf_gpt2.py                   +7   -7
src/transformers/modeling_tf_openai.py                 +7   -7
src/transformers/modeling_tf_roberta.py                +10  -10
src/transformers/modeling_tf_t5.py                     +11  -11
src/transformers/modeling_tf_transfo_xl.py             +11  -11
src/transformers/modeling_tf_transfo_xl_utilities.py   +2   -2
src/transformers/modeling_tf_utils.py                  +5   -5
src/transformers/modeling_tf_xlm.py                    +9   -9
src/transformers/modeling_tf_xlnet.py                  +13  -13
src/transformers/modeling_transfo_xl.py                +7   -7
src/transformers/modeling_transfo_xl_utilities.py      +1   -1
src/transformers/modeling_gpt2.py

@@ -101,7 +101,7 @@ def gelu(x):
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
@@ -202,7 +202,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
@@ -217,7 +217,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.attn = Attention(nx, n_ctx, config, scale)
@@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "transformer"
     def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)
     def _init_weights(self, module):
         """ Initialize the weights.
@@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2Model, self).__init__(config)
+        super().__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past
@@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2LMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
@@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
src/transformers/modeling_mmbt.py

@@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module):
     """
     def __init__(self, config, encoder, embeddings):
-        super(ModalEmbeddings, self).__init__()
+        super().__init__()
         self.config = config
         self.encoder = encoder
         self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
@@ -175,7 +175,7 @@ class MMBTModel(nn.Module):
     """
     def __init__(self, config, transformer, encoder):
-        super(MMBTModel, self).__init__()
+        super().__init__()
         self.config = config
         self.transformer = transformer
         self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
@@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module):
     """
    def __init__(self, config, transformer, encoder):
-        super(MMBTForClassification, self).__init__()
+        super().__init__()
        self.num_labels = config.num_labels
        self.mmbt = MMBTModel(config, transformer, encoder)
src/transformers/modeling_openai.py

@@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0
@@ -221,7 +221,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
@@ -236,7 +236,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
@@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTModel, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
@@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """
     def __init__(self, config):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)
src/transformers/modeling_roberta.py

@@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings):
     """
     def __init__(self, config):
-        super(RobertaEmbeddings, self).__init__(config)
+        super().__init__(config)
         self.padding_idx = 1
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
         self.position_embeddings = nn.Embedding(
@@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-        return super(RobertaEmbeddings, self).forward(
+        return super().forward(
             input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
         )
@@ -204,7 +204,7 @@ class RobertaModel(BertModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaModel, self).__init__(config)
+        super().__init__(config)
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()
@@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)
@@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module):
     """Roberta Head for masked language modeling."""
     def __init__(self, config):
-        super(RobertaLMHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
@@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForMultipleChoice, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
@@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
     def __init__(self, config):
-        super(RobertaClassificationHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
@@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
     base_model_prefix = "roberta"
     def __init__(self, config):
-        super(RobertaForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
src/transformers/modeling_t5.py

@@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module):
         """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
        """
-        super(T5LayerNorm, self).__init__()
+        super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
@@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module):
 class T5DenseReluDense(nn.Module):
     def __init__(self, config):
-        super(T5DenseReluDense, self).__init__()
+        super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module):
 class T5LayerFF(nn.Module):
     def __init__(self, config):
-        super(T5LayerFF, self).__init__()
+        super().__init__()
         self.DenseReluDense = T5DenseReluDense(config)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -185,7 +185,7 @@ class T5Attention(nn.Module):
     NEW_ID = itertools.count()
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Attention, self).__init__()
+        super().__init__()
         self.layer_id = next(T5Attention.NEW_ID)
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias
@@ -363,7 +363,7 @@ class T5Attention(nn.Module):
 class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerSelfAttention, self).__init__()
+        super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module):
 class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerCrossAttention, self).__init__()
+        super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module):
 class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Block, self).__init__()
+        super().__init__()
         self.is_decoder = config.is_decoder
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
@@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel):
 class T5Stack(T5PreTrainedModel):
     def __init__(self, config):
-        super(T5Stack, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder
@@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel):
     """
     def __init__(self, config):
-        super(T5Model, self).__init__(config)
+        super().__init__(config)
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
         encoder_config = copy.deepcopy(config)
@@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     """
     def __init__(self, config):
-        super(T5WithLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.model_dim = config.d_model
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
src/transformers/modeling_tf_albert.py

@@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
     """
     def __init__(self, config, **kwargs):
-        super(TFAlbertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.position_embeddings = tf.keras.layers.Embedding(
@@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
             shape=[self.config.vocab_size, self.config.embedding_size],
             initializer=get_initializer(self.config.initializer_range),
         )
-        super(TFAlbertEmbeddings, self).build(input_shape)
+        super().build(input_shape)
     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.
@@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 class TFAlbertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer):
 class TFAlbertAttention(TFBertSelfAttention):
     def __init__(self, config, **kwargs):
-        super(TFAlbertAttention, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.hidden_size = config.hidden_size
         self.dense = tf.keras.layers.Dense(
@@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention):
 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFAlbertAttention(config, name="attention")
         self.ffn = tf.keras.layers.Dense(
@@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayerGroup, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.output_attentions = config.output_attentions
@@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
 class TFAlbertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFAlbertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dense = tf.keras.layers.Dense(
@@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.decoder_bias = self.add_weight(
             shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )
-        super(TFAlbertMLMHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)
@@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
     """
     def __init__(self, config, **kwargs):
-        super(TFAlbertModel, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
@@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.albert = TFAlbertModel(config, name="albert")
         self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
@@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.albert = TFAlbertModel(config, name="albert")
src/transformers/modeling_tf_bert.py

@@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
     """
     def __init__(self, config, **kwargs):
-        super(TFBertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.initializer_range = config.initializer_range
@@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             shape=[self.vocab_size, self.hidden_size],
             initializer=get_initializer(self.initializer_range),
         )
-        super(TFBertEmbeddings, self).build(input_shape)
+        super().build(input_shape)
     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.
@@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
 class TFBertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
 class TFBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.self_attention = TFBertSelfAttention(config, name="self")
         self.dense_output = TFBertSelfOutput(config, name="output")
@@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer):
 class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertIntermediate, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
 class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer):
 class TFBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFBertAttention(config, name="attention")
         self.intermediate = TFBertIntermediate(config, name="intermediate")
         self.bert_output = TFBertOutput(config, name="output")
@@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer):
 class TFBertEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertEncoder, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
@@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
 class TFBertPooler(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPooler, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),
@@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer):
 class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
@@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
 class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertLMPredictionHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.transform = TFBertPredictionHeadTransform(config, name="transform")
@@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFBertLMPredictionHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)
@@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
 class TFBertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
     def call(self, sequence_output):
@@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer):
 class TFBertNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertNSPHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.seq_relationship = tf.keras.layers.Dense(
             2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
         )
@@ -472,7 +472,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
 class TFBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFBertEmbeddings(config, name="embeddings")
@@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
     def call(self, inputs, **kwargs):
@@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     """
    def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")
        self.nsp = TFBertNSPHead(config, name="nsp___cls")
@@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
@@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")
@@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
@@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
@@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
src/transformers/modeling_tf_ctrl.py

@@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 class TFMultiHeadAttention(tf.keras.layers.Layer):
     def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = output_attentions
         self.num_heads = num_heads
         self.d_model_size = d_model_size
@@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
     ):
-        super(TFEncoderLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.multi_head_attention = TFMultiHeadAttention(
             d_model_size, num_heads, output_attentions, name="multi_head_attention"
@@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
 class TFCTRLMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFCTRLMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past
@@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
     def call(self, inputs, **kwargs):
@@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 class TFCTRLLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFCTRLLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is
@@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFCTRLLMHead, self).build(input_shape)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
         self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
src/transformers/modeling_tf_distilbert.py
View file @ 83a41d39
...
@@ -65,7 +65,7 @@ def gelu_new(x):
class TFEmbeddings(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFEmbeddings, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
...
@@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
        self.word_embeddings = self.add_weight(
            "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
        )
        super(TFEmbeddings, self).build(input_shape)
        super().build(input_shape)

    def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
        """Get token embeddings of inputs.
...
@@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
...
@@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
class TFFFN(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFFFN, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.lin1 = tf.keras.layers.Dense(
            config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
...
@@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer):
class TFTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFTransformerBlock, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
...
@@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
class TFTransformer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFTransformer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer):
class TFDistilBertMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFDistilBertMainLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers
        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
...
@@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings

    def call(self, inputs, **kwargs):
...
@@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
class TFDistilBertLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFDistilBertLMHead, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        # The output weights are the same as the input embeddings, but there is
...
@@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super(TFDistilBertLMHead, self).build(input_shape)
        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
...
@@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size
...
@@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
...
@@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
...
@@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(
...
src/transformers/modeling_tf_gpt2.py
View file @ 83a41d39
...
@@ -58,7 +58,7 @@ def gelu(x):
class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
...
@@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        super().__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
...
@@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        super().__init__(**kwargs)
        nx = config.n_embd
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
...
@@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFGPT2MainLayer(tf.keras.layers.Layer):
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
...
@@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    def get_output_embeddings(self):
...
@@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFGPT2MainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
...
src/transformers/modeling_tf_openai.py
View file @ 83a41d39
...
@@ -66,7 +66,7 @@ ACT_FNS = {
class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
...
@@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        super().__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
...
@@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        super().__init__(**kwargs)
        nx = config.n_embd
        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
...
@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
...
@@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    def get_output_embeddings(self):
...
@@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
...
src/transformers/modeling_tf_roberta.py
View file @ 83a41d39
...
@@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
        super().__init__(config, **kwargs)
        self.padding_idx = 1

    def create_position_ids_from_input_ids(self, x):
...
@@ -78,7 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
        else:
            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
        return super(TFRobertaEmbeddings, self)._embedding(
        return super()._embedding(
            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
        )
...
@@ -89,7 +89,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
        super().__init__(config, **kwargs)
        self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

    def get_input_embeddings(self):
...
@@ -234,7 +234,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")

    def call(self, inputs, **kwargs):
...
@@ -246,7 +246,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
    """Roberta Head for masked language modeling."""

    def __init__(self, config, input_embeddings, **kwargs):
        super(TFRobertaLMHead, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...
@@ -260,7 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super(TFRobertaLMHead, self).build(input_shape)
        super().build(input_shape)

    def call(self, features):
        x = self.dense(features)
...
@@ -305,7 +305,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
...
@@ -328,7 +328,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
        super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
        super().__init__(config, **kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
...
@@ -383,7 +383,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.roberta = TFRobertaMainLayer(config, name="roberta")
...
@@ -433,7 +433,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.roberta = TFRobertaMainLayer(config, name="roberta")
...
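A recurring pattern in the LM-head layers of this diff (TFCTRLLMHead, TFDistilBertLMHead, TFRobertaLMHead and, later, TFXLNetLMHead) is spelled out by the repeated comment: the output projection reuses the input embedding matrix, and the only new parameter is an output-only bias created in build(). A rough sketch of that weight-tying idea, with hypothetical names and a plain matmul standing in for the library's mode="linear" call:

import tensorflow as tf

class TiedLMHeadSketch(tf.keras.layers.Layer):
    """Toy LM head whose projection matrix is the shared embedding table."""

    def __init__(self, vocab_size, embedding_weights, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_weights = embedding_weights  # shared (vocab_size, hidden_size) variable

    def build(self, input_shape):
        # Only the per-token output bias is a new trainable parameter.
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        # (batch, seq, hidden) x (hidden, vocab) -> (batch, seq, vocab), then add the bias.
        logits = tf.matmul(hidden_states, self.embedding_weights, transpose_b=True)
        return logits + self.bias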
src/transformers/modeling_tf_t5.py
View file @ 83a41d39
...
@@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
        """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
        """
        super(TFT5LayerNorm, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.variance_epsilon = epsilon

    def build(self, input_shape):
        """Build shared word embedding layer """
        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
        super(TFT5LayerNorm, self).build(input_shape)
        super().build(input_shape)

    def call(self, x):
        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
...
@@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
class TFT5DenseReluDense(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFT5DenseReluDense, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...
@@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
class TFT5LayerFF(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFT5LayerFF, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...
@@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer):
    NEW_ID = itertools.count()

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5Attention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.layer_id = next(TFT5Attention.NEW_ID)
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
...
@@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer):
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.SelfAttention = TFT5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
        )
...
@@ -279,7 +279,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.EncDecAttention = TFT5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
        )
...
@@ -299,7 +299,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
class TFT5Block(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5Block, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.is_decoder = config.is_decoder
        self.layer = []
        self.layer.append(
...
@@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer):
####################################################
class TFT5MainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFT5MainLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.is_decoder = config.is_decoder
...
@@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
        encoder_config = copy.deepcopy(config)
...
@@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.model_dim = config.d_model
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
...
src/transformers/modeling_tf_transfo_xl.py
View file @ 83a41d39
...
@@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
class TFPositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, demb, **kwargs):
        super(TFPositionalEmbedding, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
...
@@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
class TFPositionwiseFF(tf.keras.layers.Layer):
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
        super(TFPositionwiseFF, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.d_model = d_model
        self.d_inner = d_inner
...
@@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
        init_std=0.02,
        **kwargs
    ):
        super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = output_attentions
        self.n_head = n_head
...
@@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
        self.r_w_bias = self.add_weight(
            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
        )
        super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
        super().build(input_shape)

    def _rel_shift(self, x):
        x_size = shape_list(x)
...
@@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
        init_std=0.02,
        **kwargs
    ):
        super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
            n_head,
...
@@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
        super(TFAdaptiveEmbedding, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.n_token = n_token
        self.d_embed = d_embed
...
@@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
                    name="emb_projs_._{}".format(i),
                )
            )
        super(TFAdaptiveEmbedding, self).build(input_shape)
        super().build(input_shape)

    def call(self, inp):
        if self.div_val == 1:
...
@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFTransfoXLMainLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        self.r_r_bias = self.add_weight(
            shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
        )
        super(TFTransfoXLMainLayer, self).build(input_shape)
        super().build(input_shape)

    def get_input_embeddings(self):
        return self.word_emb
...
@@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    """
    def __init__(self, config):
        super(TFTransfoXLLMHeadModel, self).__init__(config)
        super().__init__(config)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
...
src/transformers/modeling_tf_transfo_xl_utilities.py
View file @ 83a41d39
...
@@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
        super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_embed = d_embed
...
@@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    name="out_layers_._{}_._bias".format(i),
                )
            self.out_layers.append((weight, bias))
        super(TFAdaptiveSoftmaxMask, self).build(input_shape)
        super().build(input_shape)

    @staticmethod
    def _logit(x, W, b, proj=None):
...
src/transformers/modeling_tf_utils.py
View file @ 83a41d39
...
@@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
        return {"input_ids": tf.constant(DUMMY_INPUTS)}

    def __init__(self, config, *inputs, **kwargs):
        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
        super().__init__(*inputs, **kwargs)
        if not isinstance(config, PretrainedConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
...
@@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer):
        """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        """
        super(TFConv1D, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.nf = nf
        self.nx = nx
        self.initializer_range = initializer_range
...
@@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
    """
    def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
        super(TFSharedEmbeddings, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
...
@@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
        self.weight = self.add_weight(
            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
        )
        super(TFSharedEmbeddings, self).build(input_shape)
        super().build(input_shape)

    def call(self, inputs, mode="embedding"):
        """Get token embeddings of inputs.
...
@@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
    """
    def __init__(self, config, initializer_range=0.02, **kwargs):
        super(TFSequenceSummary, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
        if self.summary_type == "attn":
...
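Several hunks above (TFSharedEmbeddings, the Transfo-XL biases, the various LM heads) repeat the same Keras idiom: parameters are created lazily in build() with add_weight(), and the override then defers to the parent class, now via the zero-argument super(). A small self-contained sketch of that idiom, with a hypothetical layer name not taken from the diff:

import tensorflow as tf

class SharedEmbeddingSketch(tf.keras.layers.Layer):
    """Toy embedding layer illustrating the add_weight() + super().build() idiom."""

    def __init__(self, vocab_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        # The weight only comes into existence when the layer is first built.
        self.weight = self.add_weight(
            "weight", shape=[self.vocab_size, self.hidden_size], initializer="glorot_uniform"
        )
        super().build(input_shape)  # lets Keras mark the layer as built

    def call(self, input_ids):
        # (batch, seq_len) integer ids -> (batch, seq_len, hidden_size) embeddings
        return tf.gather(self.weight, input_ids)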
src/transformers/modeling_tf_xlm.py
View file @ 83a41d39
...
@@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config, **kwargs):
        super(TFMultiHeadAttention, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.layer_id = next(TFMultiHeadAttention.NEW_ID)
        self.output_attentions = config.output_attentions
        self.dim = dim
...
@@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
class TFTransformerFFN(tf.keras.layers.Layer):
    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
        super(TFTransformerFFN, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
        self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
...
@@ -198,7 +198,7 @@ class TFTransformerFFN(tf.keras.layers.Layer):
class TFXLMMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFXLMMainLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
    """
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFXLMPredLayer, self).__init__(**kwargs)
        super().__init__(**kwargs)
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index
...
@@ -641,7 +641,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
        super(TFXLMPredLayer, self).build(input_shape)
        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
...
@@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
...
@@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.transformer = TFXLMMainLayer(config, name="transformer")
...
@@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
...
src/transformers/modeling_tf_xlnet.py
View file @
83a41d39
...
@@ -57,7 +57,7 @@ ACT2FN = {
...
@@ -57,7 +57,7 @@ ACT2FN = {
class
TFXLNetRelativeAttention
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetRelativeAttention
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetRelativeAttention
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
output_attentions
=
config
.
output_attentions
self
.
output_attentions
=
config
.
output_attentions
if
config
.
d_model
%
config
.
n_head
!=
0
:
if
config
.
d_model
%
config
.
n_head
!=
0
:
...
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self
.
seg_embed
=
self
.
add_weight
(
self
.
seg_embed
=
self
.
add_weight
(
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
trainable
=
True
,
name
=
"seg_embed"
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
trainable
=
True
,
name
=
"seg_embed"
)
)
super
(
TFXLNetRelativeAttention
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
prune_heads
(
self
,
heads
):
def
prune_heads
(
self
,
heads
):
raise
NotImplementedError
raise
NotImplementedError
...
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
class
TFXLNetFeedForward
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetFeedForward
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetFeedForward
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
"layer_norm"
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
"layer_norm"
)
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_inner
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"layer_1"
config
.
d_inner
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"layer_1"
...
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
...
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
class
TFXLNetLayer
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetLayer
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetLayer
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
rel_attn
=
TFXLNetRelativeAttention
(
config
,
name
=
"rel_attn"
)
self
.
rel_attn
=
TFXLNetRelativeAttention
(
config
,
name
=
"rel_attn"
)
self
.
ff
=
TFXLNetFeedForward
(
config
,
name
=
"ff"
)
self
.
ff
=
TFXLNetFeedForward
(
config
,
name
=
"ff"
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
...
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
...
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
class
TFXLNetLMHead
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetLMHead
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
super
(
TFXLNetLMHead
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
# The output weights are the same as the input embeddings, but there is
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
# an output-only bias for each token.
...
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
...
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
self
.
bias
=
self
.
add_weight
(
shape
=
(
self
.
vocab_size
,),
initializer
=
"zeros"
,
trainable
=
True
,
name
=
"bias"
)
self
.
bias
=
self
.
add_weight
(
shape
=
(
self
.
vocab_size
,),
initializer
=
"zeros"
,
trainable
=
True
,
name
=
"bias"
)
super
(
TFXLNetLMHead
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
call
(
self
,
hidden_states
):
def
call
(
self
,
hidden_states
):
hidden_states
=
self
.
input_embeddings
(
hidden_states
,
mode
=
"linear"
)
hidden_states
=
self
.
input_embeddings
(
hidden_states
,
mode
=
"linear"
)
...
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
class TFXLNetMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFXLNetMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.output_past = config.output_past
...
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
...
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetLMHeadModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
lm_loss
=
TFXLNetLMHead
(
config
,
self
.
transformer
.
word_embedding
,
name
=
"lm_loss"
)
self
.
lm_loss
=
TFXLNetLMHead
(
config
,
self
.
transformer
.
word_embedding
,
name
=
"lm_loss"
)
...
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetForSequenceClassification
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
num_labels
=
config
.
num_labels
self
.
num_labels
=
config
.
num_labels
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
...
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetForTokenClassification
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
num_labels
=
config
.
num_labels
self
.
num_labels
=
config
.
num_labels
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
...
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetForQuestionAnsweringSimple
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
"transformer"
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"qa_outputs"
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"qa_outputs"
...
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
# """
# """
# def __init__(self, config, *inputs, **kwargs):
# def __init__(self, config, *inputs, **kwargs):
# super(
TFXLNetForQuestionAnswering, self
).__init__(config, *inputs, **kwargs)
# super().__init__(config, *inputs, **kwargs)
# self.start_n_top = config.start_n_top
# self.start_n_top = config.start_n_top
# self.end_n_top = config.end_n_top
# self.end_n_top = config.end_n_top
...
src/transformers/modeling_transfo_xl.py
View file @ 83a41d39
...
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
class PositionalEmbedding(nn.Module):
    def __init__(self, demb):
-        super(PositionalEmbedding, self).__init__()
+        super().__init__()
        self.demb = demb
...
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
class PositionwiseFF(nn.Module):
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
-        super(PositionwiseFF, self).__init__()
+        super().__init__()
        self.d_model = d_model
        self.d_inner = d_inner
...
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
        output_attentions=False,
        layer_norm_epsilon=1e-5,
    ):
-        super(RelPartialLearnableMultiHeadAttn, self).__init__()
+        super().__init__()
        self.output_attentions = output_attentions
        self.n_head = n_head
...
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
class RelPartialLearnableDecoderLayer(nn.Module):
    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
-        super(RelPartialLearnableDecoderLayer, self).__init__()
+        super().__init__()
        self.dec_attn = RelPartialLearnableMultiHeadAttn(
            n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
...
@@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
class AdaptiveEmbedding(nn.Module):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
-        super(AdaptiveEmbedding, self).__init__()
+        super().__init__()
        self.n_token = n_token
        self.d_embed = d_embed
...
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
(
TransfoXLModel
,
self
).
__init__
(
config
)
super
().
__init__
(
config
)
self
.
output_attentions
=
config
.
output_attentions
self
.
output_attentions
=
config
.
output_attentions
self
.
output_hidden_states
=
config
.
output_hidden_states
self
.
output_hidden_states
=
config
.
output_hidden_states
...
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
(
TransfoXLLMHeadModel
,
self
).
__init__
(
config
)
super
().
__init__
(
config
)
self
.
transformer
=
TransfoXLModel
(
config
)
self
.
transformer
=
TransfoXLModel
(
config
)
self
.
sample_softmax
=
config
.
sample_softmax
self
.
sample_softmax
=
config
.
sample_softmax
# use sampled softmax
# use sampled softmax
...
src/transformers/modeling_transfo_xl_utilities.py
View file @ 83a41d39
...
@@ -29,7 +29,7 @@ import torch.nn.functional as F
class ProjectedAdaptiveLogSoftmax(nn.Module):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
-        super(ProjectedAdaptiveLogSoftmax, self).__init__()
+        super().__init__()
        self.n_token = n_token
        self.d_embed = d_embed
...
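Every hunk in this commit applies the same mechanical change: the Python 2-style two-argument call super(ClassName, self) is replaced with the zero-argument super() that Python 3 resolves automatically. A minimal sketch of the pattern as it applies to the Keras-layer subclasses above (ExampleLayer is an illustrative name, not a class from this diff):

import tensorflow as tf

class ExampleLayer(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        # Before this commit: super(ExampleLayer, self).__init__(**kwargs)
        # After: the zero-argument form, identical behavior under Python 3
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(units, name="dense")

    def call(self, inputs):
        return self.dense(inputs)

The zero-argument form avoids repeating the class name in every constructor and keeps the call correct if the class is later renamed.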